In [1]:
import pandas as pd

In [2]:
# read the three cleaned CSV exports, one dataframe per file (same order as the names)
all_funding, crunchbase_data, usa_spending = (
    pd.read_csv(csv_path)
    for csv_path in (
        'C:/Users/AM/OneDrive/Desktop/capstone/data/cleaned/all_funding_for_tableau.csv',
        'C:/Users/AM/OneDrive/Desktop/capstone/data/cleaned/cleaned_crunchbase_data.csv',
        'C:/Users/AM/OneDrive/Desktop/capstone/data/cleaned/cleaned_usa_spending_data.csv',
    )
)

In [6]:
# rename the name/state columns of both lookup tables to prefixed key names
# (s_* on crunchbase_data, tc_* on usa_spending) ahead of the merges
crunchbase_data = crunchbase_data.rename(
    {'clean_name': 's_company_name', 'state': 's_state_code'},
    axis='columns',
)
usa_spending = usa_spending.rename(
    {'clean_recipient_name': 'tc_company_name', 'recipient_state_code': 'tc_state_code'},
    axis='columns',
)

Merge dataframes to bring geographic data into all_funding

In [12]:
# list every frame's column labels so the merge keys can be checked by eye
print("all_funding columns:", list(all_funding.columns))
print("crunchbase_data columns:", list(crunchbase_data.columns))
print("usa_spending columns:", list(usa_spending.columns))

all_funding columns: ['year', 'company_name', 'amount', 'category']
crunchbase_data columns: ['company_name', 'year_founded', 'Industries', 'hq', 'zipcode', 'description', 'operating_status', 'cb_rank', 'funding_status', 'funding_date', 'funding_type', 'ipo_status', 'estimated_revenue', 'num_employees', 'num_funding_rounds', 'total_funding', 'top_investors', 'num_investors', 'operating_status_binary', 'num_employees_cat', 's_company_name', 'city', 'S_state_code', 'country']
usa_spending columns: ['contract_award_unique_key', 'award_id_piid', 'total_obligated_amount', 'current_total_value_of_award', 'potential_total_value_of_award', 'award_base_action_date', 'award_base_action_year', 'current_end_date', 'awarding_agency_name', 'awarding_sub_agency_name', 'funding_agency_name', 'funding_sub_agency_name', 'recipient_name', 'recipient_parent_name', 'recipient_city_name', 'tc_state_code', 'recipient_zipcode', 'primary_place_of_performance_city_name', 'primary_place_of_performance_state_code

In [13]:
# align the key names: fix the capitalised 'S_state_code' label if it is present
# (rename ignores labels that do not exist) and strip the 'tc_' prefix so
# usa_spending exposes plain company_name / state_code columns
crunchbase_data = crunchbase_data.rename({'S_state_code': 's_state_code'}, axis='columns')
usa_spending = usa_spending.rename(
    {'tc_company_name': 'company_name', 'tc_state_code': 'state_code'},
    axis='columns',
)

In [14]:
# Merge all_funding with crunchbase_data to pick up the Crunchbase state.
#
# A left merge emits one output row per matching right-hand row, so any
# repeated (s_company_name, year_founded) pair in crunchbase_data would
# duplicate funding rows -- and their `amount` -- in all_funding. The lookup is
# therefore reduced to a single row per key first. Rows without a state are
# discarded before de-duplicating so that the row kept for a key is the first
# one that actually carries a state.
crunchbase_states = (
    crunchbase_data[['s_company_name', 's_state_code', 'year_founded']]
    .dropna(subset=['s_state_code'])
    .drop_duplicates(subset=['s_company_name', 'year_founded'])
)

all_funding = all_funding.merge(
    crunchbase_states,
    left_on=['company_name', 'year'],
    right_on=['s_company_name', 'year_founded'],
    how='left',
    validate='many_to_one',  # raise instead of silently multiplying funding rows
)

In [15]:
# Merge with usa_spending to pick up the recipient state.
#
# usa_spending lists a contract_award_unique_key column, i.e. it appears to hold
# one row per award, so the same (company_name, award_base_action_year) pair can
# occur many times. Merging it as-is would repeat each funding row once per
# matching award and inflate the totals built on `amount`. Keep one row per key
# instead; rows without a state are discarded first so the surviving row is the
# first one that actually carries a state.
usa_spending_states = (
    usa_spending[['company_name', 'state_code', 'award_base_action_year']]
    .dropna(subset=['state_code'])
    .drop_duplicates(subset=['company_name', 'award_base_action_year'])
)

all_funding = all_funding.merge(
    usa_spending_states,
    left_on=['company_name', 'year'],
    right_on=['company_name', 'award_base_action_year'],
    how='left',
    validate='many_to_one',  # raise instead of silently multiplying funding rows
)

In [16]:
# coalesce the two state sources into state_code: keep s_state_code where it is
# present, otherwise fall back to the state_code value already on the row
all_funding['state_code'] = all_funding['s_state_code'].where(
    all_funding['s_state_code'].notna(),
    all_funding['state_code'],
)

# remove the helper columns that were only needed for the joins
all_funding = all_funding.drop(
    columns=['s_company_name', 'year_founded', 's_state_code', 'award_base_action_year']
)

# check the result: sample rows, number of distinct states, number of missing states
print(all_funding.head())
print(all_funding['state_code'].nunique())
print(all_funding['state_code'].isna().sum())

   year         company_name      amount                 category state_code
0  2020  3D ARRAY TECHNOLOGY   124994.00  Traditional Contractors         CT
1  2020             3DSENSIR   124536.00  Traditional Contractors         CA
2  2021         3I3SIGNATURE        0.00                 Startups    Florida
3  2016    4D TECH SOLUTIONS  3998527.52  Traditional Contractors         WV
4  2018    4D TECH SOLUTIONS   746352.00  Traditional Contractors         WV
92
538


In [20]:
def standardize_state_codes(df):
    """Convert full state names in ``df['state_code']`` to two-letter codes.

    Matching ignores letter case and surrounding/repeated whitespace, so
    ``"Florida"``, ``"florida"`` and ``" Florida "`` all become ``"FL"``.
    Values that already are a known two-letter code are returned upper-cased
    and trimmed. Anything else (unknown strings, NaN/None, non-string values)
    is left exactly as it was.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``state_code`` column. The column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Raises
    ------
    KeyError
        If ``df`` has no ``state_code`` column.
    """
    # dictionary to map full state names to codes
    state_to_code = {
        "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR", "California": "CA",
        "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "Florida": "FL", "Georgia": "GA",
        "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
        "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
        "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
        "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH",
        "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY", "North Carolina": "NC",
        "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA",
        "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN",
        "Texas": "TX", "Utah": "UT", "Vermont": "VT", "Virginia": "VA", "Washington": "WA",
        "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
        "District of Columbia": "DC", "Puerto Rico": "PR"
    }

    # case-insensitive view of the mapping, plus the set of codes it can produce
    name_lookup = {name.casefold(): code for name, code in state_to_code.items()}
    known_codes = set(state_to_code.values())

    # function to replace full state name with code
    def replace_state(state):
        if not isinstance(state, str):
            return state  # NaN / None / non-text values pass through untouched

        # trim the ends and collapse internal runs of whitespace before matching
        cleaned = " ".join(state.split())

        code = name_lookup.get(cleaned.casefold())
        if code is not None:
            return code

        if cleaned.upper() in known_codes:
            return cleaned.upper()  # already a code, possibly lower-case or padded

        return state  # not recognised: return as is

    # apply the replacement function to the state_code column
    df['state_code'] = df['state_code'].apply(replace_state)

    return df

In [21]:
# run the state-name normalisation over all_funding
all_funding = all_funding.pipe(standardize_state_codes)

# check the result: sample rows, number of distinct states, number of missing states
print(
    all_funding.head(),
    all_funding['state_code'].nunique(),
    all_funding['state_code'].isna().sum(),
    sep='\n',
)

   year         company_name      amount                 category state_code
0  2020  3D ARRAY TECHNOLOGY   124994.00  Traditional Contractors         CT
1  2020             3DSENSIR   124536.00  Traditional Contractors         CA
2  2021         3I3SIGNATURE        0.00                 Startups         FL
3  2016    4D TECH SOLUTIONS  3998527.52  Traditional Contractors         WV
4  2018    4D TECH SOLUTIONS   746352.00  Traditional Contractors         WV
52
538


In [23]:
# write the enriched frame to a separate "updated_" CSV, without the index column
all_funding.to_csv(
    path_or_buf='C:/Users/AM/OneDrive/Desktop/capstone/data/cleaned/updated_all_funding_for_tableau.csv',
    index=False,
)

In [24]:
import os
import shutil

# define file paths
old_file = 'C:/Users/AM/OneDrive/Desktop/capstone/data/cleaned/all_funding_for_tableau.csv'
new_file = 'C:/Users/AM/OneDrive/Desktop/capstone/data/cleaned/updated_all_funding_for_tableau.csv'

# Move the updated export over the original file name.
# os.replace overwrites an existing destination in a single call, so the
# original is never deleted before the replacement is in place. The earlier
# remove-then-rename sequence would lose the original if the rename failed
# (for example because new_file was missing).
# NOTE: this overwrites the CSV that the notebook reads as its input, so a
# second run starts from a file that already has a state_code column.
os.replace(new_file, old_file)

print("File updated successfully!")

File updated successfully!
