#### Merging data from Crunchbase and US Spending

In [2]:
pip install fuzzywuzzy

Collecting fuzzywuzzyNote: you may need to restart the kernel to use updated packages.

  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [6]:
pip install Levenshtein

Collecting Levenshtein
  Downloading Levenshtein-0.25.1-cp38-cp38-win_amd64.whl.metadata (3.4 kB)
Collecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein)
  Downloading rapidfuzz-3.9.7-cp38-cp38-win_amd64.whl.metadata (12 kB)
Downloading Levenshtein-0.25.1-cp38-cp38-win_amd64.whl (98 kB)
Downloading rapidfuzz-3.9.7-cp38-cp38-win_amd64.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------------------------------------- -- 1.6/1.7 MB 8.4 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 6.4 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.25.1 rapidfuzz-3.9.7
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process

**Load the Crunchbase data**:

In [2]:
# load the cleaned Crunchbase data
cb_data = pd.read_csv('C:/Users/AM/OneDrive/Desktop/capstone/data/cleaned/cleaned_crunchbase_data.csv')

# display the first few rows and data info to verify
print(cb_data.head())
print(cb_data.info())

           company_name  year_founded  \
0                DIMAAG          2018   
1               Netdata          2018   
2  Outward Technologies          2017   
3                TATINE          2015   
4                   X3M          2022   

                                          Industries  \
0  Artificial Intelligence (AI), Autonomous Vehic...   
1  Enterprise Software, Information Technology, I...   
2                       Aerospace, Commercial, Solar   
3   E-Commerce, Home Decor, Manufacturing, Wholesale   
4  Advertising, Advertising Platforms, Informatio...   

                                         hq  zipcode  \
0        Fremont, California, United States    94538   
1  San Francisco, California, United States    94104   
2       Broomfield, Colorado, United States    80020   
3          Chicago, Illinois, United States    60639   
4       Wilmington, Delaware, United States    19801   

                                         description operating_status  \
0  Bui

**Load the US Spending data**:

In [3]:
# load the cleaned USA Spending data
usa_spending = pd.read_csv('C:/Users/AM/OneDrive/Desktop/capstone/data/cleaned/cleaned_usa_spending_data.csv')

# display the first few rows and data info to verify
print(usa_spending.head())
print(usa_spending.info())

                   contract_award_unique_key  award_id_piid  \
0  CONT_AWD_80NSSC20C0664_8000_-NONE-_-NONE-  80NSSC20C0664   
1  CONT_AWD_80NSSC20C0151_8000_-NONE-_-NONE-  80NSSC20C0151   
2  CONT_AWD_80NSSC19C0347_8000_-NONE-_-NONE-  80NSSC19C0347   
3  CONT_AWD_80NSSC18C0024_8000_-NONE-_-NONE-  80NSSC18C0024   
4  CONT_AWD_80NSSC18C0093_8000_-NONE-_-NONE-  80NSSC18C0093   

   total_obligated_amount  current_total_value_of_award  \
0                122531.0                      122531.0   
1                741929.0                      741929.0   
2                121417.0                      121417.0   
3               1348131.0                      948134.0   
4                749999.0                      749999.0   

   potential_total_value_of_award award_base_action_date  \
0                        122531.0             2020-08-28   
1                        741929.0             2020-06-27   
2                        121417.0             2019-08-19   
3                        9

**Merging**

In [9]:
def optimized_levenshtein_merge(df1, df2, key1, key2, threshold=90):
    # create sets of unique company names
    names1 = set(df1[key1].unique())
    names2 = set(df2[key2].unique())
    
    # find matches for names in df1
    matches = {}
    for name in names1:
        match = process.extractOne(name, names2, score_cutoff=threshold)
        if match:
            matches[name] = match
    
    # create a mapping dictionary
    mapping = {name: match[0] for name, match in matches.items()}
    
    # apply the mapping to df1
    df1['match_company'] = df1[key1].map(mapping)
    df1['match_score'] = df1[key1].map(lambda x: matches.get(x, (None, None))[1])
    
    return df1

In [10]:
# perform the merge 
merged_data = optimized_levenshtein_merge(cb_data, usa_spending, 'clean_name', 'clean_recipient_name')

In [11]:
# display results
print(merged_data[['clean_name', 'match_company', 'match_score']].head(20))
print(f"Total matches found: {merged_data['match_company'].notna().sum()}")

                   clean_name match_company  match_score
0                      DIMAAG           NaN          NaN
1                     NETDATA           NaN          NaN
2        OUTWARD TECHNOLOGIES           NaN          NaN
3                      TATINE           NaN          NaN
4                         X3M           NaN          NaN
5                       DUEXT           NaN          NaN
6                       PEMRS           NaN          NaN
7       SOLSTAR SPACE COMPANY           NaN          NaN
8                      BLAKFX           NaN          NaN
9              REACTIVE SPACE           NaN          NaN
10                   SPACE K9           NaN          NaN
11          BESTMARK NATIONAL           NaN          NaN
12          APHELION ORBITALS           NaN          NaN
13            HEALTHHIVE, PBC           NaN          NaN
14                     EDEKEE           NaN          NaN
15           BEARLY SOLUTIONS           NaN          NaN
16       HUDSON SPACE SYSTEMS  

In [16]:
successful_matches = merged_data[merged_data['match_company'].notna()]
print(successful_matches[['clean_name', 'match_company', 'match_score']].head(20))

                  clean_name                match_company  match_score
20                   DIGITAL  TELEDYNE DIGITAL IMAGING US         90.0
21   BENCHMARK SPACE SYSTEMS      BENCHMARK SPACE SYSTEMS        100.0
59                    RADIAN   RADIANT ANALYTIC SOLUTIONS         90.0
62             SPACE FOUNDRY                SPACE FOUNDRY        100.0
74             LUNAR OUTPOST                LUNAR OUTPOST        100.0
75      APOGEE SEMICONDUCTOR         APOGEE SEMICONDUCTOR        100.0
80                      NOVI                         NOVI        100.0
83                  INVINOVO                          NVI         90.0
111              STOKE SPACE     STOKE SPACE TECHNOLOGIES         90.0
115                 NFLUX AI                        NFLUX         90.0
118                  G-SPACE                       GSPACE         92.0
132               WHYTESPACE                       ESPACE         90.0
133                  REDWIRE      REDWIRE SPACE SOLUTIONS         90.0
136   

In [20]:
# print column names for Crunchbase data
print("Crunchbase Data Columns:")
print(cb_data.columns.tolist())

# print column names for USA Spending data
print("\nUSA Spending Data Columns:")
print(usa_spending.columns.tolist())

Crunchbase Data Columns:
['company_name', 'year_founded', 'Industries', 'hq', 'zipcode', 'description', 'operating_status', 'cb_rank', 'funding_status', 'funding_date', 'funding_type', 'ipo_status', 'estimated_revenue', 'num_employees', 'num_funding_rounds', 'total_funding', 'top_investors', 'num_investors', 'operating_status_binary', 'num_employees_cat', 'clean_name', 'city', 'state', 'country', 'match_company', 'match_score']

USA Spending Data Columns:
['contract_award_unique_key', 'award_id_piid', 'total_obligated_amount', 'current_total_value_of_award', 'potential_total_value_of_award', 'award_base_action_date', 'award_base_action_year', 'current_end_date', 'awarding_agency_name', 'awarding_sub_agency_name', 'funding_agency_name', 'funding_sub_agency_name', 'recipient_name', 'recipient_parent_name', 'recipient_city_name', 'recipient_state_code', 'recipient_zipcode', 'primary_place_of_performance_city_name', 'primary_place_of_performance_state_code', 'award_type', 'type_of_contract

In [22]:
# renaming certain columns based on findings
merged_data = merged_data.rename(columns={
    'hq': 'cb_hq',  # Headquarters
    'Industries': 'industries' #lowercase
})

In [29]:
def review_matches(successful_matches):
    verified_matches = []
    for _, row in successful_matches.iterrows():
        print(f"\nCrunchbase: {row['clean_name']} ({row['city']})")
        print(f"USA Spending: {row['match_company']}")
        print(f"Match Score: {row['match_score']}")
        keep = input("Keep this match? (y/n): ").lower().strip()
        if keep == 'y':
            verified_matches.append(row)
    return pd.DataFrame(verified_matches)

verified_matches = review_matches(successful_matches)


Crunchbase: DIGITAL (Greenwich)
USA Spending: TELEDYNE DIGITAL IMAGING US
Match Score: 90.0

Crunchbase: BENCHMARK SPACE SYSTEMS (South Burlington)
USA Spending: BENCHMARK SPACE SYSTEMS
Match Score: 100.0

Crunchbase: RADIAN (Bellevue)
USA Spending: RADIANT ANALYTIC SOLUTIONS
Match Score: 90.0

Crunchbase: SPACE FOUNDRY (San Jose)
USA Spending: SPACE FOUNDRY
Match Score: 100.0

Crunchbase: LUNAR OUTPOST (Golden)
USA Spending: LUNAR OUTPOST
Match Score: 100.0

Crunchbase: APOGEE SEMICONDUCTOR (Plano)
USA Spending: APOGEE SEMICONDUCTOR
Match Score: 100.0

Crunchbase: NOVI (Arlington)
USA Spending: NOVI
Match Score: 100.0

Crunchbase: INVINOVO (San Diego)
USA Spending: NVI
Match Score: 90.0

Crunchbase: STOKE SPACE (Kent)
USA Spending: STOKE SPACE TECHNOLOGIES
Match Score: 90.0

Crunchbase: NFLUX AI (Los Angeles)
USA Spending: NFLUX
Match Score: 90.0

Crunchbase: G-SPACE (Sunnyvale)
USA Spending: GSPACE
Match Score: 92.0

Crunchbase: WHYTESPACE (Chicago)
USA Spending: ESPACE
Match Score:

In [30]:
# update the merged dataset with verified matches
merged_data.loc[verified_matches.index, 'match_company'] = verified_matches['match_company']
merged_data.loc[verified_matches.index, 'match_score'] = verified_matches['match_score']

# remove unverified matches
merged_data.loc[~merged_data.index.isin(verified_matches.index), ['match_company', 'match_score']] = np.nan

print(f"Total verified matches: {len(verified_matches)}")

Total verified matches: 50


In [32]:
# Filter to keep only rows with valid matches
matched_data = merged_data[merged_data['match_company'].notna()]

# Save the filtered DataFrame to a new CSV file
matched_data.to_csv('matched_data.csv', index=False)

print("Matched data saved as 'matched_data.csv'.")

Matched data saved as 'matched_data.csv'.
