# Predicting YCombinator Startup Performance

### Load dataset

In [48]:
import pandas as pd
import os
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np



In [49]:
badges = pd.read_csv('/Users/siddarvind/Downloads/archive-7/badges.csv')
companies = pd.read_csv('/Users/siddarvind/Downloads/archive-7/companies.csv')
founders = pd.read_csv('/Users/siddarvind/Downloads/archive-7/founders.csv')
industries = pd.read_csv('/Users/siddarvind/Downloads/archive-7/industries.csv')
prior_companies = pd.read_csv('/Users/siddarvind/Downloads/archive-7/prior_companies.csv')
regions = pd.read_csv('/Users/siddarvind/Downloads/archive-7/regions.csv')
schools = pd.read_csv('/Users/siddarvind/Downloads/archive-7/schools.csv')
tags = pd.read_csv('/Users/siddarvind/Downloads/archive-7/tags.csv')

badges['badge_num'] = badges.groupby('id').cumcount() + 1
df_pivot = badges.pivot(index='id', columns='badge_num', values='badge')
df_pivot.columns = [f'badge_{int(col)}' for col in df_pivot.columns]
badges = df_pivot.reset_index()

industries['industry_num'] = industries.groupby('id').cumcount() + 1
df_pivot = industries.pivot(index='id', columns='industry_num', values='industry')
df_pivot.columns = [f'industry_{int(col)}' for col in df_pivot.columns]
industries = df_pivot.reset_index()

tags['tag_num'] = tags.groupby('id').cumcount() + 1
df_pivot = tags.pivot(index='id', columns='tag_num', values='tag')
df_pivot.columns = [f'tag_{int(col)}' for col in df_pivot.columns]
tags = df_pivot.reset_index()

schools = schools.drop_duplicates(subset='hnid', keep='first')
ivy_league_schools = [
    'Harvard University',
    'Yale University',
    'Princeton University',
    'Columbia University',
    'University of Pennsylvania',
    'Dartmouth College',
    'Brown University',
    'Cornell University'
]

#TODO why all false?
def is_ivy_league(school_name):
    # Strip whitespace and use fuzzy matching
    school_name = str(school_name)
    school_name = school_name.strip().lower()
    # Get the best match and its score
    best_match, score = process.extractOne(school_name, ivy_league_schools, scorer=fuzz.ratio)
    return score >= 80

schools['is_ivy_league'] = schools['school'].apply(is_ivy_league)
faang_companies = [
    'Facebook',
    'Apple',
    'Meta',
    'Amazon',
    'Netflix',
    'Microsoft',
    'Google'
]
def is_faang(company_name):
    company_name = str(company_name)
    company_name = company_name.strip().lower()
    best_match, score = process.extractOne(company_name, faang_companies, scorer=fuzz.ratio)
    return score >= 80  # Threshold for matching

prior_companies['is_faang'] = prior_companies['company'].apply(is_faang)

founder_dfs = [founders, schools, prior_companies]
company_dfs = [badges, companies, industries, regions, tags]

for i, df in enumerate(company_dfs):
    if 'Unnamed: 0' in df.columns:
        df.drop(columns=['Unnamed: 0'], inplace=True)

merged_df = company_dfs[0]

for i, df in enumerate(company_dfs[1:], start=1):
    merged_df = pd.merge(merged_df, df, on='id', how='inner')

merged_df.rename(columns={'slug': 'company_slug'}, inplace=True)
#merged_df = pd.merge(merged_df, founders, on='company_slug', how='inner')
#merged_df = pd.merge(merged_df, schools, on='hnid', how='inner')
#merged_df = pd.merge(merged_df, prior_companies, on='hnid', how='inner')

merged_df




Unnamed: 0,id,badge_1,badge_2,badge_3,badge_4,name,company_slug,website,smallLogoUrl,oneLiner,...,industry_2,region,country,address,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6
0,5,isHiring,,,,CircuitHub,circuithub,https://circuithub.com,https://bookface-images.s3.amazonaws.com/small...,Electronics factory automation,...,Manufacturing and Robotics,Europe,United Kingdom,"London, UK",Hard Tech,Hardware,Robotics,,,
1,8,topCompany,highlightWomen,,,PlanGrid,plangrid,http://plangrid.com,https://bookface-images.s3.amazonaws.com/small...,Mobile applications for the construction indus...,...,Construction,America / Canada,United States of America,"San Francisco, CA, USA",Construction,,,,,
2,10,highlightWomen,,,,The Muse,the-muse,http://themuse.com,https://bookface-images.s3.amazonaws.com/small...,The Muse is values-based job search & hiring f...,...,Recruiting and Talent,America / Canada,United States of America,"New York, NY, USA",Marketplace,SaaS,Recruiting,,,
3,11,highlightBlack,,,,SendHub,sendhub,http://sendhub.com,https://bookface-images.s3.amazonaws.com/small...,Simple Business SMS. SMBs to Large Enterprise.,...,Productivity,America / Canada,United States of America,"Pleasanton, CA, USA",Messaging,Enterprise,SMB,,,
4,19,isHiring,,,,Rescale,rescale,https://rescale.com,https://bookface-images.s3.amazonaws.com/small...,High Performance Computing Built for the Cloud,...,"Engineering, Product and Design",America / Canada,United States of America,"San Francisco, CA, USA",Cloud Computing,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1978,29964,isHiring,,,,Overlap,overlap,https://www.joinoverlap.com/,https://bookface-images.s3.amazonaws.com/small...,Overlap is Perplexity for video & audio content,...,Content,America / Canada,United States of America,"San Francisco, CA, USA",Consumer,Media,Podcasts,AI,,
1979,29967,highlightBlack,,,,Village Labs,village-labs,https://villagelabs.app/,https://bookface-images.s3.amazonaws.com/small...,Helping America's businesses become employee o...,...,Human Resources,America / Canada,United States of America,"New York, NY, USA",Fintech,Human Resources,,,,
1980,29972,highlightBlack,,,,Affil.ai,affil-ai,https://affil.ai/,https://bookface-images.s3.amazonaws.com/small...,AI Affiliate Network For Financial Companies,...,Marketing,America / Canada,United States of America,"San Francisco, CA, USA",SaaS,B2B,Compliance,Marketing,AI,
1981,29977,highlightWomen,,,,Merlin AI,merlin-ai,https://www.merlinai.co,https://bookface-images.s3.amazonaws.com/small...,AI powered ERP for Construction,...,,America / Canada,United States of America,"Los Angeles, CA, USA",SaaS,Construction,B2B,Enterprise Software,AI,


### Removing the duplicates

In [50]:
def choose_row(group):
    na_counts = group.isna().sum(axis=1)
    return group.loc[na_counts.idxmin()]

merged_df = merged_df.groupby('name').apply(choose_row).reset_index(drop=True)
merged_df

  merged_df = merged_df.groupby('name').apply(choose_row).reset_index(drop=True)


Unnamed: 0,id,badge_1,badge_2,badge_3,badge_4,name,company_slug,website,smallLogoUrl,oneLiner,...,industry_2,region,country,address,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6
0,29798,highlightBlack,,,,&AI,ai-2,https://www.tryandai.com/,https://bookface-images.s3.amazonaws.com/small...,The secure patent due diligence and management...,...,Legal,America / Canada,United States of America,"San Francisco, CA, USA",SaaS,B2B,LegalTech,,,
1,1700,highlightWomen,,,,10 By 10,10-by-10,https://10by10.io,https://bookface-images.s3.amazonaws.com/small...,10by10 is a unique marketplace in the recruiti...,...,Recruiting and Talent,America / Canada,United States of America,"San Francisco, CA, USA",Artificial Intelligence,Marketplace,Recruiting,,,
2,12286,highlightBlack,highlightWomen,,,1910 Genetics,1910-genetics,http://1910genetics.com,https://bookface-images.s3.amazonaws.com/small...,Make undruggable targets a thing of the past,...,Therapeutics,America / Canada,United States of America,"Cambridge, MA, USA",AI-powered Drug Discovery,,,,,
3,27925,isHiring,,,,222,222,https://222.place,https://bookface-images.s3.amazonaws.com/small...,marketplace facilitating IRL social experience...,...,,America / Canada,United States of America,"New York, NY, USA",Machine Learning,Marketplace,Consumer,Social,AI,
4,12252,highlightBlack,,,,54Gene,54gene,http://www.54gene.com,https://bookface-images.s3.amazonaws.com/small...,Precision medicine for Africans and the global...,...,Consumer Health and Wellness,Africa,Nigeria,"Lagos, Nigeria",Genomics,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1960,25691,isHiring,,,,voize,voize,https://voize.de/,https://bookface-images.s3.amazonaws.com/small...,App for healthcare workers to fill out forms u...,...,Productivity,Europe,Germany,"Berlin, Germany",Healthcare,,,,,
1961,21906,highlightWomen,,,,webapp.io,webapp-io,https://webapp.io,https://bookface-images.s3.amazonaws.com/small...,Build webapps faster with 10x better CI/CD + p...,...,"Engineering, Product and Design",America / Canada,Canada,"Toronto, ON, Canada",Developer Tools,SaaS,B2B,,,
1962,22856,isHiring,,,,weweb.io,weweb-io,https://www.weweb.io/,https://bookface-images.s3.amazonaws.com/small...,WeWeb is a no-code front-end builder that conn...,...,Operations,Europe,France,"Paris, France",Developer Tools,Web Development,,,,
1963,25977,highlightWomen,,,,yhangry,yhangry,http://www.yhangry.com,https://bookface-images.s3.amazonaws.com/small...,Marketplace for private chefs,...,Food and Beverage,Europe,United Kingdom,"London, UK",Marketplace,,,,,


### Create variables

In [51]:
def choose_row(group):
    na_counts = group.isna().sum(axis=1)
    return group.loc[na_counts.idxmin()]

merged_df = merged_df.groupby('name').apply(choose_row).reset_index(drop=True)

print("DataFrame after removing duplicates with the most NaNs:")
print(merged_df)

DataFrame after removing duplicates with the most NaNs:
         id          badge_1         badge_2 badge_3 badge_4           name  \
0     29798   highlightBlack             NaN     NaN     NaN            &AI   
1      1700   highlightWomen             NaN     NaN     NaN       10 By 10   
2     12286   highlightBlack  highlightWomen     NaN     NaN  1910 Genetics   
3     27925         isHiring             NaN     NaN     NaN            222   
4     12252   highlightBlack             NaN     NaN     NaN         54Gene   
...     ...              ...             ...     ...     ...            ...   
1960  25691         isHiring             NaN     NaN     NaN          voize   
1961  21906   highlightWomen             NaN     NaN     NaN      webapp.io   
1962  22856         isHiring             NaN     NaN     NaN       weweb.io   
1963  25977   highlightWomen             NaN     NaN     NaN        yhangry   
1964  29532  highlightLatinx             NaN     NaN     NaN    Ångström AI

  merged_df = merged_df.groupby('name').apply(choose_row).reset_index(drop=True)


In [69]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
import pandas as pd

#Take the TFA and gets the number value
def getTotalFundingAmount(str):
    arr = str.split(' ')
    pattern = r'^\$\d+(\.\d+)?[a-zA-Z]$'
    for elem in arr:
        if(re.fullmatch(pattern, elem)):
            return elem[1:len(elem)]

#get number from a string
def getNum(str):
    match = re.search(r'\d+(\.\d+)?', str)
    if match:
        return match.group(0) 
    return None

# Set up Chrome options
options = Options()
options.add_experimental_option("detach", True)

# Initialize Web Driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

columns = ['Company', 'TFA', 'Funding Rounds', 'Investors', 'Lead Investors']
df = pd.DataFrame(columns=columns)

companies = ['54gene', 'weweb'] #For testing

# Navigate to the URL
for company in merged_df['company_slug']: #if testing, change merged_df['company_slug] to companies array
    driver.get(f"https://www.crunchbase.com/organization/{company}/company_financials")

    try:
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//span[contains(text(), 'It appears our system has mistaken you for a bot')]")))
        print(f"Cloudflare protection detected for {company}. Skipping...")
        continue 
    except:
        pass

    try:
        TFA=None
        funding_rounds=None
        l_investors=None
        investors=None
        # Locate the "Funding Rounds" label and its number
        right_divs = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class, 'one-of-many-section') and contains(@class, 'ng-star-inserted')]"))
        )
        print(len(right_divs))
        first_div = right_divs[0]
        right_div = right_divs[1]
        new_text = right_div.text.replace('\n', ' ')
        print(new_text.split(' '))
        print(f'TFA: {getTotalFundingAmount(new_text)}')
        TFA = getTotalFundingAmount(new_text)
        print('')
        boxes = first_div.find_elements(By.XPATH, ".//div[@class='spacer ng-star-inserted']")
        for box in boxes:
            print(box.text)
            txt = box.text
            if "Funding Rounds" in txt:
                funding_rounds = getNum(txt)
            elif "Lead Investors" in txt:
                l_investors = getNum(txt)
            elif "Investors" in txt:
                investors = getNum(txt)
        new_row = pd.DataFrame({'Company': [company], 'TFA': [TFA], 'Funding Rounds': [funding_rounds], 'Lead Investors': [l_investors], 'Investors': [investors]})
        df = pd.concat([df, new_row], ignore_index=True)

    except Exception as e:
        print(f"An error occurred: {e}")
        continue

driver.quit()


An error occurred: Message: 

2
['Funding', '10by10.io', 'has', 'raised', 'a', 'total', 'of', '$120K', 'in', 'funding', 'over', '1', 'round.', 'This', 'was', 'a', 'Seed', 'round', 'raised', 'on', 'Aug', '22,', '2017.', '10by10.io', 'is', 'funded', 'by', 'Y', 'Combinator.', 'UNLOCK', 'FOR', 'FREE']
TFA: 120K

Funding Rounds 
1
Investors 
1
2
['Funding', '1910', 'Genetics', 'has', 'raised', 'a', 'total', 'of', '$26.7M', 'in', 'funding', 'over', '4', 'rounds.', 'Their', 'latest', 'funding', 'was', 'raised', 'on', 'Sep', '30,', '2021', 'from', 'a', 'Grant', 'round.', '1910', 'Genetics', 'is', 'funded', 'by', '10', 'investors.', 'National', 'Institute', 'of', 'Neurological', 'Disorders', 'and', 'Stroke', 'and', 'Playground', 'Global', 'are', 'the', 'most', 'recent', 'investors.', 'UNLOCK', 'FOR', 'FREE']
TFA: 26.7M

Funding Rounds 
4
Lead Investors 
3
Investors 
10
2
['Funding', '222', 'has', 'raised', 'a', 'total', 'of', '$3.6M', 'in', 'funding', 'over', '2', 'rounds.', 'Their', 'latest', 

In [68]:
df
#for company in merged_df['company_slug']:
#    print(company)

Unnamed: 0,Company,TFA,Funding Rounds,Investors,Lead Investors
0,10-by-10,120K,1,1,
1,1910-genetics,26.7M,4,10,3
2,222,3.6M,2,16,2
3,54gene,94.7M,9,24,3
4,8vdx,,,,
...,...,...,...,...,...
76,aragorn-ai,500K,1,2,
77,archform,8M,3,2,1
78,arcus,,,,
79,ardis-ai,150K,1,2,1
