## Python Crunchbase Classifier Ver 2.0

This classifier identifies: 

1. Unique values in the Industries column of standard Crunchbase CSV-formatted files, as well as how many times each value appears in that column.
2. Countries in the Headquarters Location column, as well as how many times a particular country appears in it.
3. The frequency of each founding year (the date the company was founded)
4. Statistical data on Total Funding Amount (in USD) 
5. Organisations and their corresponding descriptions

*NEW WITH VERSION 2.0* 

The classifier also generates 3 bar graphs displaying startups according to:
1. Top 10 industries
2. Location
3. Founding Year 

Take note that you'll only be able to download datasets of up to <b> 1000 values </b> with our current version of Crunchbase.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# set up plotting aesthetics
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" theme to the figures created below.
sns.set_theme(style="whitegrid")

# 2. Load the raw data
# Empty strings, 'NA' and 'None' are parsed as missing values.
df = pd.read_csv('crunchbase_export.csv', na_values=['', 'NA', 'None'])
print("Initial shape:", df.shape)
# Print the preview explicitly: a bare `df.head()` is only rendered by the
# notebook when it is the last expression of a cell, and more statements
# follow it here, so its result would otherwise be silently discarded.
print(df.head())

# 3. Quick dtype & missing-value overview
# Show each column's dtype, then the ten columns with the largest
# fraction of missing entries (values are shares between 0 and 1).
print(df.dtypes)
print("\nMissing values per column:")
missing_share = df.isna().mean()
worst_first = missing_share.sort_values(ascending=False)
print(worst_first.head(10))


# 4. Drop duplicates & obviously blank rows
# Exact duplicate rows are removed first; rows in which every column is
# missing are then dropped as well (drop_duplicates alone would still keep
# one fully blank row). The index is renumbered from 0 afterwards.
df = df.drop_duplicates().dropna(how='all').reset_index(drop=True)

# 5. Parse dates and extract features
# Convert the three date columns to datetimes; values that cannot be
# parsed become NaT because of errors='coerce'.
date_cols = ('founded_on', 'first_funding_on', 'last_funding_on')
for date_col in date_cols:
    parsed = pd.to_datetime(df[date_col], errors='coerce')
    df[date_col] = parsed

# Calendar year in which the company was founded.
df['founding_year'] = df['founded_on'].dt.year

# Time between the first and the last funding date, expressed in years
# (whole days divided by 365).
funding_span = df['last_funding_on'] - df['first_funding_on']
df['funding_age_years'] = funding_span.dt.days / 365

# 6. Impute or drop columns with >60% missing
# Columns whose share of missing values exceeds 0.6 are removed outright.
# NOTE(review): this can also remove columns that later steps index by name
# (e.g. the date columns or the derived features) - confirm that is intended.
na_share = df.isna().mean()
high_na = na_share[na_share > 0.6].index
df = df.drop(columns=high_na)
print("Dropped high-NA cols:", list(high_na))

# 7. Fill remaining numeric NAs with median
# NOTE(review): the derived 'founding_year' and 'funding_age_years' columns
# are numeric, so they are median-filled here too; a later `.dropna()` on
# them then has nothing left to drop - confirm this is the desired behaviour.
num_cols = df.select_dtypes(include='number').columns
for name in num_cols:
    numeric_col = df[name]
    df[name] = numeric_col.fillna(numeric_col.median())

# 8. Fill categorical NAs with "Unknown"
# Only object-dtype columns are touched; datetime columns keep their NaT.
cat_cols = df.select_dtypes(include='object').columns
for name in cat_cols:
    df[name] = df[name].fillna('Unknown')


# 9. Distribution of companies by founding year
# 30-bin histogram of the founding years; rows without a year are skipped.
fig, ax = plt.subplots(figsize=(12, 4))
founding_years = df['founding_year'].dropna()
sns.histplot(founding_years, bins=30, kde=False, ax=ax)
ax.set_title("Number of Companies by Founding Year")
ax.set_xlabel("Year")
ax.set_ylabel("Count")

# 10. Top 15 Industries
# Horizontal bar chart of the 15 most frequent 'industry_group' values.
fig, ax = plt.subplots(figsize=(12, 5))
industry_counts = df['industry_group'].value_counts()
top_ind = industry_counts.head(15)
# NOTE(review): seaborn 0.13+ emits a FutureWarning when `palette` is given
# without `hue`; switch to `hue=top_ind.index, legend=False` once the
# minimum supported seaborn version allows it.
sns.barplot(x=top_ind.values, y=top_ind.index, palette="viridis", ax=ax)
ax.set_title("Top 15 Industry Groups")
ax.set_xlabel("Number of Companies")

# 11. Funding age vs. employee count
# Scatter of funding duration against head count. Both axes are clipped at
# the 95th percentile so a few extreme values do not squash the rest.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    data=df,
    x='funding_age_years',
    y='num_employees',
    alpha=0.5,
    ax=ax,
)
x_cap = df['funding_age_years'].quantile(0.95)
y_cap = df['num_employees'].quantile(0.95)
ax.set_xlim(0, x_cap)
ax.set_ylim(0, y_cap)
ax.set_title("Funding Duration vs. Employee Count")


# 12. Correlation heatmap for numeric features
# Pairwise correlations of all numeric columns, annotated to two decimals
# and drawn on a fixed [-1, 1] colour scale.
fig, ax = plt.subplots(figsize=(8, 6))
numeric_frame = df.select_dtypes('number')
corr = numeric_frame.corr()
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title("Numeric Feature Correlations")


# 13. Boxplots to spot outliers in funding rounds
# One box per funding column; the y axis is logarithmic.
fund_cols = ['first_funding_usd', 'last_funding_usd']
fig, ax = plt.subplots(figsize=(8, 4))
funding_frame = df[fund_cols]
sns.boxplot(data=funding_frame, ax=ax)
ax.set_yscale('log')
ax.set_title("Funding Amounts (log scale)")

# 14. Time series: companies funded per year
# Count companies by the year of their first funding date, ordered by year.
first_funding_years = df['first_funding_on'].dt.year
fund_year = first_funding_years.value_counts().sort_index()
fig, ax = plt.subplots(figsize=(12, 4))
sns.lineplot(x=fund_year.index, y=fund_year.values, ax=ax)
ax.set_title("Newly Funded Companies per Year")
ax.set_xlabel("Year")
ax.set_ylabel("Count")

# 15. Summary table of key metrics by industry
# Per industry group: median first-funding amount, number of non-missing
# first-funding values, and median employee count. Only the ten groups with
# the highest count are kept, largest first.
by_industry = df.groupby('industry_group')
industry_metrics = (
    by_industry
    .agg(
        funding_median=('first_funding_usd', 'median'),
        company_count=('first_funding_usd', 'count'),
        employees_median=('num_employees', 'median'),
    )
    .sort_values('company_count', ascending=False)
    .head(10)
)
industry_metrics


# 16. Final check
# Report the (rows, columns) shape of the DataFrame after the steps above.
print("Cleaned data shape:", df.shape)


