In [1]:
import os
import json
import numpy as np
import pandas as pd
import operator
from datetime import datetime
import datetime
import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Build the path to the raw CSV under <current working directory>/raw_data
# and load it into the working DataFrame.
data_path = os.path.join(os.path.abspath(os.getcwd()), 'raw_data')
startups_csv_path = os.path.join(data_path, 'startups.csv')
df = pd.read_csv(startups_csv_path)

# Data exploration

In [3]:
df.shape

(314486, 19)

In [4]:
# Remove rows that are exact duplicates across every column, then show the
# remaining shape.
df = df.drop_duplicates()
df.shape

(312667, 19)

In [5]:
df.columns

Index(['id', 'name', 'website', 'short_description', 'ipo_status',
       'founded_on', 'went_public_on', 'exited_on', 'num_funding_rounds',
       'last_equity_funding_type', 'last_equity_funding_total',
       'last_funding_at', 'headquartersCountry', 'headquartersRegion',
       'employeeCount', 'industry_name', 'technology_name', 'announcedOn',
       'moneyRaised'],
      dtype='object')

In [6]:
df.head()

Unnamed: 0,id,name,website,short_description,ipo_status,founded_on,went_public_on,exited_on,num_funding_rounds,last_equity_funding_type,last_equity_funding_total,last_funding_at,headquartersCountry,headquartersRegion,employeeCount,industry_name,technology_name,announcedOn,moneyRaised
0,75139f4c-6d9d-4a03-8b72-61f73f7be74a,Youcan Robot,https://www.youcanrobot.com/,Youcan Robot is an integrated company in the w...,private,2016-01-01,,,1,seed,,2018-05-30,CN,Shanghai,31.0,Video,Science and Engineering,2018-05-30 00:00:00.000,
1,75139f4c-6d9d-4a03-8b72-61f73f7be74a,Youcan Robot,https://www.youcanrobot.com/,Youcan Robot is an integrated company in the w...,private,2016-01-01,,,1,seed,,2018-05-30,CN,Shanghai,31.0,Video,Hardware,2018-05-30 00:00:00.000,
2,75139f4c-6d9d-4a03-8b72-61f73f7be74a,Youcan Robot,https://www.youcanrobot.com/,Youcan Robot is an integrated company in the w...,private,2016-01-01,,,1,seed,,2018-05-30,CN,Shanghai,31.0,Computer Hardware,Science and Engineering,2018-05-30 00:00:00.000,
3,75139f4c-6d9d-4a03-8b72-61f73f7be74a,Youcan Robot,https://www.youcanrobot.com/,Youcan Robot is an integrated company in the w...,private,2016-01-01,,,1,seed,,2018-05-30,CN,Shanghai,31.0,Computer Hardware,Hardware,2018-05-30 00:00:00.000,
4,75139f4c-6d9d-4a03-8b72-61f73f7be74a,Youcan Robot,https://www.youcanrobot.com/,Youcan Robot is an integrated company in the w...,private,2016-01-01,,,1,seed,,2018-05-30,CN,Shanghai,31.0,Media and Entertainment,Science and Engineering,2018-05-30 00:00:00.000,


In [7]:
df.describe()

Unnamed: 0,num_funding_rounds,employeeCount
count,312667.0,300442.0
mean,3.406656,62.002882
std,2.419148,444.24482
min,1.0,0.0
25%,2.0,6.0
50%,3.0,31.0
75%,5.0,31.0
max,22.0,32098.0


In [8]:
# Percentage of missing values per column, indexed by column name.
row_count = len(df)
percent_missing = df.isnull().sum() * 100 / row_count
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)
missing_value_df

Unnamed: 0,column_name,percent_missing
id,id,0.0
name,name,0.0
website,website,1.021534
short_description,short_description,0.000959
ipo_status,ipo_status,0.0
founded_on,founded_on,0.0
went_public_on,went_public_on,98.285716
exited_on,exited_on,92.975274
num_funding_rounds,num_funding_rounds,0.0
last_equity_funding_type,last_equity_funding_type,8.140929


In [9]:
# Keep the columns whose share of missing values is at most the threshold.
# NOTE(review): a percentage can never exceed 100, so this threshold keeps
# every column; lower it if sparse columns should actually be dropped.
max_percent_missing = 100
keep_mask = missing_value_df['percent_missing'] <= max_percent_missing
columns_to_keep = missing_value_df[keep_mask].index
len(columns_to_keep)

19

In [10]:
columns_to_keep

Index(['id', 'name', 'website', 'short_description', 'ipo_status',
       'founded_on', 'went_public_on', 'exited_on', 'num_funding_rounds',
       'last_equity_funding_type', 'last_equity_funding_total',
       'last_funding_at', 'headquartersCountry', 'headquartersRegion',
       'employeeCount', 'industry_name', 'technology_name', 'announcedOn',
       'moneyRaised'],
      dtype='object')

In [11]:
# Restrict the DataFrame to the retained columns and confirm the resulting shape.
df=df[columns_to_keep]
df.shape

(312667, 19)

In [12]:
# Find every column whose lower-cased name contains the search string, then
# print the fraction of rows holding a non-null value in each matching column.
string_list = ['technology_name']
string_to_search = string_list[0]

columns_list = [name for name in df.columns if string_to_search in name.lower()]
print(columns_list)

total_rows = df.shape[0]
for string in columns_list:
    filled_rows = df[df[string].notnull()].shape[0]
    print(f'{string}: {round(filled_rows/total_rows,4)}')

['technology_name']
technology_name: 0.9522


In [13]:
df.ipo_status.value_counts()

private     307633
public        4957
delisted        77
Name: ipo_status, dtype: int64

In [14]:
df.name.nunique()

55665

In [15]:
df.last_equity_funding_type.unique()

array(['seed', 'pre_seed', nan, 'series_unknown', 'private_equity',
       'series_a', 'angel', 'equity_crowdfunding',
       'initial_coin_offering', 'series_b', 'series_c', 'corporate_round',
       'post_ipo_equity', 'series_d', 'undisclosed', 'series_e',
       'series_f', 'series_g', 'series_h'], dtype=object)

In [16]:
df.name.value_counts()

Arthur Intelligence          216
Embodied                     144
SparkCharge                  120
Summit Nanotech              120
Flume                        120
                            ... 
Tabnex                         1
WinMiner                       1
Phoenix Natural Resources      1
WeldNote                       1
LeafyMade                      1
Name: name, Length: 55665, dtype: int64

# Fill missing and format

## last_equity_funding_total and moneyRaised

In [17]:
df['last_equity_funding_total'].unique()

array([nan, '{"currency":"USD","amount":50000000,"amountUSD":50000000}',
       '{"currency":"CNY","amount":118000000,"amountUSD":17616400}', ...,
       '{"currency":"USD","amount":3560999900,"amountUSD":3560999900}',
       '{"currency":"USD","amount":1048459800,"amountUSD":1048459800}',
       '{"currency":"USD","amount":40439800,"amountUSD":40439800}'],
      dtype=object)

In [18]:
df['moneyRaised'].unique()

array([nan,
       '{"amount": 5000000, "currency": "USD", "amountUSD": 5000000}',
       '{"amount": 50000000, "currency": "USD", "amountUSD": 50000000}',
       ...,
       '{"amount": 11232100, "currency": "USD", "amountUSD": 11232100}',
       '{"amount": 1799800, "currency": "USD", "amountUSD": 1799800}',
       '{"amount": 40439800, "currency": "USD", "amountUSD": 40439800}'],
      dtype=object)

In [19]:
print(df['last_equity_funding_total'].isnull().sum(),df['moneyRaised'].isnull().sum())

77533 90069


In [20]:
def preprocess_moneyRaised(x):
    """Extract the USD amount from a JSON-encoded money payload.

    Parameters
    ----------
    x : str, None or float NaN
        JSON object text such as
        '{"amount": 5000000, "currency": "USD", "amountUSD": 5000000}'.
        ``None``, NaN and blank strings are treated as "no amount".

    Returns
    -------
    float
        The value stored under ``"amountUSD"``. If that key is absent or null,
        the value under ``"value_usd"`` is used instead (this key appears in
        the null payload the notebook handles separately). When no amount is
        available at all, 0.0 is returned, matching the notebook's convention
        of representing missing amounts as zero.

    Raises
    ------
    json.JSONDecodeError
        If ``x`` is a non-blank string that is not valid JSON.
    TypeError
        If the decoded JSON is neither an object nor null.
    """
    # Missing cell: None or NaN (NaN is the only float not equal to itself).
    if x is None or (isinstance(x, float) and x != x):
        return 0.0
    # Blank text carries no payload; json.loads would raise on it.
    if isinstance(x, str) and not x.strip():
        return 0.0

    payload = json.loads(x)
    if payload is None:
        # JSON "null" means there is no amount.
        return 0.0
    if not isinstance(payload, dict):
        raise TypeError(
            f"expected a JSON object with an 'amountUSD' field, got {type(payload).__name__}"
        )

    amount_usd = payload.get("amountUSD")
    if amount_usd is None:
        # Fall back to the alternative key used by the null payload variant.
        amount_usd = payload.get("value_usd")

    return 0.0 if amount_usd is None else float(amount_usd)

In [21]:
#x=df['last_equity_funding_total'][2]
#print(x, preprocess_moneyRaised(x))

In [22]:
#json.loads(df['last_equity_funding_total'][2])['amountUSD']

In [23]:
# Convert the two JSON-encoded money columns into plain float USD amounts.
# Missing cells and the all-null payload are first replaced by a zero-amount
# payload so that every cell can be parsed.
ZERO_AMOUNT_JSON = '{"amount": 0, "currency": "USD", "amountUSD": 0}'
NULL_AMOUNT_JSON = '{"value":null,"currency":"USD","value_usd":null}'

for column in ['last_equity_funding_total', 'moneyRaised']:
    # Assign the result back to the column instead of calling
    # `df[column].replace(..., inplace=True)`: an in-place call on a selected
    # column may act on a temporary copy and leave `df` unchanged (this is the
    # behaviour under pandas Copy-on-Write).
    df[column] = (
        df[column]
        .fillna(ZERO_AMOUNT_JSON)
        .replace(NULL_AMOUNT_JSON, ZERO_AMOUNT_JSON)
        .apply(preprocess_moneyRaised)
    )

In [24]:
print(df['last_equity_funding_total'].isnull().sum(),df['moneyRaised'].isnull().sum())

0 0


In [25]:
print(df['last_equity_funding_total'].dtype, df['moneyRaised'].dtype)

float64 float64


## Date columns

In [26]:
# Parse each date-like column into pandas datetimes, one column at a time.
date_columns = ['founded_on', 'went_public_on', 'exited_on', 'last_funding_at', 'announcedOn']
for date_column in date_columns:
    df[date_column] = pd.to_datetime(df[date_column])

In [27]:
# Show the dtype of each date column after parsing.
for column, column_dtype in df[date_columns].dtypes.items():
    print(f'{column}: {column_dtype}')

founded_on: datetime64[ns]
went_public_on: datetime64[ns]
exited_on: datetime64[ns]
last_funding_at: datetime64[ns]
announcedOn: datetime64[ns]


In [28]:
# Count the missing values in each date column.
for column, missing_count in df[date_columns].isnull().sum().items():
    print(f'{column}: {missing_count}')

founded_on: 0
went_public_on: 307307
exited_on: 290703
last_funding_at: 0
announcedOn: 651


In [29]:
#for date_column in date_columns:
    #if df[date_column].isnull().sum()!=0:
        #df[date_column].fillna(datetime.datetime.min,inplace=True)

In [30]:
# Re-check the missing-value counts of the date columns. The fill step in the
# preceding cell is commented out, so no date column has been filled here.
for column in date_columns:
    print(f'{column}: {df[column].isnull().sum()}')

founded_on: 0
went_public_on: 307307
exited_on: 290703
last_funding_at: 0
announcedOn: 651


# Target encoding

In [31]:
print(df.went_public_on.notnull().sum(),df.went_public_on.isnull().sum(),df.went_public_on.notnull().sum()+df.went_public_on.isnull().sum())

5360 307307 312667


In [32]:
print(df.exited_on.notnull().sum(),df.exited_on.isnull().sum(),df.exited_on.notnull().sum()+df.exited_on.isnull().sum())

21964 290703 312667


In [33]:
df[(df.went_public_on.notnull()) | (df.exited_on.notnull())].shape

(21964, 19)

In [34]:
# Label a row 1.0 when it has a went_public_on date or an exited_on date, and
# 0.0 only when both dates are missing.
has_success_date = df.went_public_on.notnull() | df.exited_on.notnull()
df['Target'] = has_success_date.astype(float)

In [35]:
df.Target.value_counts()

0.0    290703
1.0     21964
Name: Target, dtype: int64

In [36]:
df.loc[(df.exited_on.notnull()) & (df.went_public_on.notnull())].shape

(5360, 20)

In [37]:
df[df.exited_on==df.went_public_on].shape

(5287, 20)

In [38]:
df[df.exited_on!=df.went_public_on].shape

(307380, 20)

In [39]:
df['exited_on'].dtype

dtype('<M8[ns]')

In [40]:
# Remove the went_public_on and exited_on columns from the DataFrame.
df = df.drop(columns=['went_public_on', 'exited_on'])

In [41]:
df.Target.value_counts()

0.0    290703
1.0     21964
Name: Target, dtype: int64

In [42]:
#df.loc[(df.Target==1) & (df.exited_on.notnull()), 'time_to_success'] = df['exited_on']-df['founded_on']
#df.loc[(df.Target==1) & (df.went_public_on.notnull()), 'time_to_success'] = df['went_public_on']-df['founded_on']

In [43]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312667 entries, 0 to 314485
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   id                         312667 non-null  object        
 1   name                       312667 non-null  object        
 2   website                    309473 non-null  object        
 3   short_description          312664 non-null  object        
 4   ipo_status                 312667 non-null  object        
 5   founded_on                 312667 non-null  datetime64[ns]
 6   num_funding_rounds         312667 non-null  int64         
 7   last_equity_funding_type   287213 non-null  object        
 8   last_equity_funding_total  312667 non-null  float64       
 9   last_funding_at            312667 non-null  datetime64[ns]
 10  headquartersCountry        311528 non-null  object        
 11  headquartersRegion         311485 non-null  object  

In [44]:
#df[df['time_to_success'].notnull()]['time_to_success']

In [45]:
#df['time_to_success']=df['time_to_success'].dt.components.days

In [46]:
#df[df['time_to_success'].notnull()].shape

In [47]:
df.isnull().sum()

id                               0
name                             0
website                       3194
short_description                3
ipo_status                       0
founded_on                       0
num_funding_rounds               0
last_equity_funding_type     25454
last_equity_funding_total        0
last_funding_at                  0
headquartersCountry           1139
headquartersRegion            1182
employeeCount                12225
industry_name                 9118
technology_name              14937
announcedOn                    651
moneyRaised                      0
Target                           0
dtype: int64

# Fill missing values

In [48]:
df.shape

(312667, 18)

In [49]:
# Fill the remaining missing values: object columns get the placeholder
# "unknown", int64 and float64 columns get 0. Datetime columns are not part of
# `column_types`, so they are left as they are.
column_types = ['object', 'int64', 'float64']

for column_type in column_types:
    # Placeholder used for every column of the current dtype.
    fill_value = "unknown" if column_type == 'object' else 0

    columns_to_fill = df.select_dtypes(include=column_type).columns.to_list()

    for column_to_fill in columns_to_fill:
        # A numeric 'time_to_success' column is deliberately left unfilled.
        # NOTE(review): no visible cell currently creates this column — confirm
        # whether the exclusion is still needed.
        if column_type != 'object' and column_to_fill == 'time_to_success':
            continue

        if df[column_to_fill].isnull().any():
            print(column_to_fill)
            # Assign back rather than `fillna(..., inplace=True)` on a selected
            # column: the in-place form may act on a temporary copy and leave
            # `df` unchanged (this is the behaviour under pandas Copy-on-Write).
            df[column_to_fill] = df[column_to_fill].fillna(fill_value)

website
short_description
last_equity_funding_type
headquartersCountry
headquartersRegion
industry_name
technology_name
employeeCount


In [50]:
#column_types=['object', 'int64', 'float64','datetime64[ns]']

#for column_type in column_types:

    #columns_to_fill= df.select_dtypes(include=column_type).columns.to_list()

    #for column_to_fill in columns_to_fill:
        
        #if df[column_to_fill].isnull().sum()!=0:
            
            #if column_type=='object': 
                #print(column_to_fill)
                #df[column_to_fill].fillna("unknown",inplace=True)
            
            #if column_type=='datetime64[ns]': 
                #print(column_to_fill)
                #df[column_to_fill].fillna(datetime.datetime.min,inplace=True)
                
            #else:
   
                #print(column_to_fill)
                #df[column_to_fill].fillna(0,inplace=True)

In [51]:
# Remove the website and short_description columns from the DataFrame.
df = df.drop(columns=['website', 'short_description'])

In [52]:
df.isnull().sum()

id                             0
name                           0
ipo_status                     0
founded_on                     0
num_funding_rounds             0
last_equity_funding_type       0
last_equity_funding_total      0
last_funding_at                0
headquartersCountry            0
headquartersRegion             0
employeeCount                  0
industry_name                  0
technology_name                0
announcedOn                  651
moneyRaised                    0
Target                         0
dtype: int64

In [53]:
df.last_equity_funding_type.unique()

array(['seed', 'pre_seed', 'unknown', 'series_unknown', 'private_equity',
       'series_a', 'angel', 'equity_crowdfunding',
       'initial_coin_offering', 'series_b', 'series_c', 'corporate_round',
       'post_ipo_equity', 'series_d', 'undisclosed', 'series_e',
       'series_f', 'series_g', 'series_h'], dtype=object)

In [54]:
df.shape

(312667, 16)

# Encoding industries and technologies

In [55]:
#def encode_column(df, column_to_encode):
    
    #encoder = OneHotEncoder(sparse=False)
    #df[encoder.get_feature_names_out()] = encoder.fit_transform(df[[column_to_encode]])
    
    #df.drop(columns=column_to_encode,inplace = True)
    
    #columns_lits=[column for column in df.columns.to_list() if column_to_encode in column]
    #columns_to_groupby=[column for column in df.columns.to_list() if column not in columns_lits]
    
    #df=df.groupby(columns_to_groupby).max().reset_index()
    
    #return df

In [56]:
#for column in ['industry_name','technology_name']:
    #print(column)
    #df=encode_column(df,column)
    #print(df.shape)