In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import yaml
import warnings
import datetime
import sklearn
warnings.filterwarnings("ignore")
plt.style.use('ggplot')

### collect data from CSV

https://www.kaggle.com/manishkc06/startup-success-prediction

In [2]:
startup = pd.read_csv('data//in//startup_data.csv')

In [3]:
startup.head(5)

Unnamed: 0,unamed:0,state_code,latitude,longitude,zip_code,id,city,Unnamed: 6,name,labels,...,object_id,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
0,1005,CA,42.35888,-71.05682,92101,c:6669,San Diego,,Bandsintown,1.0,...,c:6669,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,acquired
1,204,CA,37.238916,-121.973718,95032,c:16283,Los Gatos,,TriCipher,1.0,...,c:16283,1.0,0.0,0.0,1.0,1.0,1.0,4.75,1.0,acquired
2,1001,CA,32.901049,-117.192656,92121,c:65620,San Diego,San Diego CA 92121,Plixi,1.0,...,c:65620,0.0,0.0,1.0,0.0,0.0,0.0,4.0,1.0,acquired
3,738,CA,37.320309,-122.05004,95014,c:42668,Cupertino,Cupertino CA 95014,Solidcore Systems,1.0,...,c:42668,0.0,0.0,0.0,1.0,1.0,1.0,3.3333,1.0,acquired
4,1002,CA,37.779281,-122.419236,94105,c:65806,San Francisco,San Francisco CA 94105,Inhale Digital,0.0,...,c:65806,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,closed


In [4]:
startup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 923 entries, 0 to 922
Data columns (total 49 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   unamed:0                  923 non-null    object 
 1   state_code                915 non-null    object 
 2   latitude                  915 non-null    float64
 3   longitude                 915 non-null    float64
 4   zip_code                  915 non-null    object 
 5   id                        915 non-null    object 
 6   city                      915 non-null    object 
 7   Unnamed: 6                428 non-null    object 
 8   name                      915 non-null    object 
 9   labels                    915 non-null    float64
 10  founded_at                915 non-null    object 
 11  closed_at                 333 non-null    object 
 12  first_funding_at          915 non-null    object 
 13  last_funding_at           915 non-null    object 
 14  age_first_

# Data cleaning

We take just the columns that could be used in a ML algorithm

In [5]:
startup_clean = startup[['name','age_first_funding_year','age_last_funding_year','age_first_milestone_year','age_last_milestone_year', 'relationships','funding_rounds', 'funding_total_usd','is_CA','is_TX','is_NY'
         ,'is_MA', 'is_otherstate', 'is_software', 'is_web', 'is_mobile', 'is_enterprise','is_advertising','is_gamesvideo','is_ecommerce','is_biotech','is_consulting','is_othercategory','has_angel',
        'has_VC','has_roundA','has_roundB','has_roundC','has_roundD','avg_participants', 'status']]

We convert the 'status', that will be our Y to predict, in a boolean variable

In [6]:
startup_clean['status'] = startup_clean['status'].replace('acquired', 1)
startup_clean['status'] = startup_clean['status'].replace('closed', 0)

startup_clean = startup_clean.rename(columns={'status': 'has_succeded'})

In [7]:
startup_clean = startup_clean[startup_clean['has_succeded'].notna()]

In [8]:
startup_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 915 entries, 0 to 922
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      915 non-null    object 
 1   age_first_funding_year    915 non-null    float64
 2   age_last_funding_year     915 non-null    float64
 3   age_first_milestone_year  764 non-null    float64
 4   age_last_milestone_year   764 non-null    float64
 5   relationships             915 non-null    float64
 6   funding_rounds            915 non-null    float64
 7   funding_total_usd         915 non-null    float64
 8   is_CA                     915 non-null    float64
 9   is_TX                     915 non-null    float64
 10  is_NY                     915 non-null    float64
 11  is_MA                     915 non-null    float64
 12  is_otherstate             915 non-null    float64
 13  is_software               915 non-null    float64
 14  is_web    

In [9]:
startup[startup['age_first_milestone_year'].isna()][['milestones','age_first_milestone_year','age_last_milestone_year']]

Unnamed: 0,milestones,age_first_milestone_year,age_last_milestone_year
12,0.0,,
15,0.0,,
24,0.0,,
28,,,
32,0.0,,
...,...,...,...
885,0.0,,
903,0.0,,
904,0.0,,
906,0.0,,


in order to avoid the null value on the fields 'age_first_milestone_year' and 'age_last_milestone_year' 
we procedeed in this way: <br>
    - calculate the mean of these value <br>
    - replace the nan with the mean

In [10]:
age_fmy_mean = startup_clean['age_first_milestone_year'].mean()
age_lmy_mean = startup_clean['age_last_milestone_year'].mean()

startup_clean['age_first_milestone_year']= startup_clean['age_first_milestone_year'].fillna(age_fmy_mean)
startup_clean['age_last_milestone_year']= startup_clean['age_last_milestone_year'].fillna(age_fmy_mean)

check on null values

In [11]:
startup_clean.isnull().sum()

name                        0
age_first_funding_year      0
age_last_funding_year       0
age_first_milestone_year    0
age_last_milestone_year     0
relationships               0
funding_rounds              0
funding_total_usd           0
is_CA                       0
is_TX                       0
is_NY                       0
is_MA                       0
is_otherstate               0
is_software                 0
is_web                      0
is_mobile                   0
is_enterprise               0
is_advertising              0
is_gamesvideo               0
is_ecommerce                0
is_biotech                  0
is_consulting               0
is_othercategory            0
has_angel                   0
has_VC                      0
has_roundA                  0
has_roundB                  0
has_roundC                  0
has_roundD                  0
avg_participants            0
has_succeded                0
dtype: int64

# Kfold validation

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

function to get score of a model in a simpler way

In [33]:
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return fit.score(X_test, y_test)

In [13]:
X = startup_clean[['age_first_funding_year','age_last_funding_year','age_first_milestone_year','age_last_milestone_year', 'relationships','funding_rounds', 'funding_total_usd','is_CA','is_TX','is_NY'
        ,'is_MA', 'is_otherstate', 'is_software', 'is_web', 'is_mobile', 'is_enterprise','is_advertising','is_gamesvideo','is_ecommerce','is_biotech','is_consulting','is_othercategory','has_angel',
       'has_VC','has_roundA','has_roundB','has_roundC','has_roundD','avg_participants']].to_numpy()

X1 =startup_clean[['age_first_funding_year', 'age_last_funding_year']]

y = startup_clean['has_succeded'].to_numpy()

In [14]:
print(X.shape)
print(y.shape)

(915, 29)
(915,)


In [15]:
X

array([[2.2493, 3.0027, 4.6685, ..., 0.    , 0.    , 1.    ],
       [5.126 , 9.9973, 7.0055, ..., 1.    , 1.    , 4.75  ],
       [1.0329, 1.0329, 1.4575, ..., 0.    , 0.    , 4.    ],
       ...,
       [8.4959, 8.4959, 9.0055, ..., 0.    , 1.    , 8.    ],
       [0.7589, 2.8329, 0.7589, ..., 0.    , 0.    , 1.    ],
       [3.1205, 3.1205, 4.0027, ..., 0.    , 0.    , 3.    ]])

In [16]:
y.sum()

591.0

In [59]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [46]:
folds = StratifiedKFold(n_splits=10)

# Logistic Regression

In [19]:
lr = LogisticRegression()

In [20]:
#fit = lr.fit(X_train, y_train)
#fit.score(X_test, y_test)

In [36]:
score_l =[]

In [48]:
for train_index, test_index in folds.split(X,y):
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
    score_l.append(get_score(LogisticRegression() ,X_train, X_test, y_train, y_test))  
    

In [49]:
score_l

[0.6521739130434783,
 0.6413043478260869,
 0.6413043478260869,
 0.6413043478260869,
 0.6413043478260869,
 0.6483516483516484,
 0.6483516483516484,
 0.6483516483516484,
 0.6483516483516484,
 0.6483516483516484]

# Random Forest

In [22]:
rf=RandomForestClassifier(n_estimators=40)

In [23]:
rf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=40)

In [27]:
rf.score(X_test, y_test)

0.9127272727272727

In [58]:
cross_val_score(RandomForestClassifier(), X, y, cv=10)

array([0.77173913, 0.77173913, 0.76086957, 0.82608696, 0.7826087 ,
       0.79120879, 0.75824176, 0.82417582, 0.8021978 , 0.73626374])