# Imports

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100

import sklearn
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import TomekLinks

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import metrics

from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import confusion_matrix

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [None]:
data = pd.read_csv('../data/final_startup_data.csv', index_col=[0])

# Scaling

Scaling continous variable columns in the dataset

In [None]:
# Creating instance of scaler
scaler = StandardScaler()

# Creating list of columns with continuous variables to be scaled
columns_to_scale = ['funding_total_usd',
                    'seed',
                    'venture',
                    'equity_crowdfunding',
                    'undisclosed',
                    'convertible_note',
                    'debt_financing',
                    'angel',
                    'grant',
                    'private_equity', 
                    'round_A', 
                    'round_B',
                    'days_from_founding_to_funding',
                    'time_between_first_and_last_funding']

# Scaling continuous variable columns
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

# Test Train Split for Modeling

In [None]:
# Splitting dataset into train and test sets
train, test = train_test_split(data, test_size=0.2, random_state = 42)

# Creating X,y for train
X_train = train.drop(columns = 'target')
y_train = train.target

# Creating X,y for test
X_test = test.drop(columns = 'target')
y_test = test.target

# Baseline Model

### Fitting and Predicting

In [None]:
# Creating instance of Logistic Regression for baseline model
base_logreg = LogisticRegression(random_state=42, max_iter= 10**4)

In [None]:
# Fitting baseline model
base_logreg.fit(X_train, y_train)

### Evaluating

#### Train Set

In [5]:
# Predicting on train set
y_base_log_train_preds = base_logreg.predict(X_train)

# Testing target variables prediction against real for training set
print(classification_report(y_train, y_base_log_train_preds))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     12873
           1       0.09      0.01      0.02      1505

    accuracy                           0.89     14378
   macro avg       0.50      0.50      0.48     14378
weighted avg       0.81      0.89      0.84     14378



In [8]:
# Creating confusion matrix for y_train and y_base_preds_train
confusion_matrix(y_train, y_base_log_train_preds)

array([[12711,   162],
       [ 1488,    17]])

#### Test Set

In [None]:
# Predicting on test set
y_base_log_test_preds = base_logreg.predict(X_test)

# Testing target variables prediction against real for test set
print(classification_report(y_test, y_base_log_test_preds))

In [None]:
# Creating confusion matrix for y_test and y_base_preds_test
confusion_matrix(y_test, y_base_log_test_preds)

# Modelling with Downsampling

## Dealing With Class Imbalance

In [9]:
# Subsetting observation with target variable value 1
acquired_down = train[train.target ==1]

# Subsetting observation with target variable value 0
not_acquired_down = train[train.target ==0]

In [10]:
# Downsampling majority class (0) with replacement to be equal to minority class (1) 
not_acquired_downsampled = resample(not_acquired,
                                replace = True, 
                                n_samples = (len(acquired)), 
                                random_state = 23) 

In [11]:
# Creating new downsampled dataset df_down
df_down = pd.concat([acquired_down, not_acquired_downsampled])

In [12]:
# Showing new balanced value counts for target variable
df_down.target.value_counts()

1    1875
0    1875
Name: target, dtype: int64

In [None]:
# Creating new X for downsampled model training
X_down = df_down.drop(columns='target')

# Creating new X for downsampled model training
y_down = df_down.target

## Logistic Regression

### Fitting Model

In [18]:
# Creating new instance of Logistic Regression model
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.01,
                            solver='saga', 
                            penalty='elasticnet',
                            max_iter=2000, 
                            l1_ratio=1.0
                            )


In [19]:
# Fitting downsampled dataset df_down to model for training
logReg.fit(X_down, y_down)

LogisticRegression(C=0.01, class_weight='balanced', l1_ratio=1.0, max_iter=2000,
                   penalty='elasticnet', solver='saga')

### Evaluating Model

#### Train Set

In [20]:
# Using trained model to predict on downsampled train set
y_down_log_train_preds = logReg.predict(X_down)

# Testing target variables prediction against real for train set
print(classification_report(y_down, y_down_log_train_preds))

              precision    recall  f1-score   support

           0       0.60      0.60      0.60      1515
           1       0.59      0.59      0.59      1485

    accuracy                           0.60      3000
   macro avg       0.60      0.60      0.60      3000
weighted avg       0.60      0.60      0.60      3000



In [None]:
# Creating confusion matrix for y_down and y_down_pred_train
confusion_matrix(y_down, y_down_log_train_preds)

#### Testing Set

In [21]:
# Using trained model to predict on downsampled test set
y_down_log_test_preds = logReg.predict(X_test)

# Testing target variables prediction against real for test set
print(classification_report(y_test, y_down_log_test_preds))

              precision    recall  f1-score   support

           0       0.60      0.59      0.60       364
           1       0.62      0.63      0.62       386

    accuracy                           0.61       750
   macro avg       0.61      0.61      0.61       750
weighted avg       0.61      0.61      0.61       750



In [None]:
# Creating confusion matrix for y_test and y_down_pred_test
confusion_matrix(y_test, y_down_log_test_preds)

## GridSearch Logistic Regression

### Fitting Model

In [23]:
# Creating parameter grid for GridSearch
param_grid_down = { 
    'solver': ['saga'],
    'C':[.3, .5],
    'penalty':['elasticnet'],
    'max_iter':[200, 1000],
    'l1_ratio':[.2, 1.0]
               }

In [24]:
# Creating instance of GridSearch for logistic regression including param_grid_down
grid_tree=GridSearchCV(LogisticRegression(), param_grid_down, verbose=1, n_jobs=-1)

In [25]:
# Fitting downsampled training data to GridSearch instance
grid_tree.fit(X_down, y_down)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  7.1min finished


GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.3, 0.5], 'l1_ratio': [0.2, 1.0],
                         'max_iter': [200, 1000], 'penalty': ['elasticnet'],
                         'solver': ['saga']},
             verbose=1)

### Evaluating Model

#### Train Set

In [26]:
# Using trained model to predict on downsampled train set
y_down_grid_train_preds = grid_tree.best_estimator_.predict(X_down)

In [27]:
# Testing target variables prediction against real for train set
print(classification_report(y_down, y_down_grid_train_preds))

              precision    recall  f1-score   support

           0       0.72      0.71      0.72       360
           1       0.74      0.75      0.74       390

    accuracy                           0.73       750
   macro avg       0.73      0.73      0.73       750
weighted avg       0.73      0.73      0.73       750



In [None]:
# Creating confusion matrix for y_down and y_down_grid_train_preds
confusion_matrix(y_down, y_down_grid_train_preds)

#### Test Set

In [None]:
# Using trained model to predict on downsampled test set
y_down_grid_test_preds = grid_tree.best_estimator_.predict(X_down)

In [29]:
# Testing target variables prediction against real for test set
print(classification_report(y_test, y_down_grid_test_preds))

              precision    recall  f1-score   support

           0       0.72      0.71      0.72       360
           1       0.74      0.75      0.74       390

    accuracy                           0.73       750
   macro avg       0.73      0.73      0.73       750
weighted avg       0.73      0.73      0.73       750



In [None]:
# Creating confusion matrix for y_down and y_down_grid_test_preds
confusion_matrix(y_down, y_down_grid_test_preds)

In [None]:
# Setting and testing thresholds to maximize precision score
THRESHOLD_down = 0.3
y_pred_prob = grid_tree.predict_proba(X_test)[:, 1]
y_pred_class = np.where(y_pred_prob > THRESHOLD_down, 1, 0)

# Modelling with Downsampling and Upsampling

## Dealing With Class Imbalance

### Upsampling Acquired

In [33]:
# Subsetting observation with target variable value 1
acquired_up = train[train.target == 1]

# Subsetting observation with target variable value 0
not_acquired_up = train[train.target == 0]

In [34]:
# Upsampling minority class (1) with replacement to be equal to 5000 observations
acquired_upsampled = resample(acquired_up,
                              replace=True, 
                              n_samples=5000, 
                              random_state=42) 

In [35]:
# Creating new upsampled dataset df_up
df_up = pd.concat([not_acquired_up, acquired_upsampled])

### Downsampling Not Acquired

#### Resample

In [36]:
# Subsetting observation with target variable value 0
not_acquired_up_down = upsampled_data[upsampled_data.target == 0]

# Subsetting observation with target variable value 1
acquired_up_down = upsampled_data[upsampled_data.target == 1]

In [37]:
# Downsampling majority class (0) with replacement to be equal to 10000 observations
not_acquired_downsampled = resample(not_acquired_up_down,
                                 replace=True, 
                                 n_samples=10000, 
                                 random_state=42) 

In [38]:
# Creating new upsampled and downsampled dataset df_up_down
df_up_down = pd.concat([acquired_up_down, not_acquired_downsampled])

#### Tomek Links

In [39]:
# Creating X,y from df_up_down to use for Tomek Links
X_up_down = df_up_down.drop('target', axis=1)
y_up_down = df_up_down['target']

In [41]:
# Fitting Tomek Links to df_up_down
X_links, y_links = TomekLinks().fit_resample(X_train, y_train)

## Logistc Regression using GridSearch

### Fitting Model

In [45]:
# Creating new instance of Logistic Regression model
lr_links = LogisticRegression()

In [46]:
# Creating parameter grid for GridSearch
param_grid_links_lr = {
    
    'class_weight': [None,'balanced'],
    'solver': ['liblinear','sag','saga'],
    'max_iter': list(range(100,1000,25)) 
}

In [47]:
# Creating instance of GridSearch for logistic regression including param_grid_links_lr
lr_links_grid = RandomizedSearchCV(lr_links, 
                             param_grid_links_lr, 
                             scoring='precision', 
                             n_jobs=-1, 
                             verbose=2)

In [48]:
# Fitting up and downsampled training data to GridSearch instance
lr_links_grid.fit(X_links, y_links)

LogisticRegression()

### Evaluating Model

#### Train Set

In [None]:
# Using trained model to predict on up and downsampled train set
y_links_grid_train_preds = lr_links_grid.predict(X_links)

In [None]:
# Testing target variables prediction against real for train set
print(classification_report(y_links, y_links_grid_train_preds))

In [None]:
# Creating confusion matrix for y_test and y_links_grid_test_preds
confusion_matrix(y_links, y_links_grid_train_preds)

#### Test Set

In [49]:
# Using trained model to predict on up and downsampled test set
y_links_grid_test_preds = lr_links_grid.predict(X_test)

In [50]:
# Testing target variables prediction against real for test set
print(classification_report(y_test, y_links_grid_test_preds))

              precision    recall  f1-score   support

           0       0.80      0.05      0.09      1954
           1       0.35      0.98      0.52      1046

    accuracy                           0.37      3000
   macro avg       0.58      0.51      0.31      3000
weighted avg       0.65      0.37      0.24      3000



In [None]:
# Creating confusion matrix for y_test and y_links_grid_test_preds
confusion_matrix(y_test, y_links_grid_test_preds)

In [52]:
THRESHOLD = 0.95
thesh_preds = np.where(lr.predict_proba(X_test)[:,1] > THRESHOLD, 1, 0)

## Random Forest 1

### Fitting Model

In [54]:
# Creating new instance of Random Forest model
rf_links = RandomForestClassifier(n_jobs=-1)

In [55]:
# Creating parameter grid for Randomized Search
param_grid_links_rf = {  
    'n_estimators': [100,200,300],
    'criterion': ['gini','entropy'],
    'max_depth': list(range(5,7)),
    'max_features': list(range(100,1000,100)),
    'min_samples_leaf': [1,2,3,4,5]
}

In [56]:
# Creating instance of Randomized Search for Random Forest including param_grid_links_rf
grid_tree_rf = RandomizedSearchCV(rf_links, 
                                  param_grid_links_rf, 
                                  cv=5,
                                  verbose=1,
                                  n_jobs=-1,
                                  scoring='precision_micro'
)

In [57]:
# Fitting up and downsampled training data to GridSearch instance
grid_tree_rf.fit(X_links, y_links)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.6min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [5, 6],
                                        'max_features': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'n_estimators': [100, 200, 300]},
                   scoring='precision_micro', verbose=1)

### Evaluating Model

#### Train Set

In [None]:
# Using trained model to predict on up and downsampled train set
y_links_rand_train_preds = lr_links_grid.predict(X_links)

In [None]:
# Testing target variables prediction against real for train set
print(classification_report(y_links, y_links_rand_train_preds))

In [None]:
# Creating confusion matrix for y_links and y_links_rand_train_preds
confusion_matrix(y_links, y_links_rand_train_preds)

#### Test Set

In [60]:
# Using trained model to predict on up and downsampled test set
y_links_rand_test_preds = lr_links_grid.predict(X_test)

In [61]:
# Testing target variables prediction against real for test set
print(classification_report(y_test, y_links_rand_test_preds))

              precision    recall  f1-score   support

           0       0.66      0.96      0.78      1954
           1       0.49      0.08      0.14      1046

    accuracy                           0.65      3000
   macro avg       0.58      0.52      0.46      3000
weighted avg       0.60      0.65      0.56      3000



In [None]:
# Creating confusion matrix for y_test and y_links_rand_test_preds
confusion_matrix(y_test, y_links_rand_test_preds)

## Random Forest 2

### Fitting Model

In [62]:
# Creating new instance of Random Forest model updated with finding from first iteration
rf_links_2 = RandomForestClassifier(n_jobs=-1, min_samples_leaf = 1, criterion = 'entropy')

In [63]:
# Creating parameter grid for Random Search
param_grid_links_rf_2 = {  
    'n_estimators': list(range(150,800, 50)),
    'max_depth': list(range(3,8)),
    'max_features': list(range(200,600,25)),
}

In [64]:
# Creating instance of Random Search for Random Forest including param_grid_links_rf_2
grid_tree_rf_2 = RandomizedSearchCV(rf_links_2, 
                                  param_grid_links_rf_2, 
                                  cv=10,
                                  verbose=2,
                                  n_jobs=-1,
                                  scoring='precision'
)

In [65]:
# Fitting up and downsampled training data to Random Search instance
grid_tree_rf_2.fit(X_links, y_links)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 15.1min finished


RandomizedSearchCV(cv=10,
                   estimator=RandomForestClassifier(criterion='entropy',
                                                    n_jobs=-1),
                   n_jobs=-1,
                   param_distributions={'max_depth': [3, 4, 5, 6, 7],
                                        'max_features': [200, 225, 250, 275,
                                                         300, 325, 350, 375,
                                                         400, 425, 450, 475,
                                                         500, 525, 550, 575],
                                        'n_estimators': [150, 200, 250, 300,
                                                         350, 400, 450, 500,
                                                         550, 600, 650, 700,
                                                         750]},
                   scoring='precision', verbose=2)

### Evaluating Model

#### Train Set

In [None]:
# Using trained model to predict on up and downsampled train set
y_links_rand_2_train_preds = lr_links_grid.predict(X_links)

In [None]:
# Testing target variables prediction against real for train set
print(classification_report(y_links, y_links_rand_2_train_preds))

In [None]:
# Creating confusion matrix for y_links and y_links_rand_2_train_preds
confusion_matrix(y_links, y_links_rand_2_train_preds)

#### Test Set

In [68]:
# Using trained model to predict on up and downsampled test set
y_links_rand_2_test_preds = lr_links_grid.predict(X_test)

In [69]:
# Testing target variables prediction against real for test set
print(classification_report(y_test, y_links_rand_2_test_preds))

              precision    recall  f1-score   support

           0       0.65      1.00      0.79      1954
           1       0.00      0.00      0.00      1046

    accuracy                           0.65      3000
   macro avg       0.33      0.50      0.39      3000
weighted avg       0.42      0.65      0.51      3000



In [None]:
# Creating confusion matrix for y_test and y_links_rand_2_test_preds
confusion_matrix(y_test, y_links_rand_2_test_preds)