# Imports

In [7]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100

import sklearn
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import TomekLinks

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import metrics

from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import confusion_matrix

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

# Baseline Model

In [2]:
data = pd.read_csv('training_startup_data.csv', index_col=[0])

In [3]:
training_data = data

In [4]:
base_logreg = LogisticRegression(random_state=42, max_iter= 10**4)

y_baseline = training_data['target']
X_baseline = training_data.drop('target', axis=1)

X_train , X_test, y_train, y_test = train_test_split(X_baseline, y_baseline, test_size = 0.20, random_state=42)


base_logreg.fit(X_train, y_train)

y_log_default_test = base_logreg.predict(X_test)
y_log_default_train = base_logreg.predict(X_train)

residuals1 = np.abs(y_train - y_log_default_train)
print('Training Data:')
print(pd.Series(residuals1).value_counts())
print(pd.Series(residuals1).value_counts(normalize=True))

Training Data:
0    12728
1     1650
Name: target, dtype: int64
0    0.885241
1    0.114759
Name: target, dtype: float64


In [5]:
trainrpt = print(classification_report(y_train, y_log_default_train))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     12873
           1       0.09      0.01      0.02      1505

    accuracy                           0.89     14378
   macro avg       0.50      0.50      0.48     14378
weighted avg       0.81      0.89      0.84     14378



In [8]:
confusion_matrix(y_train, y_log_default_train)

array([[12711,   162],
       [ 1488,    17]])

# Model Sequence 1

## Dealing With Class Imbalance

### Downsampling Majority Class

In [9]:
acquired = training_data[training_data.target ==1]
not_acquired = training_data[training_data.target ==0]

In [10]:
not_acquired_downsampled = resample(not_acquired,
                                replace = False, 
                                n_samples = (len(acquired)), 
                                random_state = 23) 

In [11]:
df = pd.concat([acquired, not_acquired_downsampled])

In [12]:
df.target.value_counts()

1    1875
0    1875
Name: target, dtype: int64

## Scaling

In [13]:
y = df['target']
X = df.drop('target', axis=1)

In [14]:
scaler = StandardScaler()

In [15]:
columns_to_scale = ['funding_total_usd',
                    'seed',
                    'venture',
                    'equity_crowdfunding',
                    'undisclosed',
                    'convertible_note',
                    'debt_financing',
                    'angel',
                    'grant',
                    'private_equity', 
                    'round_A', 
                    'round_B',
                    'days_from_founding_to_funding',
                    'time_between_first_and_last_funding']

X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

### Pickling Scalar

In [16]:
import pickle

scaler_pickle_path = 'scaler_1.pkl'


scaler_pickle = open(scaler_pickle_path, 'wb')
pickle.dump(scaler, scaler_pickle)
scaler_pickle.close()

## Test Train Split

In [17]:
X_scaled_train , X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(X, y, test_size = 0.20, random_state=42)

## Logistic Regression

In [18]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.01,
                            solver='saga', 
                            penalty='elasticnet',
                            max_iter=2000, 
                            l1_ratio=1.0
                            )


In [19]:
logReg.fit(X_scaled_train, y_scaled_train)

LogisticRegression(C=0.01, class_weight='balanced', l1_ratio=1.0, max_iter=2000,
                   penalty='elasticnet', solver='saga')

In [20]:
y_scaled_train_lr = logReg.predict(X_scaled_train)


print(classification_report(y_scaled_train, y_scaled_train_lr))

              precision    recall  f1-score   support

           0       0.60      0.60      0.60      1515
           1       0.59      0.59      0.59      1485

    accuracy                           0.60      3000
   macro avg       0.60      0.60      0.60      3000
weighted avg       0.60      0.60      0.60      3000



In [21]:
y_scaled_test_lr = logReg.predict(X_scaled_test)


print(classification_report(y_scaled_test_lr,y_scaled_test))

              precision    recall  f1-score   support

           0       0.60      0.59      0.60       364
           1       0.62      0.63      0.62       386

    accuracy                           0.61       750
   macro avg       0.61      0.61      0.61       750
weighted avg       0.61      0.61      0.61       750



### Pickling Logistic Regression Model

In [22]:
import pickle
model_pickle_path = 'boost_model_bf_1.pkl'


model_pickle = open(model_pickle_path, 'wb')
pickle.dump(logReg, model_pickle)
model_pickle.close()

## GridSearch Logistic Regression

In [23]:
param_grid = { 
    'solver': ['saga'],
    'C':[.3, .5],
    'penalty':['elasticnet'],
    'max_iter':[200, 1000],
    'l1_ratio':[.2, 1.0]
               }

In [24]:
grid_tree=GridSearchCV(LogisticRegression(), param_grid, verbose=1, n_jobs=-1)

In [25]:
grid_tree.fit(X_scaled_train, y_scaled_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  7.1min finished


GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.3, 0.5], 'l1_ratio': [0.2, 1.0],
                         'max_iter': [200, 1000], 'penalty': ['elasticnet'],
                         'solver': ['saga']},
             verbose=1)

In [26]:
y_pred = grid_tree.best_estimator_.predict(X_scaled_test)

In [27]:
print(classification_report(y_scaled_test,y_pred))

              precision    recall  f1-score   support

           0       0.72      0.71      0.72       360
           1       0.74      0.75      0.74       390

    accuracy                           0.73       750
   macro avg       0.73      0.73      0.73       750
weighted avg       0.73      0.73      0.73       750



In [28]:
y_pred_prob = grid_tree.predict_proba(X_test)[:, 1]
y_pred_class = np.where(y_pred_prob > 0.3, 1, 0)

In [29]:
print(classification_report(y_scaled_test,y_pred))

              precision    recall  f1-score   support

           0       0.72      0.71      0.72       360
           1       0.74      0.75      0.74       390

    accuracy                           0.73       750
   macro avg       0.73      0.73      0.73       750
weighted avg       0.73      0.73      0.73       750



### Pickling GridSearch Logistic Regression

In [30]:
import pickle
model_pickle_path = 'boost_model_bf_1.pkl'


model_pickle = open(model_pickle_path, 'wb')
pickle.dump(grid_tree.best_estimator_, model_pickle)
model_pickle.close()

# Model Sequence 2

In [31]:
data_2 = pd.read_csv('training_startup_data.csv', index_col=[0])

In [32]:
X = data_2.drop(columns = 'target')
y = data_2.target

## Dealing With Class Imbalance

### Upsampling Acquired

In [33]:
acquired = data[data.target == 1]
not_acquired = data[data.target != 1]

In [34]:
acquired_upsampled = resample(acquired,
                              replace=True, 
                              n_samples=5000, 
                              random_state=42) 

In [35]:
upsampled_data = pd.concat([not_acquired, acquired_upsampled])

### Downsampling Not Acquired

#### Resample

In [36]:
not_acquired_down = upsampled_data[upsampled_data.target != 1]
acquired_down = upsampled_data[upsampled_data.target == 1]

In [37]:
not_acquired_downsampled = resample(not_acquired_down,
                                 replace=True, 
                                 n_samples=10000, 
                                 random_state=42) 

In [38]:
balanced = pd.concat([acquired_down, not_acquired_downsampled])

#### Tomek Links

##### Test Train Split

In [39]:
y_res = balanced['target']
X_res = balanced.drop('target', axis=1)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.20, random_state=23)

##### Resample

In [41]:
X_bal, y_bal = TomekLinks().fit_resample(X_train, y_train)

## Logistc Regression

### Scaling

In [44]:
scaler_2 = StandardScaler()

X_scale = X_bal

X_scale[columns_to_scale] = scaler_2.fit_transform(X_scale[columns_to_scale])

### Grid Search

In [45]:
lr = LogisticRegression()

In [46]:
param_grid_lr = {
    
    'class_weight': [None,'balanced'],
    'solver': ['liblinear','sag','saga'],
    'max_iter': list(range(100,1000,25)) 
}

In [47]:
lr_grid = RandomizedSearchCV(lr, 
                             param_grid_lr, 
                             scoring='precision', 
                             n_jobs=-1, 
                             verbose=2)

In [48]:
lr.fit(X_bal, y_bal)

LogisticRegression()

In [49]:
y_pred_lr = lr.predict(X_test)

In [50]:
print(classification_report(y_test,y_pred_lr))

              precision    recall  f1-score   support

           0       0.80      0.05      0.09      1954
           1       0.35      0.98      0.52      1046

    accuracy                           0.37      3000
   macro avg       0.58      0.51      0.31      3000
weighted avg       0.65      0.37      0.24      3000



In [51]:
metrics.precision_score(y_test,y_pred_lr)

0.3549843695727683

In [52]:
THRESHOLD = 0.95
thesh_preds = np.where(lr.predict_proba(X_test)[:,1] > THRESHOLD, 1, 0)

In [53]:
metrics.precision_score(y_test,thesh_preds)

0.3549843695727683

## Random Forest

### Iteration 1

In [54]:
rf = RandomForestClassifier(n_jobs=-1)

In [55]:
param_grid_rf = {  
    'n_estimators': [100,200,300],
    'criterion': ['gini','entropy'],
    'max_depth': list(range(5,7)),
    'max_features': list(range(100,1000,100)),
    'min_samples_leaf': [1,2,3,4,5]
}

In [56]:
grid_tree_rf = RandomizedSearchCV(rf, 
                                  param_grid_rf, 
                                  cv=5,
                                  verbose=1,
                                  n_jobs=-1,
                                  scoring='precision_micro'
)

In [57]:
grid_tree_rf.fit(X_bal, y_bal)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.6min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [5, 6],
                                        'max_features': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'n_estimators': [100, 200, 300]},
                   scoring='precision_micro', verbose=1)

In [58]:
grid_tree_rf.best_params_

{'n_estimators': 200,
 'min_samples_leaf': 3,
 'max_features': 500,
 'max_depth': 6,
 'criterion': 'gini'}

In [59]:
grid_tree_rf.best_score_

0.7386475055835606

In [60]:
y_pred_rf_1 = grid_tree_rf.predict(X_test)

In [61]:
print(classification_report(y_test,y_pred_rf_1))

              precision    recall  f1-score   support

           0       0.66      0.96      0.78      1954
           1       0.49      0.08      0.14      1046

    accuracy                           0.65      3000
   macro avg       0.58      0.52      0.46      3000
weighted avg       0.60      0.65      0.56      3000



### Second Iteration

In [62]:
rf_2 = RandomForestClassifier(n_jobs=-1, min_samples_leaf = 1, criterion = 'entropy')

In [63]:
param_grid_rf_2 = {  
    'n_estimators': list(range(150,800, 50)),
    'max_depth': list(range(3,8)),
    'max_features': list(range(200,600,25)),
}

In [64]:
grid_tree_rf_2 = RandomizedSearchCV(rf_2, 
                                  param_grid_rf_2, 
                                  cv=10,
                                  verbose=2,
                                  n_jobs=-1,
                                  scoring='precision'
)

In [65]:
grid_tree_rf_2.fit(X_bal, y_bal)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 15.1min finished


RandomizedSearchCV(cv=10,
                   estimator=RandomForestClassifier(criterion='entropy',
                                                    n_jobs=-1),
                   n_jobs=-1,
                   param_distributions={'max_depth': [3, 4, 5, 6, 7],
                                        'max_features': [200, 225, 250, 275,
                                                         300, 325, 350, 375,
                                                         400, 425, 450, 475,
                                                         500, 525, 550, 575],
                                        'n_estimators': [150, 200, 250, 300,
                                                         350, 400, 450, 500,
                                                         550, 600, 650, 700,
                                                         750]},
                   scoring='precision', verbose=2)

In [66]:
grid_tree_rf_2.best_score_

0.793849364607973

In [67]:
grid_tree_rf_2.best_params_

{'n_estimators': 650, 'max_features': 350, 'max_depth': 3}

In [68]:
y_pred_rf_2 = grid_tree_rf_2.predict(X_test)

In [69]:
print(classification_report(y_test, y_pred_rf_2))

              precision    recall  f1-score   support

           0       0.65      1.00      0.79      1954
           1       0.00      0.00      0.00      1046

    accuracy                           0.65      3000
   macro avg       0.33      0.50      0.39      3000
weighted avg       0.42      0.65      0.51      3000



In [70]:
training_data

Unnamed: 0,funding_total_usd,seed,venture,equity_crowdfunding,undisclosed,convertible_note,debt_financing,angel,grant,private_equity,round_A,round_B,2.0,3d,accessories,accounting,active,ad,adherence,adults,advanced,adventure,advertising,advice,aerospace,agent,agriculture,algorithms,all,alumni,analyt,analytics,and,android,angels,animal,apis,app,application,applications,apps,aquaculture,architecture,archiving,art,artificial,artists,assessment,asset,assisitive,...,founded_year_2000,founded_year_2001,founded_year_2002,founded_year_2003,founded_year_2004,founded_year_2005,founded_year_2006,founded_year_2007,founded_year_2008,founded_year_2009,founded_year_2010,founded_year_2011,founded_year_2012,founded_year_2013,founded_year_2014,url_ending_aspx,url_ending_au,url_ending_biz,url_ending_ca,url_ending_cc,url_ending_co,url_ending_com,url_ending_de,url_ending_edu,url_ending_es,url_ending_fm,url_ending_htm,url_ending_html,url_ending_im,url_ending_in,url_ending_info,url_ending_io,url_ending_is,url_ending_it,url_ending_la,url_ending_ly,url_ending_me,url_ending_mobi,url_ending_net,url_ending_org,url_ending_other,url_ending_php,url_ending_se,url_ending_sh,url_ending_to,url_ending_tv,url_ending_uk,url_ending_us,url_ending_ws,target
15425,50000,0.0,0.0,0.0,50000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11489,2535300,1835300.0,700000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,700000.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19413,1500000,1500000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3114,4000000,0.0,4000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4689,1125000,0.0,0.0,0.0,0.0,0.0,1125000.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13201,20500000,0.0,20500000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13979,2899199,2000000.0,0.0,0.0,0.0,0.0,899199.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6248,120000,120000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
990,60500002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60500002.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [72]:
training_data.columns.to_list()

['funding_total_usd',
 'seed',
 'venture',
 'equity_crowdfunding',
 'undisclosed',
 'convertible_note',
 'debt_financing',
 'angel',
 'grant',
 'private_equity',
 'round_A',
 'round_B',
 '2.0',
 '3d',
 'accessories',
 'accounting',
 'active',
 'ad',
 'adherence',
 'adults',
 'advanced',
 'adventure',
 'advertising',
 'advice',
 'aerospace',
 'agent',
 'agriculture',
 'algorithms',
 'all',
 'alumni',
 'analyt',
 'analytics',
 'and',
 'android',
 'angels',
 'animal',
 'apis',
 'app',
 'application',
 'applications',
 'apps',
 'aquaculture',
 'architecture',
 'archiving',
 'art',
 'artificial',
 'artists',
 'assessment',
 'asset',
 'assisitive',
 'auctions',
 'audio',
 'augmented',
 'auto',
 'automated',
 'automation',
 'automotive',
 'b2b',
 'babies',
 'baby',
 'bananas',
 'banking',
 'based',
 'batteries',
 'beauty',
 'beer',
 'behavior',
 'benefits',
 'bicycles',
 'big',
 'billing',
 'bio-pharm',
 'bioinformatics',
 'biology',
 'biometrics',
 'biotechnology',
 'bitcoin',
 'blogging',
 