# Imports

In [10]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100

import sklearn
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import TomekLinks

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import metrics

from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import TomekLinks

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

# Baseline Model

In [None]:
# Load the training set; the first CSV column is used as the row index.
data = pd.read_csv('training_startup_data.csv', index_col=[0])

In [4]:
# Alias (not a copy): `training_data` and `data` refer to the same DataFrame.
training_data = data

In [5]:
# Baseline: logistic regression on the features exactly as loaded
# (no resampling, no scaling), to have a reference point for later models.
y_baseline = training_data['target']
X_baseline = training_data.drop('target', axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X_baseline,
    y_baseline,
    test_size=0.20,
    random_state=42,
)

base_logreg = LogisticRegression(random_state=42, max_iter=10**4)
base_logreg.fit(X_train, y_train)

y_log_default_test = base_logreg.predict(X_test)
y_log_default_train = base_logreg.predict(X_train)

# Absolute difference between label and prediction on the training split;
# the counts below summarise how often the two agree (0) or differ (non-zero).
residuals1 = np.abs(y_train - y_log_default_train)
residual_series = pd.Series(residuals1)

print('Training Data:')
print(residual_series.value_counts())
print(residual_series.value_counts(normalize=True))

Training Data:
0    12739
1     1639
Name: target, dtype: int64
0    0.886006
1    0.113994
Name: target, dtype: float64


In [6]:
# Keep the report text in `trainrpt` and print it separately:
# print() returns None, so `trainrpt = print(...)` stored None.
trainrpt = classification_report(y_train, y_log_default_train)
print(trainrpt)

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     12873
           1       0.10      0.01      0.02      1505

    accuracy                           0.89     14378
   macro avg       0.50      0.50      0.48     14378
weighted avg       0.81      0.89      0.84     14378



In [8]:
# Confusion matrix of the baseline model on the training split.
# `confusion_matrix` is not imported by name in this notebook, so call it
# through the already-imported `sklearn.metrics` module.
metrics.confusion_matrix(y_train, y_log_default_train)

array([[12722,   151],
       [ 1488,    17]])

# Model Sequence 1

## Dealing With Class Imbalance

### Downsampling Majority Class

In [None]:
# Split the training frame by class label: target == 1 vs target == 0.
is_acquired = training_data['target'] == 1
is_not_acquired = training_data['target'] == 0

acquired = training_data[is_acquired]
not_acquired = training_data[is_not_acquired]

In [None]:
# Draw, without replacement, as many target == 0 rows as there are
# target == 1 rows, so the two classes end up the same size.
not_acquired_downsampled = resample(
    not_acquired,
    n_samples=len(acquired),
    replace=False,
    random_state=23,
)

In [None]:
# Stack the target == 1 rows on top of the downsampled target == 0 rows.
df = pd.concat([acquired, not_acquired_downsampled])

In [None]:
# Sanity check: class counts after downsampling.
df.target.value_counts()

## Scaling

In [None]:
# Separate the feature matrix from the label column of the balanced frame.
X = df.drop(columns='target')
y = df['target']

In [None]:
# Scaler for Model Sequence 1; it is fit in the following cell.
scaler = StandardScaler()

In [None]:
# Columns that are standardized; every other column of X is left untouched.
columns_to_scale = [
    'funding_total_usd',
    'seed',
    'venture',
    'equity_crowdfunding',
    'undisclosed',
    'convertible_note',
    'debt_financing',
    'angel',
    'grant',
    'private_equity',
    'round_A',
    'round_B',
    'days_from_founding_to_funding',
    'time_between_first_and_last_funding',
]

# NOTE(review): the scaler is fit on all of X here, before the train/test
# split performed further down, so test rows influence the fitted statistics.
scaled_values = scaler.fit_transform(X[columns_to_scale])
X[columns_to_scale] = scaled_values

### Pickling Scaler

In [None]:
import pickle

# Persist the fitted Sequence 1 scaler so it can be reloaded later.
scaler_pickle_path = 'scaler_1.pkl'

# The context manager closes the file even if pickle.dump raises.
with open(scaler_pickle_path, 'wb') as scaler_pickle:
    pickle.dump(scaler, scaler_pickle)

## Test Train Split

In [None]:
# 80/20 split of the balanced, scaled frame; fixed seed for repeatability.
X_scaled_train , X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(X, y, test_size = 0.20, random_state=42)

## Logistic Regression

In [None]:
# Elastic-net logistic regression; l1_ratio=1.0 makes the penalty pure L1.
# The 'saga' solver shuffles the data, so the seed is fixed (the same value
# the baseline model uses) to make the fitted, pickled model reproducible.
logReg = LogisticRegression(class_weight='balanced',
                            C=0.01,
                            solver='saga',
                            penalty='elasticnet',
                            max_iter=2000,
                            l1_ratio=1.0,
                            random_state=42
                            )


In [None]:
# Fit on the training portion of the balanced, scaled data.
logReg.fit(X_scaled_train, y_scaled_train)

In [None]:
# Training-split predictions and their per-class metrics.
y_scaled_train_lr = logReg.predict(X_scaled_train)

train_report_lr = classification_report(y_scaled_train, y_scaled_train_lr)
print(train_report_lr)

In [None]:
# Test-split predictions and their per-class metrics.
y_scaled_test_lr = logReg.predict(X_scaled_test)

# classification_report expects (y_true, y_pred); the arguments were swapped,
# which exchanges precision with recall and reports the wrong support.
print(classification_report(y_scaled_test, y_scaled_test_lr))

### Pickling Logistic Regression Model

In [None]:
import pickle

# NOTE(review): the "GridSearch Logistic Regression" pickling cell writes to
# this same path, so this file is overwritten later in the notebook; confirm
# whether the two models should be stored under separate names.
model_pickle_path = 'boost_model_bf_1.pkl'

# The context manager closes the file even if pickle.dump raises.
with open(model_pickle_path, 'wb') as model_pickle:
    pickle.dump(logReg, model_pickle)

## GridSearch Logistic Regression

In [None]:
# Search space for the logistic-regression grid search (2 x 2 x 2 = 8 combos).
param_grid = dict(
    solver=['saga'],
    C=[0.3, 0.5],
    penalty=['elasticnet'],
    max_iter=[200, 1000],
    l1_ratio=[0.2, 1.0],
)

In [None]:
# Exhaustive search over `param_grid`; despite the name `grid_tree`, the
# estimator being tuned is a LogisticRegression. n_jobs=-1 uses all cores.
grid_tree=GridSearchCV(LogisticRegression(), param_grid, verbose=1, n_jobs=-1)

In [None]:
# Run the grid search on the scaled training split.
grid_tree.fit(X_scaled_train, y_scaled_train)

In [None]:
# Class predictions of the best estimator found by the search, on the scaled test split.
y_pred = grid_tree.best_estimator_.predict(X_scaled_test)

In [None]:
# Per-class metrics for the best estimator's default-threshold predictions.
print(classification_report(y_scaled_test,y_pred))

In [None]:
y_pred_prob = grid_tree.predict_proba(X_test)[:, 1]
y_pred_class = np.where(y_pred_prob > 0.3, 1, 0)

In [None]:
print(classification_report(y_scaled_test,y_pred))

### Pickling GridSearch Logistic Regression

In [None]:
import pickle

# NOTE(review): same path as the earlier "Pickling Logistic Regression Model"
# cell, so this write replaces that file; confirm this is intended.
model_pickle_path = 'boost_model_bf_1.pkl'

# The context manager closes the file even if pickle.dump raises.
with open(model_pickle_path, 'wb') as model_pickle:
    pickle.dump(grid_tree.best_estimator_, model_pickle)

# Model Sequence 2

In [None]:
# Reload the same CSV used in the Baseline section as a fresh frame for Model Sequence 2.
data_2 = pd.read_csv('training_startup_data.csv', index_col=[0])

In [None]:
# Features / label of the reloaded frame. This rebinds the `X` and `y` names
# that Model Sequence 1 used for its balanced data.
y = data_2['target']
X = data_2.drop('target', axis=1)

## Dealing With Class Imbalance

### Upsampling Acquired

In [None]:
# Split by class using the frame loaded for this sequence (`data_2`) rather
# than `data`, which is left over from the Baseline section.
acquired = data_2[data_2.target == 1]
not_acquired = data_2[data_2.target != 1]

In [None]:
# Sample the target == 1 rows with replacement up to 5000 rows.
# NOTE(review): this happens before the train/test split made further down,
# so copies of the same original row can land on both sides of that split.
acquired_upsampled = resample(
    acquired,
    n_samples=5000,
    replace=True,
    random_state=42,
)

In [None]:
# Recombine the untouched majority rows with the upsampled minority rows.
upsampled_data = pd.concat([not_acquired, acquired_upsampled])

### Downsampling Not Acquired

#### Resample

In [None]:
# Split the upsampled frame by class again, ahead of reducing the majority class.
target_col = upsampled_data['target']

acquired_down = upsampled_data[target_col == 1]
not_acquired_down = upsampled_data[target_col != 1]

In [None]:
# Reduce the majority class to 10000 rows. Downsampling draws WITHOUT
# replacement (as in Model Sequence 1); replace=True duplicated majority rows,
# and those duplicates could then appear in both the train and test splits.
# resample raises ValueError if fewer than 10000 majority rows are available.
not_acquired_downsampled = resample(not_acquired_down,
                                 replace=False,
                                 n_samples=10000,
                                 random_state=42)

In [None]:
# Resampled frame: upsampled target == 1 rows plus the reduced majority rows.
balanced = pd.concat([acquired_down, not_acquired_downsampled])

#### Tomek Links

##### Test Train Split

In [None]:
# Features / label of the resampled frame.
X_res = balanced.drop(columns='target')
y_res = balanced['target']

In [None]:
# 80/20 split of the resampled data. This rebinds X_train / X_test / y_train /
# y_test, which previously held the Baseline Model split.
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.20, random_state=23)

##### Resample

In [None]:
# Apply Tomek-links under-sampling (default settings) to the training split only;
# the test split is left as produced by train_test_split.
X_bal, y_bal = TomekLinks().fit_resample(X_train, y_train)

## Logistic Regression

### Scaling

In [None]:
# Standardize the numeric columns for Model Sequence 2.
# NOTE: run this cell once per kernel session; re-running refits scaler_2 on
# data that has already been scaled.
scaler_2 = StandardScaler()  # was `StandardScalar`, which raises NameError

X_scale = X_bal  # alias, not a copy: the models below are trained on X_bal

# Fit on the training data only, then apply the same transform to the test data.
X_scale[columns_to_scale] = scaler_2.fit_transform(X_scale[columns_to_scale])

# X_test was previously left unscaled, so every model in this sequence was
# trained on standardized features but evaluated on raw ones.
X_test = X_test.copy()
X_test[columns_to_scale] = scaler_2.transform(X_test[columns_to_scale])

### Grid Search

In [None]:
# Logistic regression with library defaults; used as the search's base estimator.
lr = LogisticRegression()

In [None]:
# Candidate settings sampled by the randomized search:
# class weighting, solver, and iteration cap (100, 125, ..., 975).
param_grid_lr = dict(
    class_weight=[None, 'balanced'],
    solver=['liblinear', 'sag', 'saga'],
    max_iter=list(range(100, 1000, 25)),
)

In [None]:
# Randomized search over `param_grid_lr`, ranking candidates by precision.
# NOTE(review): no random_state is passed, so the sampled candidates may
# differ between runs.
lr_grid = RandomizedSearchCV(
    estimator=lr,
    param_distributions=param_grid_lr,
    scoring='precision',
    verbose=2,
    n_jobs=-1,
)

In [None]:
# Run the randomized search. Previously the bare `lr` was fit here, so
# `lr_grid` was built but never used and no tuning took place.
lr_grid.fit(X_bal, y_bal)

# Point `lr` at the refit best estimator so the following cells
# (lr.predict / lr.predict_proba) evaluate the tuned model.
lr = lr_grid.best_estimator_
lr

In [None]:
# Class predictions of `lr` on the held-out split.
y_pred_lr = lr.predict(X_test)

In [None]:
# Per-class metrics for the logistic-regression predictions.
print(classification_report(y_test,y_pred_lr))

In [None]:
# Precision at the default decision threshold.
metrics.precision_score(y_test,y_pred_lr)

In [None]:
# Only predict class 1 when the model's class-1 probability exceeds THRESHOLD.
THRESHOLD = 0.95

positive_proba = lr.predict_proba(X_test)[:, 1]
thesh_preds = np.where(positive_proba > THRESHOLD, 1, 0)

In [None]:
# Precision of the thresholded predictions.
metrics.precision_score(y_test,thesh_preds)

## Random Forest

### Iteration 1

In [None]:
# Base random forest for the first search; n_jobs=-1 uses all cores.
rf = RandomForestClassifier(n_jobs=-1)

In [None]:
# Search space for the first random-forest search.
# NOTE(review): integer max_features values are absolute feature counts
# (100 .. 900); confirm the training frame has at least that many columns.
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    criterion=['gini', 'entropy'],
    max_depth=list(range(5, 7)),
    max_features=list(range(100, 1000, 100)),
    min_samples_leaf=[1, 2, 3, 4, 5],
)

In [None]:
# Randomized search (5-fold CV) over `param_grid_rf`.
# NOTE(review): this search scores with 'precision_micro' while the Second
# Iteration uses 'precision'; confirm which metric is intended.
grid_tree_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid_rf,
    scoring='precision_micro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the first random-forest search on the Tomek-cleaned training data.
grid_tree_rf.fit(X_bal, y_bal)

In [None]:
# Parameter combination selected by the first search.
grid_tree_rf.best_params_

In [None]:
# Mean cross-validated score of the selected combination.
grid_tree_rf.best_score_

In [None]:
# Predictions of the first search's refit best estimator on the held-out split.
y_pred_rf_1 = grid_tree_rf.predict(X_test)

In [None]:
# Per-class metrics for the first random-forest iteration.
print(classification_report(y_test,y_pred_rf_1))

### Second Iteration

In [None]:
# Second iteration: criterion and min_samples_leaf are fixed here instead of
# being searched; the remaining parameters are tuned below.
rf_2 = RandomForestClassifier(n_jobs=-1, min_samples_leaf = 1, criterion = 'entropy')

In [None]:
# Search space for the second random-forest search.
# NOTE(review): integer max_features values are absolute feature counts
# (200 .. 575); confirm the training frame has at least that many columns.
param_grid_rf_2 = dict(
    n_estimators=list(range(150, 800, 50)),
    max_depth=list(range(3, 8)),
    max_features=list(range(200, 600, 25)),
)

In [None]:
# Randomized search (10-fold CV) over `param_grid_rf_2`, ranked by precision.
grid_tree_rf_2 = RandomizedSearchCV(
    estimator=rf_2,
    param_distributions=param_grid_rf_2,
    scoring='precision',
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the second random-forest search on the Tomek-cleaned training data.
grid_tree_rf_2.fit(X_bal, y_bal)

In [None]:
# Mean cross-validated precision of the selected combination.
grid_tree_rf_2.best_score_

In [None]:
# Parameter combination selected by the second search.
grid_tree_rf_2.best_params_

In [None]:
# Predictions of the second search's refit best estimator on the held-out split.
y_pred_rf_2 = grid_tree_rf_2.predict(X_test)

In [None]:
# Per-class metrics for the second random-forest iteration.
print(classification_report(y_test, y_pred_rf_2))