# Imports

In [1]:
import numpy as np
import pandas as pd

import sklearn
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import TomekLinks

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics

from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import TomekLinks

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

# Dataset Prep

In [2]:
data = pd.read_csv('cleaned_dummied_startup_data.csv')
data.drop(['Unnamed: 0', 'founded_at', 'last_funding_at', 'first_funding_at'], axis=1, inplace=True)

In [3]:
status = []

for val in data['status']:
    if val == 'acquired':
        status.append(0)
    elif val == 'operating':
        status.append(1)
    else:
        status.append(2)
data['target'] = status        

data.drop(['status'], axis=1, inplace=True)

In [4]:
training_data = data

In [5]:
training_data = training_data.loc[training_data['funding_total_usd'] != ' -   ']

In [6]:
training_data.funding_total_usd = training_data['funding_total_usd'].astype(int)

# Dealing with Class Imbalance 

In [7]:
data.target.value_counts()

1    19726
0     2309
2     1203
Name: target, dtype: int64

## Upsampling

### Closed

In [8]:
closed = training_data[training_data.target == 2]
acquired_operating = training_data[training_data.target != 2]

In [9]:
closed_upsampled = resample(closed,
                              replace=True, 
                              n_samples=3000, 
                              random_state=42) 

In [10]:
closed_upsampled.shape

(3000, 950)

In [11]:
upsampled_closed = pd.concat([acquired_operating, closed_upsampled])

In [12]:
upsampled_closed.shape

(21939, 950)

In [13]:
upsampled_closed.target.value_counts()

1    16814
2     3000
0     2125
Name: target, dtype: int64

In [14]:
acquired = upsampled_closed[upsampled_closed.target == 0]
closed_operating = upsampled_closed[upsampled_closed.target != 0]

In [15]:
acquired.shape

(2125, 950)

In [16]:
closed_operating.shape

(19814, 950)

In [17]:
acquired_upsampled = resample(acquired,
                              replace=True, 
                              n_samples=3000, 
                              random_state=42) 

In [18]:
acquired_upsampled.shape

(3000, 950)

In [19]:
upsampled_both = pd.concat([closed_operating, acquired_upsampled])

In [20]:
upsampled_both.shape

(22814, 950)

In [21]:
upsampled_both.target.value_counts()

1    16814
2     3000
0     3000
Name: target, dtype: int64

## Downsampling

### Operating

#### Resample

In [22]:
operating = upsampled_both[upsampled_both.target == 1]
closed_acquired = upsampled_both[upsampled_both.target != 1]

In [23]:
operating_downsampled = resample(operating,
                                 replace=True, 
                                 n_samples=5000, 
                                 random_state=42) 

In [24]:
balanced = pd.concat([closed_acquired, operating_downsampled])

#### Test Train Split

In [25]:
y_upsampled = balanced['target']
X_upsampled = balanced.drop('target', axis=1)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_upsampled, y_upsampled, test_size=0.20, random_state=23)



#### Tomek Links

In [27]:
X_down, y_down = TomekLinks().fit_resample(X_train, y_train)

In [28]:
y_down.value_counts()

1    3728
2    2400
0    2205
Name: target, dtype: int64

# Logistic Regression

In [29]:
lr_balanced = LogisticRegression(solver='liblinear')

In [30]:
balanced = lr_balanced.fit(X_down, y_down)

In [31]:
y_pred = balanced.predict(X_test)

In [32]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.50      0.24      0.32       591
           1       0.50      0.89      0.64      1009
           2       0.60      0.12      0.20       600

    accuracy                           0.50      2200
   macro avg       0.53      0.42      0.39      2200
weighted avg       0.53      0.50      0.43      2200



In [33]:
metrics.precision_score(y_test,y_pred, average='micro')

0.5031818181818182

In [34]:
# Random Forest

In [35]:
rf = RandomForestClassifier(n_jobs=-1)

In [52]:
param_grid = {  
    'n_estimators': [100,200,300],
    'criterion': ['gini','entropy'],
    'max_depth': list(range(3,9)),
    'max_features': list(range(100,1000,20)),
    'random_state': [5,10,15,20,42],
    'min_samples_leaf': [1,2,3,4,5]
}

In [53]:
rand_grid_tree = RandomizedSearchCV(rf, 
                                    param_grid, 
                                    cv=5,
                                    verbose=1,
                                    n_jobs=-1
)

In [54]:
rand_grid_tree.fit(X_down, y_down)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.0min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, 4, 5, 6, 7, 8],
                                        'max_features': [100, 120, 140, 160,
                                                         180, 200, 220, 240,
                                                         260, 280, 300, 320,
                                                         340, 360, 380, 400,
                                                         420, 440, 460, 480,
                                                         500, 520, 540, 560,
                                                         580, 600, 620, 640,
                                                         660, 680, ...],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'n_estimators': [100, 200, 300],
          

In [39]:
y_pred_rf = rand_grid_tree.predict(X_test)

In [40]:
metrics.precision_score(y_test, y_pred_rf, average='micro')

0.6040909090909091

# XG Boost

In [50]:
xg_clf = xgb.XGBClassifier(objective ='multi:softmax', 
                           silent=1
                           alpha = 1
            )

SyntaxError: invalid syntax (<ipython-input-50-4110bbf1607c>, line 3)

In [51]:
param_grid_xgb = {'n_estimators': [200,300,400,500,600],
                  'colsample_bytree': [0.4,0.5,0.6],
                  'subsample': [0.4,0.5,0.6],
                  'min_samples_leaf': [1,2,3,4,5],
                  'max_depth' : list(range(3,7))
                  'learning_rate' : list(range(0.1,0.5,0.1))
}

SyntaxError: invalid syntax (<ipython-input-51-36a476b9937b>, line 6)