In [29]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import eli5
import pickle

sns.set()
warnings.filterwarnings('ignore')

In [30]:
from sklearn.feature_selection import mutual_info_regression, SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier, RandomForestRegressor, RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, scale, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_absolute_percentage_error, mean_squared_error, roc_auc_score, log_loss, precision_recall_fscore_support, mean_absolute_error, plot_roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from imblearn.over_sampling import RandomOverSampler, SMOTE
from eli5.sklearn import PermutationImportance
from pprint import pprint

from xgboost import XGBRegressor, XGBClassifier

---

In [31]:
pd.set_option('display.max_columns', None)

---

**loading data + X, y split**

In [32]:
path = '../crunchbase/clean_no_state_int.csv'
df = pd.read_csv(path)

X = df.copy()
y = df[['status', 'isClosed', 'active_days']]
yStatus = X.pop('status')
yClosed = X.pop('isClosed')
yActive = X.pop('active_days')
X.columns

Index(['founded_at', 'investment_rounds', 'first_funding_at',
       'last_funding_at', 'funding_rounds', 'funding_total_usd',
       'first_milestone_at', 'last_milestone_at', 'milestones',
       'relationships', 'lat', 'lng', 'advertising', 'biotech', 'consulting',
       'ecommerce', 'education', 'enterprise', 'games_video', 'hardware',
       'mobile', 'network_hosting', 'other', 'public_relations', 'search',
       'software', 'web', 'AUS', 'CAN', 'DEU', 'ESP', 'FRA', 'GBR', 'IND',
       'ISR', 'NLD', 'OTHER', 'USA'],
      dtype='object')

In [33]:
X = X[[
    'founded_at', 'first_funding_at', 'last_funding_at', 'funding_rounds', 'funding_total_usd', 'first_milestone_at', 'last_milestone_at', 'milestones', 'relationships'
]]
X.columns

Index(['founded_at', 'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'first_milestone_at', 'last_milestone_at',
       'milestones', 'relationships'],
      dtype='object')

In [34]:
closed = yClosed.value_counts(); print(closed)
status = yStatus.value_counts(); print(status)

1    58090
0     5526
Name: isClosed, dtype: int64
0    57478
1     3897
2     1629
3      612
Name: status, dtype: int64


---

**train, test split**

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

yStatus_train = y_train.iloc[:,0]
yClosed_train = y_train.iloc[:,1]
yActive_train = y_train.iloc[:,2]

yStatus_test = y_test.iloc[:,0]
yClosed_test = y_test.iloc[:,1]
yActive_test = y_test.iloc[:,2]

---

## data preprocessing

In [36]:
over = RandomOverSampler(random_state=0)
smote = SMOTE()

stdscaler = StandardScaler()
scaler = MinMaxScaler()

pca = PCA(n_components=2)

---

## classification

**ensemble learning**

In [37]:
qda = QuadraticDiscriminantAnalysis()

rf = RandomForestClassifier(random_state=0)

In [38]:
estimators = [('QDA', qda), ('RandomForest', rf)]
final_estimator = GradientBoostingClassifier()
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

In [39]:
model_ensemble = Pipeline([
    ('stdscaler', stdscaler),
    ('pca', pca),
    ('classifier', clf)
])

In [40]:
model_ensemble.fit(X_train, yStatus_train)
pred = model_ensemble.predict(X_test)
print('test accuracy = ', round(accuracy_score(yStatus_test, pred)*100, 2), '%')

test accuracy =  90.02 %


In [41]:
print(classification_report(yStatus_test, pred, digits=3))

              precision    recall  f1-score   support

           0      0.902     0.998     0.948     11465
           1      0.435     0.013     0.024       797
           2      0.000     0.000     0.000       343
           3      0.259     0.059     0.096       119

    accuracy                          0.900     12724
   macro avg      0.399     0.267     0.267     12724
weighted avg      0.843     0.900     0.856     12724



**QDA**

In [42]:
model_qda = Pipeline([
    ('stdscaler', stdscaler),
    ('pca', pca),
    ('classifier', qda)
])

In [43]:
model_qda.fit(X_train, yClosed_train)
pred = model_qda.predict(X_test)
print('test accuracy = ', round(accuracy_score(yClosed_test, pred)*100, 2), '%')

test accuracy =  89.84 %


In [44]:
print(classification_report(yClosed_test, pred, digits=3))

              precision    recall  f1-score   support

           0      0.310     0.110     0.162      1140
           1      0.918     0.976     0.946     11584

    accuracy                          0.898     12724
   macro avg      0.614     0.543     0.554     12724
weighted avg      0.863     0.898     0.876     12724



**Random Forest**

In [45]:
model_rf = Pipeline([
    ('stdscaler', stdscaler),
    ('pca', pca),
    ('classifier', rf)
])

In [46]:
model_rf.fit(X_train, yStatus_train)
pred = model_rf.predict(X_test)
print('test accuracy = ', round(accuracy_score(yStatus_test, pred)*100, 2), '%')

test accuracy =  88.68 %


In [47]:
print(classification_report(yClosed_test, pred, digits=3))

              precision    recall  f1-score   support

           0      0.082     0.883     0.150      1140
           1      0.648     0.015     0.030     11584
           2      0.000     0.000     0.000         0
           3      0.000     0.000     0.000         0

    accuracy                          0.093     12724
   macro avg      0.183     0.225     0.045     12724
weighted avg      0.597     0.093     0.040     12724



---

## saving model

In [48]:
pickle.dump(model_ensemble, open('../models/ensemble.pkl', 'wb'))
pickle.dump(model_qda, open('../models/qda.pkl', 'wb'))
pickle.dump(model_rf, open('../models/rf.pkl', 'wb'))

In [49]:
# to load the model :
# pickled_model = pickle.load(open('model.pkl', 'rb'))
# pickled_model.predict(X_test)

---