In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
%matplotlib inline
% matplotlib inline


pd.options.display.max_columns = 50

In [2]:
data = pd.read_csv('training_startup_data.csv')
data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [3]:
training_data = data

In [4]:
base_logreg = LogisticRegression(random_state=42, max_iter= 10**4)

y_baseline = training_data['target']
X_baseline = training_data.drop('target', axis=1)

X_train , X_test, y_train, y_test = train_test_split(X_baseline, y_baseline, test_size = 0.20, random_state=42)


base_logreg.fit(X_train, y_train)

y_log_default_test = base_logreg.predict(X_test)
y_log_default_train = base_logreg.predict(X_train)

residuals1 = np.abs(y_train - y_log_default_train)
print('Training Data:')
print(pd.Series(residuals1).value_counts())
print(pd.Series(residuals1).value_counts(normalize=True))

Training Data:
0    12728
1     1650
Name: target, dtype: int64
0    0.885241
1    0.114759
Name: target, dtype: float64


In [5]:
trainrpt = print(classification_report(y_train, y_log_default_train))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     12873
           1       0.09      0.01      0.02      1505

    accuracy                           0.89     14378
   macro avg       0.50      0.50      0.48     14378
weighted avg       0.81      0.89      0.84     14378



In [6]:
from sklearn.metrics import confusion_matrix

In [7]:
confusion_matrix(y_train, y_log_default_train)

array([[12711,   162],
       [ 1488,    17]])

# Dealing with class imbalance.

## Downsampling the majority class 

In [8]:
from sklearn.utils import resample

In [9]:
acquired = training_data[training_data.target ==1]
not_acquired = training_data[training_data.target ==0]

In [10]:
not_acquired_downsampled = resample(not_acquired,
                                replace = False, # sample without replacement
                                n_samples = (len(acquired)), # match minority n
                                random_state = 23) # reproducible results

In [11]:
df = pd.concat([acquired, not_acquired_downsampled])

In [12]:
df.target.value_counts()

1    1875
0    1875
Name: target, dtype: int64

# Scaling 

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
y = df['target']
X = df.drop('target', axis=1)
# feature_cols = X.columns

In [15]:
scaler = StandardScaler()

In [16]:
columns_to_scale = ['funding_total_usd','seed','venture','equity_crowdfunding',
  'undisclosed','convertible_note','debt_financing','angel',
  'grant','private_equity', 'round_A', 'round_B','days_from_founding_to_funding',
  'time_between_first_and_last_funding']

X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])



In [17]:
X

Unnamed: 0,funding_total_usd,seed,venture,equity_crowdfunding,undisclosed,convertible_note,debt_financing,angel,grant,private_equity,round_A,round_B,2.0,3d,accessories,accounting,active,ad,adherence,adults,advanced,adventure,advertising,advice,aerospace,...,url_ending_es,url_ending_fm,url_ending_htm,url_ending_html,url_ending_im,url_ending_in,url_ending_info,url_ending_io,url_ending_is,url_ending_it,url_ending_la,url_ending_ly,url_ending_me,url_ending_mobi,url_ending_net,url_ending_org,url_ending_other,url_ending_php,url_ending_se,url_ending_sh,url_ending_to,url_ending_tv,url_ending_uk,url_ending_us,url_ending_ws
20,-0.028917,-0.332930,0.002793,-0.040828,-0.066568,-0.066983,-0.020124,-0.149496,-0.033859,-0.090996,-0.302956,-0.287038,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
24,0.013630,-0.332930,0.511461,-0.040828,-0.066568,-0.066983,-0.020124,-0.149496,-0.033859,-0.090996,-0.302956,0.325995,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
28,-0.050473,-0.332930,-0.254913,-0.040828,-0.066568,-0.066983,-0.020124,-0.149496,-0.033859,-0.090996,-0.302956,-0.287038,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
38,-0.048663,-0.332930,-0.233281,-0.040828,-0.066568,-0.066983,-0.020124,-0.149496,-0.033859,-0.090996,-0.302956,-0.287038,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
44,-0.044902,-0.332930,-0.188319,-0.040828,-0.066568,-0.066983,-0.020124,-0.149496,-0.033859,-0.090996,-0.302956,0.325995,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11305,-0.048939,-0.332930,-0.236580,-0.040828,-0.066568,-0.066983,-0.020124,-0.149496,-0.033859,-0.090996,0.381736,-0.287038,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
15073,-0.020541,-0.332930,0.102934,-0.040828,-0.066568,-0.066983,-0.020124,-0.149496,-0.033859,-0.090996,-0.302956,-0.287038,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
433,-0.052976,-0.332930,-0.284841,-0.040828,-0.066568,-0.066983,-0.020124,-0.149496,-0.033859,-0.090996,-0.302956,-0.287038,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8191,-0.055398,-0.332930,-0.313797,-0.040828,-0.066568,-0.066983,-0.020124,-0.149496,-0.033859,-0.090996,-0.302956,-0.287038,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
import pickle

scaler_pickle_path = 'scaler_1.pkl'


scaler_pickle = open(scaler_pickle_path, 'wb')
pickle.dump(scaler, scaler_pickle)
scaler_pickle.close()


In [19]:
X_scaled_train , X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(X, y, test_size = 0.20, random_state=42)

## Scaled Logistic regression

In [20]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.01,
                            solver='saga', 
                            penalty='elasticnet',
                            max_iter=2000, 
                            l1_ratio=1.0
                            )


In [21]:
logReg.fit(X_scaled_train, y_scaled_train)

LogisticRegression(C=0.01, class_weight='balanced', l1_ratio=1.0, max_iter=2000,
                   penalty='elasticnet', solver='saga')

In [22]:
y_scaled_train_lr = logReg.predict(X_scaled_train)


print(classification_report(y_scaled_train, y_scaled_train_lr))

              precision    recall  f1-score   support

           0       0.60      0.60      0.60      1515
           1       0.59      0.59      0.59      1485

    accuracy                           0.60      3000
   macro avg       0.60      0.60      0.60      3000
weighted avg       0.60      0.60      0.60      3000



In [23]:
y_scaled_test_lr = logReg.predict(X_scaled_test)


print(classification_report(y_scaled_test_lr,y_scaled_test))

              precision    recall  f1-score   support

           0       0.60      0.59      0.60       364
           1       0.62      0.63      0.62       386

    accuracy                           0.61       750
   macro avg       0.61      0.61      0.61       750
weighted avg       0.61      0.61      0.61       750



In [24]:
import pickle
model_pickle_path = 'boost_model_bf_1.pkl'


model_pickle = open(model_pickle_path, 'wb')
pickle.dump(logReg, model_pickle)
model_pickle.close()

In [25]:
.feature_names

SyntaxError: invalid syntax (<ipython-input-25-02fd381d4edd>, line 1)

# Other Models

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 1, n_estimators=100, max_depth=2, max_features=4)

In [None]:
rfc.fit(X_scaled_train, y_scaled_train)

In [None]:
rfc_preds = rfc.predict(X_scaled_test)

In [None]:
print(classification_report(y_scaled_test,rfc_preds))

In [None]:
import pickle
model_pickle_path = 'boost_model_bf_1.pkl'


model_pickle = open(model_pickle_path, 'wb')
pickle.dump(rfc, model_pickle)
model_pickle.close()

In [None]:
######

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = { 
    'solver': ['saga'],
    'C':[.3, .5],
    'penalty':['elasticnet'],
    'max_iter':[200, 1000],
    'l1_ratio':[.2, 1.0]
               }

In [None]:
grid_tree=GridSearchCV(LogisticRegression(), param_grid, verbose=1, n_jobs=-1)

In [None]:
grid_tree.fit(X_scaled_train, y_scaled_train)

In [None]:
y_pred = grid_tree.best_estimator_.predict(X_scaled_test)

In [None]:
print(classification_report(y_scaled_test,y_pred))

In [None]:
y_pred_prob = grid_tree.predict_proba(X_test)[:, 1]
y_pred_class = np.where(y_pred_prob > 0.3, 1, 0)

In [None]:
print(classification_report(y_scaled_test,y_pred))

In [None]:
import pickle
model_pickle_path = 'boost_model_bf_1.pkl'


model_pickle = open(model_pickle_path, 'wb')
pickle.dump(grid_tree.best_estimator_, model_pickle)
model_pickle.close()

In [None]:
grid_tree.best_params_