In [112]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
%matplotlib inline
% matplotlib inline


pd.options.display.max_columns = 50

In [113]:
data = pd.read_csv('training_startup_data.csv')
data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [62]:
training_data = data

In [4]:
base_logreg = LogisticRegression(random_state=42, max_iter= 10**4)

y_baseline = training_data['target']
X_baseline = training_data.drop('target', axis=1)

X_train , X_test, y_train, y_test = train_test_split(X_baseline, y_baseline, test_size = 0.20, random_state=42)


base_logreg.fit(X_train, y_train)

y_log_default_test = base_logreg.predict(X_test)
y_log_default_train = base_logreg.predict(X_train)

residuals1 = np.abs(y_train - y_log_default_train)
print('Training Data:')
print(pd.Series(residuals1).value_counts())
print(pd.Series(residuals1).value_counts(normalize=True))

Training Data:
0    11993
1     2370
2       15
Name: target, dtype: int64
0    0.834122
1    0.164835
2    0.001043
Name: target, dtype: float64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [6]:
trainrpt = print(classification_report(y_train, y_log_default_train))

              precision    recall  f1-score   support

           0       0.10      0.01      0.02      1505
           1       0.84      0.99      0.91     12123
           2       0.16      0.01      0.01       750

    accuracy                           0.83     14378
   macro avg       0.37      0.33      0.31     14378
weighted avg       0.73      0.83      0.77     14378



In [7]:
from sklearn.metrics import confusion_matrix

In [8]:
confusion_matrix(y_train, y_log_default_train)

array([[   15,  1489,     1],
       [  124, 11973,    26],
       [   14,   731,     5]])

# Dealing with class imbalance.

## Downsampling the majority class 

In [63]:
from sklearn.utils import resample

In [64]:
acquired = training_data[training_data.target ==0]
operating = training_data[training_data.target ==1]
closed = training_data[training_data.target ==2]

In [65]:
operating_downsampled = resample(operating,
                                replace = False, # sample without replacement
                                n_samples = (len(acquired)+150), # match minority n
                                random_state = 23) # reproducible results

In [66]:
df = pd.concat([operating_downsampled, acquired, closed])

In [67]:
df.target.value_counts()

1    2025
0    1875
2     920
Name: target, dtype: int64

## Smote

In [68]:
from imblearn.over_sampling import SMOTE

In [69]:
y = df['target']
X = df.drop('target', axis=1)

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

In [71]:
sm = SMOTE(random_state=23)
X_train, y_train = sm.fit_sample(X_train, y_train)

In [72]:
y_train.value_counts()

2    1497
1    1497
0    1497
Name: target, dtype: int64

In [73]:
smote_lr = LogisticRegression(solver='saga')

smote_lr.fit(X_train, y_train)

smote_pred = smote_lr.predict(X_test)



In [74]:
trainrpt = print(classification_report(y_test, smote_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.54      0.55      0.54       459
           1       0.50      0.64      0.56       528
           2       0.17      0.04      0.07       218

    accuracy                           0.50      1205
   macro avg       0.40      0.41      0.39      1205
weighted avg       0.45      0.50      0.46      1205



# Scaling 

In [114]:
from sklearn.preprocessing import StandardScaler

In [115]:
y = df['target']
X = df.drop('target', axis=1)
# feature_cols = X.columns

In [116]:
scaler = StandardScaler()

scaled_X = scaler.fit_transform(X)

In [118]:




X[['funding_total_usd',
   'seed', 
   'equity_crowdfunding', 
   'undisclosed', 
   'convertible_note',
   'debt_financing',
   'angel',
   'grant', 
   'private_equity',
   'days_from_founding_to_funding',
   'time_between_first_and_last_funding', 
   'round_A', 
   'round_B']] = scaler.fit_transform([[
    'funding_total_usd',
   'seed', 
   'equity_crowdfunding', 
   'undisclosed', 
   'convertible_note',
   'debt_financing',
   'angel',
   'grant', 
   'private_equity',
   'days_from_founding_to_funding',
   'time_between_first_and_last_funding', 
   'round_A', 
   'round_B']])

ValueError: could not convert string to float: 'funding_total_usd'

In [122]:
X_train.info().to_list()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4491 entries, 0 to 4490
Columns: 949 entries, funding_total_usd to url_ending_ws
dtypes: float64(11), int64(938)
memory usage: 32.5 MB


AttributeError: 'NoneType' object has no attribute 'to_list'

In [25]:
# import pickle

# scaler_pickle_path = 'scaler_1.pkl'


# scaler_pickle = open(scaler_pickle_path, 'wb')
# pickle.dump(scaler, scaler_pickle)
# scaler_pickle.close()


In [26]:
X_scaled_train , X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(X, y, test_size = 0.20, random_state=42)

## Scaled Logistic regression

In [94]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.01,
                            solver='saga', 
                            penalty='elasticnet',
                            max_iter=2000, 
                            l1_ratio=1.0
                            )


In [95]:
logReg.fit(X_scaled_train, y_scaled_train)



LogisticRegression(C=0.01, class_weight='balanced', l1_ratio=1.0, max_iter=2000,
                   penalty='elasticnet', solver='saga')

In [96]:
y_scaled_train_lr = logReg.predict(X_scaled_train)


print(classification_report(y_scaled_train, y_scaled_train_lr))

              precision    recall  f1-score   support

           0       0.48      0.76      0.59      1517
           1       0.52      0.36      0.42      1609
           2       0.33      0.16      0.21       730

    accuracy                           0.48      3856
   macro avg       0.44      0.43      0.41      3856
weighted avg       0.47      0.48      0.45      3856



In [97]:
y_scaled_test_lr = logReg.predict(X_scaled_test)


print(classification_report(y_scaled_test, y_scaled_test_lr))

              precision    recall  f1-score   support

           0       0.46      0.77      0.58       358
           1       0.50      0.34      0.41       416
           2       0.30      0.14      0.19       190

    accuracy                           0.46       964
   macro avg       0.42      0.42      0.39       964
weighted avg       0.45      0.46      0.43       964



In [98]:
import pickle
model_pickle_path = 'boost_model_bf_1.pkl'


model_pickle = open(model_pickle_path, 'wb')
pickle.dump(logReg, model_pickle)
model_pickle.close()

In [None]:
.feature_names