In [83]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
%matplotlib inline
% matplotlib inline


pd.options.display.max_columns = 50

In [84]:
data = pd.read_csv('cleaned_dummied_startup_data.csv')
data.drop(['Unnamed: 0', 'founded_at', 'last_funding_at', 'first_funding_at'], axis=1, inplace=True)

In [85]:
status = []

for val in data['status']:
    if val == 'acquired':
        status.append(0)
    elif val == 'operating':
        status.append(1)
    else:
        status.append(2)
data['target'] = status        

data.drop(['status'], axis=1, inplace=True)

In [86]:
training_data = data

In [87]:
training_data = training_data.loc[training_data['funding_total_usd'] != ' -   ']

In [88]:
training_data.funding_total_usd = training_data['funding_total_usd'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [100]:
base_logreg = LogisticRegression(random_state=42, max_iter= 10**4)

y_baseline = training_data['target']
X_baseline = training_data.drop('target', axis=1)

X_train , X_test, y_train, y_test = train_test_split(X_baseline, y_baseline, test_size = 0.20, random_state=42)


base_logreg.fit(X_train, y_train)

y_log_default_test = base_logreg.predict(X_test)
y_log_default_train = base_logreg.predict(X_train)

residuals1 = np.abs(y_train - y_log_default_train)
print('Training Data:')
print(pd.Series(residuals1).value_counts())
print(pd.Series(residuals1).value_counts(normalize=True))

Training Data:
0    13356
1     2603
2       17
Name: target, dtype: int64
0    0.836004
1    0.162932
2    0.001064
Name: target, dtype: float64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [101]:
trainrpt = print(classification_report(y_train, y_log_default_train))

              precision    recall  f1-score   support

           0       0.10      0.01      0.02      1671
           1       0.85      0.99      0.91     13504
           2       0.09      0.00      0.00       801

    accuracy                           0.84     15976
   macro avg       0.34      0.33      0.31     15976
weighted avg       0.73      0.84      0.77     15976



In [102]:
from sklearn.metrics import confusion_matrix

In [103]:
confusion_matrix(y_train, y_log_default_train)

array([[   18,  1652,     1],
       [  149, 13336,    19],
       [   16,   783,     2]])

# Dealing with class imbalance.

## Downsampling the majority class 

In [121]:
from sklearn.utils import resample

In [123]:
acquired = training_data[training_data.target ==0]
operating = training_data[training_data.target ==1]
closed = training_data[training_data.target ==2]

In [127]:
operating_downsampled = resample(operating,
                                replace = False, # sample without replacement
                                n_samples = (len(acquired)+150), # match minority n
                                random_state = 23) # reproducible results

In [130]:
df = pd.concat([operating_downsampled, acquired, closed])

In [132]:
df.target.value_counts()

1    2275
0    2125
2    1031
Name: target, dtype: int64

## Smote

In [133]:
from imblearn.over_sampling import SMOTE

In [134]:
y = df['target']
X = df.drop('target', axis=1)

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

In [136]:
sm = SMOTE(random_state=23)
X_train, y_train = sm.fit_sample(X_train, y_train)

In [137]:
y_train.value_counts()

2    1695
1    1695
0    1695
Name: target, dtype: int64

In [142]:
smote_lr = LogisticRegression(solver='saga')

smote_lr.fit(X_train, y_train)

smote_pred = smote_lr.predict(X_test)



In [145]:
trainrpt = print(classification_report(y_test, smote_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.45      0.84      0.59       530
           1       0.48      0.31      0.38       580
           2       0.00      0.00      0.00       248

    accuracy                           0.46      1358
   macro avg       0.31      0.38      0.32      1358
weighted avg       0.38      0.46      0.39      1358



# Scaling 

In [146]:
from sklearn.preprocessing import StandardScaler

In [147]:
y = df['target']
X = df.drop('target', axis=1)

In [148]:
scaler = StandardScaler()

scaled_X = scaler.fit_transform(X)

In [149]:
X_scaled_train , X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(scaled_X, y, test_size = 0.20, random_state=42)

In [154]:
feature_cols = X.columns

## Scaled Logistic regression

In [150]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.01,
                            solver='saga', 
                            penalty='elasticnet',
                            max_iter=1000, 
                            l1_ratio=1.0
                            )


In [151]:
logReg.fit(X_scaled_train, y_scaled_train)

LogisticRegression(C=0.01, class_weight='balanced', l1_ratio=1.0, max_iter=1000,
                   penalty='elasticnet', solver='saga')

In [152]:
y_scaled_train_lr = logReg.predict(X_scaled_train)


print(classification_report(y_scaled_train, y_scaled_train_lr))

              precision    recall  f1-score   support

           0       0.58      0.55      0.56      1724
           1       0.71      0.43      0.53      1800
           2       0.34      0.67      0.45       820

    accuracy                           0.52      4344
   macro avg       0.54      0.55      0.52      4344
weighted avg       0.59      0.52      0.53      4344



In [171]:
y_scaled_test_lr = logReg.predict(X_scaled_test)


print(classification_report(y_scaled_test, y_scaled_test_lr))

              precision    recall  f1-score   support

           0       0.54      0.52      0.53       401
           1       0.71      0.44      0.54       475
           2       0.33      0.62      0.43       211

    accuracy                           0.51      1087
   macro avg       0.52      0.53      0.50      1087
weighted avg       0.57      0.51      0.52      1087



# XGBoost

In [153]:
import xgboost as xgb

In [156]:
xgb.XGBClassifier()

XGBClassifier()

In [190]:
xg_clf = xgb.XGBClassifier(objective ='multi:softmax', 
                           colsample_bytree = 0.5, 
                           subsample = 0.5,
                           learning_rate = 0.1,
                           max_depth = 4, 
                           alpha = 1,
                           silent=1,
                           #scale_pos_weight= titanic['Survived'].mean(),
                           n_estimators = 500)

In [191]:
xg_clf.fit(X_scaled_train, y_scaled_train)

XGBClassifier(alpha=1, colsample_bytree=0.5, max_depth=4, n_estimators=500,
              objective='multi:softprob', silent=1, subsample=0.5)

In [192]:
train_preds = xg_clf.predict(X_scaled_train)

In [193]:
print(classification_report(y_scaled_train, train_preds))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1724
           1       0.88      0.87      0.87      1800
           2       0.83      0.72      0.77       820

    accuracy                           0.85      4344
   macro avg       0.85      0.83      0.84      4344
weighted avg       0.85      0.85      0.85      4344



In [194]:
test_preds = xg_clf.predict(X_scaled_test)

In [195]:
print(classification_report(y_scaled_test, test_preds))

              precision    recall  f1-score   support

           0       0.61      0.68      0.64       401
           1       0.68      0.70      0.69       475
           2       0.52      0.36      0.42       211

    accuracy                           0.63      1087
   macro avg       0.60      0.58      0.59      1087
weighted avg       0.62      0.63      0.62      1087



In [197]:
import graphviz
from xgboost import plot_tree

plot_tree(xg_clf)

ModuleNotFoundError: No module named 'graphviz'

In [None]:
!conda install graphviz

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: done


  current version: 4.8.5
  latest version: 4.9.1

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /Users/brendanferris/opt/anaconda3/envs/learn-env

  added / updated specs:
    - graphviz


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.10.14 |                0         121 KB
    cairo-1.14.12              |       hc4e6be7_4         860 KB
    certifi-2020.6.20          |     pyhd3eb1b0_3         155 KB
    fontconfig-2.13.0          |       h5d5b041_1         202 KB


In [None]:
!y