In [1]:
# import dependencies
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [2]:
# import csvs
train_df = pd.read_csv('Resources/Generator/2019loans.csv')
test_df = pd.read_csv('Resources/Generator/2020Q1loans.csv')

In [3]:
# preview test data
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,24000.0,0.0819,754.18,MORTGAGE,153000.0,Not Verified,n,32.2,1.0,1.0,...,5.6,0.0,0.0,1088284.0,194953.0,141900.0,214684.0,N,N,low_risk
1,7350.0,0.0756,228.84,RENT,54538.0,Source Verified,n,20.44,0.0,1.0,...,0.0,1.0,0.0,79125.0,26233.0,18500.0,32425.0,N,N,low_risk
2,4000.0,0.0881,126.85,MORTGAGE,36578.0,Not Verified,n,4.49,1.0,1.0,...,0.0,0.0,0.0,108500.0,4153.0,21000.0,2500.0,N,N,low_risk
3,15000.0,0.1774,540.34,MORTGAGE,67000.0,Verified,n,23.41,0.0,1.0,...,66.7,0.0,0.0,217993.0,60216.0,12000.0,55822.0,N,N,low_risk
4,5250.0,0.0756,163.46,MORTGAGE,48000.0,Source Verified,n,15.83,0.0,0.0,...,0.0,1.0,0.0,56013.0,31600.0,11800.0,28713.0,N,N,low_risk


In [4]:
# separate target from predictive features
X_train = train_df.drop("target", axis=1).copy()
X_test = test_df.drop("target", axis=1).copy()
y_train = train_df["target"].copy()
y_test = test_df["target"].copy()

In [5]:
# convert categorical data to numeric
X_train_dummies = pd.get_dummies(X_train)
X_train_dummies.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,10000.0,0.1502,346.76,37000.0,29.39,1.0,0.0,9.0,0.0,5478.0,...,0,0,1,0,1,1,0,1,0,1
1,5000.0,0.1033,162.12,30000.0,28.44,0.0,0.0,8.0,0.0,15251.0,...,0,0,1,0,1,1,0,1,0,1
2,21275.0,0.1033,455.5,65000.0,25.81,0.0,0.0,12.0,0.0,18886.0,...,0,0,1,0,1,0,1,1,0,1
3,15950.0,0.2055,427.48,45000.0,33.2,0.0,0.0,10.0,0.0,16318.0,...,0,0,1,0,1,1,0,1,0,1
4,40000.0,0.0819,1256.97,130000.0,6.8,0.0,0.0,6.0,0.0,11669.0,...,1,0,1,0,1,0,1,1,0,1


In [6]:
# separate target feature for training data
X_test_dummies = pd.get_dummies(X_test)
X_test_dummies.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,24000.0,0.0819,754.18,153000.0,32.2,1.0,1.0,35.0,0.0,36233.0,...,0,0,1,0,1,0,1,1,0,1
1,7350.0,0.0756,228.84,54538.0,20.44,0.0,1.0,18.0,1.0,4433.0,...,1,0,1,1,0,1,0,1,0,1
2,4000.0,0.0881,126.85,36578.0,4.49,1.0,1.0,4.0,0.0,3132.0,...,0,0,1,0,1,1,0,1,0,1
3,15000.0,0.1774,540.34,67000.0,23.41,0.0,1.0,9.0,0.0,10388.0,...,0,1,1,0,1,1,0,1,0,1
4,5250.0,0.0756,163.46,48000.0,15.83,0.0,0.0,7.0,1.0,3662.0,...,1,0,1,0,1,1,0,1,0,1


In [7]:
# encode the target column with 1s and 0s (both train and test)
target_encoder = LabelEncoder().fit(y_train)
y_train = target_encoder.transform(y_train)
y_test = target_encoder.transform(y_test)

In [8]:
# Check the shapes to make sure X_train and X_test have the same number of columns (same with y_train/y_test)
# Check that X_train and y_train have the same number of rows (same with X_test/y_test)
print(X_train_dummies.shape)
print(X_test_dummies.shape)
print(y_train.shape)
print(y_test.shape)

(13314, 91)
(8146, 91)
(13314,)
(8146,)


In [9]:
# train the Logistic Regression model on the unscaled data
lr = LogisticRegression()
lr.fit(X_train_dummies, y_train)

# print the model train and test score
print(lr.score(X_test_dummies, y_test))
print(lr.score(X_train_dummies, y_train))

0.5001227596366314
0.6168694607180412


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Random Forest Classifier training model and test score
rf = RandomForestClassifier(n_estimators=200, max_depth=3)
rf.fit(X_train_dummies, y_train)
print(rf.score(X_test_dummies, y_test))
print(rf.score(X_train_dummies, y_train))

0.7014485637122514
0.7199939912873666


In [11]:
# create a scaler based on the X_train data
scaler = StandardScaler().fit(X_train_dummies)

# use the scaler on X_train and X_test
X_train_scaled = scaler.transform(X_train_dummies)
X_test_scaled = scaler.transform(X_test_dummies)

In [12]:
# Logistic Regression training model on scaled data; train/test score
lr.fit(X_train_scaled, y_train)
print(lr.score(X_test_scaled, y_test))
print(lr.score(X_train_scaled, y_train))

0.7533758900073656
0.7125582094036352


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# random Forest Classifier training model on the scaled data and print the model train/test score
rf.fit(X_train_scaled, y_train)
print(rf.score(X_test_scaled, y_test))
print(rf.score(X_train_scaled, y_train))

rf.feature_importances_

0.6922415909648908
0.7181913774973712


array([1.29890105e-02, 1.19357888e-01, 2.12773962e-02, 2.58826164e-03,
       7.18939744e-04, 9.80575333e-05, 2.80883189e-03, 2.46943681e-04,
       0.00000000e+00, 4.16858773e-03, 3.72005909e-04, 8.86433189e-02,
       9.96872992e-02, 1.39379537e-02, 2.24902252e-02, 5.97164033e-02,
       5.97334480e-02, 6.87739680e-02, 0.00000000e+00, 0.00000000e+00,
       2.32766082e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.66015702e-04, 2.33276758e-03, 3.31668886e-03, 1.44528028e-04,
       3.07666752e-04, 9.51036189e-04, 1.63234354e-03, 2.12570616e-04,
       4.54357693e-03, 5.79754925e-03, 6.94495203e-03, 1.35245651e-02,
       3.35844528e-03, 1.29361641e-02, 5.37464391e-04, 2.64162890e-04,
       3.16682732e-03, 1.08820643e-02, 1.66875353e-03, 1.64330672e-02,
       4.99919824e-04, 0.00000000e+00, 0.00000000e+00, 1.80774954e-03,
       1.21522339e-02, 3.94835477e-03, 1.41109238e-03, 4.85091894e-03,
       3.70421048e-03, 5.02103564e-03, 6.55333320e-04, 4.19781499e-04,
      

In [14]:
# fit a model, and then print a classification report
target_names = ["low_risk","high_risk"]

clf = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

    low_risk       0.66      0.71      0.68      4073
   high_risk       0.69      0.63      0.66      4073

    accuracy                           0.67      8146
   macro avg       0.67      0.67      0.67      8146
weighted avg       0.67      0.67      0.67      8146

Training Score: 1.0
Testing Score: 0.6721090105573287


In [15]:
# fit Extremely Random Trees classifier

clf = ExtraTreesClassifier(random_state=1).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

    low_risk       0.63      0.57      0.60      4073
   high_risk       0.61      0.66      0.63      4073

    accuracy                           0.62      8146
   macro avg       0.62      0.62      0.62      8146
weighted avg       0.62      0.62      0.62      8146

Training Score: 1.0
Testing Score: 0.6161306162533758


In [16]:
# let see what this one does
clf = AdaBoostClassifier(random_state=1).fit(X_train_scaled, y_train)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

    low_risk       0.63      0.57      0.60      4073
   high_risk       0.61      0.66      0.63      4073

    accuracy                           0.62      8146
   macro avg       0.62      0.62      0.62      8146
weighted avg       0.62      0.62      0.62      8146

Training Score: 0.7541685443893645
Testing Score: 0.6315983304689418


In [17]:
# initialize PCA model and fit to scaled training set
pca = PCA(.95)
pca.fit(X_train_scaled)

PCA(n_components=0.95)

In [18]:
# get four pcs for the  data
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [19]:
# apply Logistic Regression to the Transformed Data
lr = LogisticRegression()
lr.fit(X_train_pca, y_train)
print(f'Testing Score: {lr.score(X_test_pca, y_test)}')
print(f'Training Score: {lr.score(X_train_pca, y_train)}')

Testing Score: 0.6241099926344218
Training Score: 0.6603575184016824
