In [212]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder


In [213]:
train_df = pd.read_csv(Path('../Resources/2019loans.csv'))
test_df = pd.read_csv(Path('../Resources/2020Q1loans.csv'))
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,40000.0,0.1033,856.4,RENT,128700.0,Source Verified,n,12.47,0.0,1.0,...,57.1,0.0,0.0,63915.0,49510.0,49400.0,14515.0,Y,N,low_risk
1,24450.0,0.143,572.72,MORTGAGE,44574.0,Not Verified,n,15.05,0.0,1.0,...,0.0,0.0,0.0,136425.0,19439.0,15500.0,18925.0,N,N,low_risk
2,13500.0,0.143,316.23,OWN,60000.0,Not Verified,n,28.72,0.0,0.0,...,0.0,0.0,0.0,82124.0,65000.0,5400.0,61724.0,Y,N,low_risk
3,10625.0,0.1774,268.31,RENT,60000.0,Verified,n,15.7,0.0,4.0,...,20.0,0.0,0.0,54855.0,50335.0,23200.0,26255.0,N,N,low_risk
4,6375.0,0.1862,232.46,RENT,60000.0,Source Verified,n,35.5,0.0,0.0,...,75.0,0.0,0.0,90445.0,56541.0,15300.0,72345.0,N,N,low_risk


In [214]:
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk


In [215]:
y_train = train_df['target']
y_test = test_df['target']

In [216]:
# Convert categorical data to numeric and separate target feature for training data
# add missing dummy variables to training set
##stackoverflow https://stackoverflow.com/questions/41335718/keep-same-dummy-variable-in-training-and-testing-data

X_train = pd.get_dummies(train_df.drop('target', axis=1))
X_train = X_train.reindex(columns = X_train.columns, fill_value=0)
X_train.head(10)

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,0,1,1,0,1,0,1,0,1,0
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,0,1,1,0,1,0,1,0
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,1,1,0,1,1,0,1,0,1,0
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,0,1,1,0,1,0
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,0,1,1,0,1,0,1,0
5,6000.0,0.0756,186.81,90500.0,23.48,0.0,0.0,9.0,0.0,7092.0,...,0,1,0,1,1,0,1,0,1,0
6,10500.0,0.225,293.0,32000.0,16.09,0.0,1.0,10.0,0.0,2269.0,...,1,1,0,1,1,0,1,0,1,0
7,22000.0,0.0819,448.09,92000.0,22.87,0.0,0.0,17.0,0.0,26020.0,...,0,1,0,1,1,0,1,0,1,0
8,10625.0,0.2055,284.76,54100.0,15.53,0.0,2.0,8.0,0.0,9371.0,...,0,1,0,1,0,1,1,0,1,0
9,20000.0,0.2534,591.02,38000.0,19.68,0.0,0.0,4.0,0.0,20770.0,...,0,1,0,1,1,0,1,0,1,0


In [217]:
# Convert categorical data to numeric and separate target feature for testing data
# add missing dummy variables to testing set
X_test = pd.get_dummies(test_df.drop('target', axis=1))
X_test = X_test.reindex(columns = X_test.columns, fill_value=0)
X_test.head(10)



Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,0,1,1,0,0,1,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,0,1,0,1,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,0,1,0,1,1,0,0,1,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0,1,1,0,1,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,0,1,1,0,1,0,1
5,15000.0,0.1102,491.23,45000.0,66.32,0.0,0.0,8.0,0.0,6647.0,...,1,0,1,0,1,0,1,1,0,1
6,12000.0,0.0702,370.64,81500.0,16.61,0.0,0.0,13.0,1.0,10395.0,...,1,0,1,0,1,1,0,1,0,1
7,4500.0,0.1308,151.8,95000.0,24.36,0.0,0.0,21.0,0.0,34540.0,...,1,0,1,0,1,1,0,1,0,1
8,8000.0,0.1033,259.38,50000.0,13.18,0.0,0.0,4.0,0.0,14519.0,...,0,0,1,0,1,1,0,1,0,1
9,40000.0,0.0646,1225.24,140000.0,16.38,0.0,0.0,21.0,0.0,49666.0,...,1,0,1,0,1,1,0,1,0,1


In [218]:
# Converting output labels to 0 and 1
y_label = LabelEncoder().fit_transform(test_df['target'])
y_label

array([1, 1, 1, ..., 0, 0, 0])

In [219]:
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,40000.0,0.1033,856.4,RENT,128700.0,Source Verified,n,12.47,0.0,1.0,...,57.1,0.0,0.0,63915.0,49510.0,49400.0,14515.0,Y,N,low_risk
1,24450.0,0.143,572.72,MORTGAGE,44574.0,Not Verified,n,15.05,0.0,1.0,...,0.0,0.0,0.0,136425.0,19439.0,15500.0,18925.0,N,N,low_risk
2,13500.0,0.143,316.23,OWN,60000.0,Not Verified,n,28.72,0.0,0.0,...,0.0,0.0,0.0,82124.0,65000.0,5400.0,61724.0,Y,N,low_risk
3,10625.0,0.1774,268.31,RENT,60000.0,Verified,n,15.7,0.0,4.0,...,20.0,0.0,0.0,54855.0,50335.0,23200.0,26255.0,N,N,low_risk
4,6375.0,0.1862,232.46,RENT,60000.0,Source Verified,n,35.5,0.0,0.0,...,75.0,0.0,0.0,90445.0,56541.0,15300.0,72345.0,N,N,low_risk


Create Models 

In [220]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver= "lbfgs", max_iter=25000)
classifier

LogisticRegression(max_iter=25000)

In [221]:
classifier.fit(X_train, y_train)


LogisticRegression(max_iter=25000)

In [222]:
#Print Scores for Logistic Regression Models test and train
for col in X_train.columns:
    if col not in X_test.columns: 
        X_test[col] = 0 
print(f"Testing Score: {classifier.score(X_test, y_test)}")
print(f"Training Score: {classifier.score(X_train, y_train)}")

Testing Score: 0.5616758826031476
Training Score: 0.707471264367816


In [232]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Testing Score: {clf.score(X_test, y_test)}')
print(f'Training Score: {clf.score(X_train, y_train)}')


Testing Score: 0.646958740961293
Training Score: 1.0


Prediction 
Based on initial training and test scores, I believe the Random Forest Classifier will be a more reliable model prior to scaling the data.

In [233]:
 # Scale the data
from sklearn.preprocessing import StandardScaler
clf_scaler = StandardScaler().fit(X_train)
X_selected_train_scaled = clf_scaler.transform(X_train)
X_selected_test_scaled = clf_scaler.transform(X_test)

In [234]:
# Train the Logistic Regression model on the scaled data and print the model score



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [226]:
# Train a Random Forest Classifier model on the scaled data and print the model score

Conclusion: 