In [7]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt

In [8]:
train_df = pd.read_csv(Path('Resources/Generator/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/Generator/2020Q1loans.csv'))

In [9]:
X_train = train_df.drop("target", axis=1)
X_test = test_df.drop("target", axis=1)

In [10]:
# Convert categorical data to numeric and separate target feature for training data
X_train_dummy = pd.get_dummies(X_train)
X_test_dummy = pd.get_dummies(X_test)
X_test_dummy.head()



Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,0,1,1,0,0,1,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,0,1,0,1,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,0,1,0,1,1,0,0,1,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0,1,1,0,1,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,0,1,1,0,1,0,1


In [11]:
# add columns to the test dataframe that are only in the training set
X_train_dummy_col = X_train_dummy.columns
X_test_dummy_col = X_test_dummy.columns

new_col = [x for x in X_train_dummy_col if x not in X_test_dummy_col]

new_col


['debt_settlement_flag_Y']

In [12]:
# change value of added columns missing from the test set to 0

X_test_dummy[new_col] = new_col
X_test_dummy[new_col] = 0

In [13]:
# add missing dummy variables to testing set
y_train = train_df['target']
y_test = test_df['target']

In [14]:
classifier = LogisticRegression()
classifier

LogisticRegression()

In [15]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier.fit(X_train_dummy, y_train)
print(f"Training Data Score: {classifier.score(X_train_dummy, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_dummy, y_test)}")

Training Data Score: 0.6530377668308702
Testing Data Score: 0.5091450446618461


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# Train a Random Forest Classifier model and print the model score
rf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_dummy, y_train)
print(f'Training Score: {rf.score(X_train_dummy, y_train)}')
print(f'Testing Score: {rf.score(X_test_dummy, y_test)}')

Training Score: 1.0
Testing Score: 0.646958740961293


In [17]:
# Scale the data
scaler = StandardScaler().fit(X_train_dummy)
X_train_scaled = scaler.transform(X_train_dummy)
X_test_scaled = scaler.transform(X_test_dummy)

In [18]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier.fit(X_train_scaled, y_train)
print(f"Training Scaled Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Training Scaled Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Scaled Data Score: 0.710919540229885
Training Scaled Data Score: 0.7598894087622289


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rfs = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {rfs.score(X_train_scaled, y_train)}')
print(f'Testing Score: {rfs.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6480221182475542
