In [35]:
#Findings: 

# Overall I was suprised to see that the Random Forest 
# Classification did not have a noticable difference after it was rescaled.
# Logistic Regression did offer an improvement after re-scaling. I would 
# have expected a greater improvement with regards to Random Forest 
# Classification.  Because there appears to be a linear relationship between
# cause and effect, Linear Regression is the better choice compared to 
#Random Forst.

In [36]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model  import LogisticRegression
from sklearn.ensemble      import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [37]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df  = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [38]:
# Convert categorical data to numeric and separate target feature for training data

train_df = pd.get_dummies(train_df, columns=['home_ownership','verification_status','loan_status','application_type',
                                             'pymnt_plan','initial_list_status','hardship_flag','debt_settlement_flag'])

test_df = pd.get_dummies(test_df, columns=['home_ownership','verification_status','loan_status','application_type',
                                           'pymnt_plan','initial_list_status','hardship_flag','debt_settlement_flag'])

In [39]:
train_df.drop("Unnamed: 0",axis=1,inplace=True)
test_df.drop("Unnamed: 0", axis=1,inplace=True)

In [40]:
print(test_df.columns)

Index(['index', 'loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_

In [41]:
#Add missing dummy variables to testing set

#Source: https://stackoverflow.com/questions/41335718/keep-same-dummy-variable-in-training-and-testing-data

# Get missing columns in the training test
missing_cols = set( train_df.columns ) - set( test_df.columns )

# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    test_df[c] = 0
    
# Ensure the order of column in the test set is in the same order than in train set
test_df = test_df[train_df.columns]

In [42]:
train_df.drop('loan_status_low_risk', axis=1, inplace=True)
test_df.drop('loan_status_low_risk',  axis=1, inplace=True)

train_df.rename( columns = {'loan_status_high_risk':'loan_status'}, inplace=True)
test_df.rename(  columns = {'loan_status_high_risk':'loan_status'}, inplace=True)

In [43]:
# Train the Logistic Regression model on the unscaled data and print the model score

#Source: https://stackoverflow.com/questions/60636444/what-is-the-difference-between-x-test-x-train-y-test-y-train-in-sklearn
y_train = train_df['loan_status']
X_train = train_df.drop('loan_status', axis=1)

y_test = test_df['loan_status']
X_test = test_df.drop('loan_status', axis=1)

unscaled_lr = LogisticRegression()
unscaled_lr.fit(X_train, y_train)

print(f"Unscaled Logistic Regression Score: {unscaled_lr.score(X_test, y_test)}")

Unscaled Logistic Regression Score: 0.5208421948107188


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
# Train a Random Forest Classifier model and print the model score
unscaled_rfc = RandomForestClassifier(random_state=1, n_estimators=3).fit(X_train, y_train)

print(f'Random Forest Classifier Model Score: {unscaled_rfc.score(X_test, y_test)}')

Random Forest Classifier Model Score: 0.5295618885580604


In [45]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled  = scaler.transform(X_test)

In [46]:
# Train the Logistic Regression model on the scaled data and print the model score
scaled_lr_clf = LogisticRegression().fit(X_train_scaled, y_train)
print(f'Testing Score for scaled LR: {scaled_lr_clf.score(X_test_scaled, y_test)}')

Testing Score for scaled LR: 0.7216078264568269


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
# Train a Random Forest Classifier model on the scaled data and print the model score
scaled_rf_2 = RandomForestClassifier(random_state=1, n_estimators=3).fit(X_train_scaled, y_train)
print(f'Testing Score for scaled RF: {scaled_rf_2.score(X_test_scaled, y_test)}')

Testing Score for scaled RF: 0.5287111867290515
