In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestClassifier

In [2]:
train_df = pd.read_csv(Path('Resources/Generator/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/Generator/2020Q1loans.csv'))

In [3]:
# Convert categorical data to numeric and separate target feature for training data
train_df = pd.get_dummies(train_df, columns=['home_ownership','initial_list_status', 'application_type', 'hardship_flag', \
                                 'debt_settlement_flag', 'verification_status', 'pymnt_plan', 'target'], drop_first=True)
train_df

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,initial_list_status_w,application_type_Joint App,hardship_flag_Y,debt_settlement_flag_Y,verification_status_Source Verified,verification_status_Verified,target_low_risk
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,1,0,0,0,0,0,0,0,0,1
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,1,0,0,1,0,0,0,1,0,1
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,0,0,1,1,0,0,0,0,1,1
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,1,0,0,1,1,0,0,0,0,1
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,1,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,19975.0,0.2565,801.09,28000.0,28.42,0.0,0.0,15.0,0.0,9679.0,...,0,0,1,1,0,0,0,0,0,0
12176,15000.0,0.1774,540.34,50000.0,23.43,4.0,0.0,16.0,0.0,3193.0,...,0,0,1,1,0,0,0,0,1,0
12177,3600.0,0.1862,131.28,60000.0,28.80,0.0,1.0,14.0,0.0,11804.0,...,0,0,1,1,0,0,0,0,0,0
12178,15000.0,0.0881,475.68,62000.0,11.44,0.0,0.0,5.0,0.0,157.0,...,1,0,0,1,1,0,0,1,0,0


In [4]:
len(train_df['target_low_risk'])

12180

In [5]:
# Convert categorical data to numeric and separate target feature for testing data
test_df = pd.get_dummies(test_df, columns=['home_ownership','initial_list_status', 'application_type', 'hardship_flag', \
                                 'debt_settlement_flag', 'verification_status', 'pymnt_plan', 'target'], drop_first=True)
test_df

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,total_il_high_credit_limit,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,initial_list_status_w,application_type_Joint App,hardship_flag_Y,verification_status_Source Verified,verification_status_Verified,target_low_risk
0,40000.0,0.1033,856.40,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,14515.0,0,0,1,1,0,1,1,0,1
1,24450.0,0.1430,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,18925.0,1,0,0,1,0,0,0,0,1
2,13500.0,0.1430,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,61724.0,0,1,0,1,0,1,0,0,1
3,10625.0,0.1774,268.31,60000.0,15.70,0.0,4.0,17.0,0.0,6216.0,...,26255.0,0,0,1,1,0,0,0,1,1
4,6375.0,0.1862,232.46,60000.0,35.50,0.0,0.0,13.0,0.0,12681.0,...,72345.0,0,0,1,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,0.0,23215.0,...,107388.0,0,0,1,0,0,0,1,0,0
4698,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,0.0,459.0,...,30775.0,0,0,1,1,0,0,0,0,0
4699,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,0.0,6342.0,...,29550.0,0,0,1,0,0,0,0,1,0
4700,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,1.0,11636.0,...,9657.0,0,0,1,1,0,0,1,0,0


In [6]:
# add missing dummy variables to testing set
# list(test_df)
test_df['debt_settlement_flag'] = 0


In [7]:
X_train = train_df.drop(['target_low_risk'], axis=1)
y_train = train_df['target_low_risk']
X_test = test_df.drop(['target_low_risk'], axis=1)
y_test = test_df['target_low_risk']

print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

12180
12180
4702
4702


In [8]:
# (AI) Scale x_train and x_test 
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
print(len(X_train))
print(len(X_train_scaled))
print(len(y_train))
print(len(X_test))
print(len(X_test_scaled))



12180
12180
12180
4702
4702


## Educated Guess on Better Model (LogisticRegression vs. RandomTreeClassifier)
Looking at the variability of the coeffeicient, Logistic Regression would likely not be a good model as it could get confused the different coefficients; whereas, Random Tree classifier a random forest algorithm will sample the data and build several smaller, simpler decisions trees, each tree is much simpler because it is built from a subset of the data. Each tree is considered a “weak classifier” but when you combine them, they form a “strong classifier.”


## Fit model to unscaled and scaled data on LogisticRegression

In [10]:
# Train the Logistic Regression model on the unscaled data and print the model score
# (AI) Fit unscaled data to Linear Regression model
reg = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
reg.score(X_test, y_test)
# print(reg.coef_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.5087196937473416

In [11]:
# Fit the Logistic Regression model on the scaled data and print the model score
reg = LogisticRegression().fit(X_train_scaled, y_train)
reg.score(X_test_scaled, y_test)
# print(reg.coef_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.67333049766057

## Fit model to unscaled and scaled data on RandomForestClassifier before feature selection

In [12]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.6407911527009783


In [13]:
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6405784772437261


## Feature selection

In [14]:
print(len(X_train))
print(len(X_train_scaled))
print(len(y_train))
print(len(X_test))
print(len(X_test_scaled))



12180
12180
12180
4702
4702


In [15]:
# Get feature importance
features = clf.feature_importances_
features = sorted(features, reverse=True)


In [16]:
print(len(X_train))
print(len(X_train_scaled))
print(len(y_train))
print(len(X_test))
print(len(X_test_scaled))

12180
12180
12180
4702
4702


In [17]:
# Use SelectFromModel to only get the important features
from sklearn.feature_selection import SelectFromModel

# create selection object by passing in the model fitted above
# Note, the original model MUST be one that has a feature_importances_ or coef_
sel = SelectFromModel(clf)

In [18]:
# Fit selection object to the training set
sel.fit(X_train_scaled, y_train)

# Displays an array indicating whether given feature is important
sel.get_support()

array([ True,  True,  True,  True,  True, False, False, False, False,
        True, False,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True, False, False, False, False,  True, False,
       False, False, False,  True,  True,  True, False, False,  True,
        True,  True, False, False, False, False,  True,  True,  True,
       False, False,  True,  True,  True, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True,  True,  True,  True, False, False, False, False, False,
       False, False, False, False])

In [19]:
print(len(X_train))
print(len(X_train_scaled))
print(len(y_train))
print(len(X_test))
print(len(X_test_scaled))

12180
12180
12180
4702
4702


In [22]:
# Retrain entire model on just selected features by transforming features
# Using the sklearn selection object
# X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X_train_scaled), y_train, random_state=1)
X_selected_train = sel.transform(X_train_scaled)
X_selected_test = sel.transform(X_test_scaled)

# Scale the data
scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)

In [23]:
print(len(X_train))
print(len(X_train_scaled))
print(len(X_selected_train_scaled))
print(len(y_train))
print(len(X_test))
print(len(X_test_scaled))
print(len(X_selected_test_scaled))



12180
12180
12180
12180
4702
4702
4702


## Educated guess on LogisticRegression vs. RandomForestRegression. RF still seems to be the better model to use based on the variability of the features.

In [24]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_selected_train_scaled, y_train)
print(f'Training Score: {clf.score(X_selected_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_selected_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6108039132284134


In [25]:
# Fit the selected featured scaled data nto 
clf = LogisticRegression()
clf.fit(X_selected_train_scaled, y_train)
print(f'Training Score: {clf.score(X_selected_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_selected_test_scaled, y_test)}')

Training Score: 0.7094417077175698
Testing Score: 0.7735006380263718


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Scale the data