In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [5]:
#Inspect dataframes with info() for cleaning
#train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 86 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  4702 non-null   int64  
 1   index                       4702 non-null   int64  
 2   loan_amnt                   4702 non-null   float64
 3   int_rate                    4702 non-null   float64
 4   installment                 4702 non-null   float64
 5   home_ownership              4702 non-null   object 
 6   annual_inc                  4702 non-null   float64
 7   verification_status         4702 non-null   object 
 8   loan_status                 4702 non-null   object 
 9   pymnt_plan                  4702 non-null   object 
 10  dti                         4702 non-null   float64
 11  delinq_2yrs                 4702 non-null   float64
 12  inq_last_6mths              4702 non-null   float64
 13  open_acc                    4702 

In [6]:
#cleaning test df
del test_df['Unnamed: 0']
del test_df['index']

In [7]:
#cleaning train df
del train_df['Unnamed: 0']
del train_df['index']

In [9]:
#Inspect dataframes again after cleaning
#test_df.info()
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 84 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   12180 non-null  float64
 1   int_rate                    12180 non-null  float64
 2   installment                 12180 non-null  float64
 3   home_ownership              12180 non-null  object 
 4   annual_inc                  12180 non-null  float64
 5   verification_status         12180 non-null  object 
 6   loan_status                 12180 non-null  object 
 7   pymnt_plan                  12180 non-null  object 
 8   dti                         12180 non-null  float64
 9   delinq_2yrs                 12180 non-null  float64
 10  inq_last_6mths              12180 non-null  float64
 11  open_acc                    12180 non-null  float64
 12  pub_rec                     12180 non-null  float64
 13  revol_bal                   121

In [11]:
# Convert categorical data to numeric and separate target feature for training data
train_df = pd.get_dummies(train_df, columns=['home_ownership', 'verification_status',
                                            'application_type', 'hardship_flag',
                                             'debt_settlement_flag', 'initial_list_status',
                                            'pymnt_plan', 'loan_status'])

In [12]:
# Convert categorical data to numeric and separate target feature for testing data
test_df = pd.get_dummies(test_df, columns=['home_ownership', 'verification_status',
                                            'application_type', 'hardship_flag',
                                             'debt_settlement_flag', 'initial_list_status',
                                            'pymnt_plan', 'loan_status'])

In [21]:
# Convert categorical data to numeric and separate target feature for training data
train_df=train_df.rename(columns={'loan_status_high_risk':'loan_status'})

In [22]:
# Convert categorical data to numeric and separate target feature for testing data
test_df =test_df.rename(columns={'loan_status_high_risk':'loan_status'})

In [23]:
# add missing dummy variables to testing set
test_df['debt_settlement_flag_Y'] = 0

In [24]:
y_train = train_df['loan_status']
X_train = train_df.drop('loan_status', axis=1)

In [25]:
y_test = test_df['loan_status']
X_test = test_df.drop('loan_status', axis=1)

Unscaled Prediction: Logistic regression is the model I predit will score better. The reasoning for that is due to the data we are working with has been sampled to give an even number of high and low risk loans. I think the logistic regression will work better with that type of data sample. 

In [26]:
# Train the Logistic Regression model on the unscaled data and print the model score
lr_classifier_unscaled = LogisticRegression()
lr_classifier_unscaled.fit(X_train, y_train)

print(f"Unscaled Logistic Regression Score: {lr_classifier_unscaled.score(X_test, y_test)}")

Unscaled Logistic Regression Score: 0.5159506592939175


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
# Train a Random Forest Classifier model and print the model score
rf_clf_unscaled = RandomForestClassifier(random_state=1, n_estimators=3).fit(X_train, y_train)
print(f'Unscaled RandomForest Score: {rf_clf_unscaled.score(X_test, y_test)}')

Unscaled RandomForest Score: 0.5002126754572522


Unscaled Conclusion: Logistic Regression did score marginally better than the RandomForest model with the data unscaled.  

In [33]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Scaled Prediction: I'm sticking with logistic regression as performing better with the scaled data. I think the scalling of this data sample will create a larger difference between LR and RF compared to the unscaled data. 

In [34]:
# Train the Logistic Regression model on the scaled data and print the model score
lr_classifier_scaled = LogisticRegression()
lr_classifier_scaled.fit(X_train_scaled, y_train)

print(f"Scaled Logistic Regression Score: {lr_classifier_unscaled.score(X_test_scaled, y_test)}")

Scaled Logistic Regression Score: 0.49978732454274777


In [37]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rf_clf_scaled = RandomForestClassifier(random_state=1, n_estimators=3).fit(X_train_scaled, y_train)
print(f'Scaled RandomForest Score: {rf_clf_scaled.score(X_test_scaled, y_test)}')

Scaled RandomForest Score: 0.5004253509145045


Scaled Conclusion: RandomForest scored marginally better with the scaled data. The difference was minimal, but the LR score dropped with the scaled data. The RF model results performed almost identically even with the scaling. 