In [24]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [25]:
# Import the data
train_df = pd.read_csv(Path('Resources/lending_data.csv'))
train_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [26]:
test_df = pd.read_csv(Path('Resources/lending_data.csv'))
test_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [27]:
y_train = train_df["loan_status"]
X_train = train_df.drop(columns = ["loan_status"])
X_train = pd.get_dummies(X_train)
X_train.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [28]:
# Split the data into X_train, X_test, y_train, y_test
y_test = test_df["loan_status"]
X_test = test_df.drop(columns = ["loan_status"])
X_test = pd.get_dummies(X_test)
X_test.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [29]:
# add missing dummy variables to testing set
# One-hot encoding dataframes
X_dummies_train = pd.get_dummies(X_train)
X_dummies_test = pd.get_dummies(X_test)
print(f"Train: {X_dummies_train.shape}, Test: {X_dummies_test.shape}")

Train: (77536, 7), Test: (77536, 7)


In [30]:
# Convert output labels to 0 and 1
y_label_train = LabelEncoder().fit_transform(train_df['loan_status'])
y_label_train

y_label_test = LabelEncoder().fit_transform(test_df['loan_status'])
y_label_test

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [31]:
# add missing dummy variables to testing set
for col in X_dummies_train.columns:
    if col not in X_dummies_test.columns:
        X_dummies_test[col]=0

In [32]:
print(f"Train: {X_dummies_train.shape}, Test: {X_dummies_test.shape}")

Train: (77536, 7), Test: (77536, 7)


## Prediction

Prediction: Random Forest Classifier will perform same as Logistic Regression. Even if random Forest Classifier performs better with more categorical data than numeric and logistic regression is a little confusing when comes to categorical data So.

In [33]:
from sklearn.linear_model import LogisticRegression
clf_logistic  = LogisticRegression(solver='lbfgs').fit(X_dummies_train, y_label_train) 

print(f"Training Data Score: {clf_logistic.score(X_dummies_train, y_label_train)}")
print(f"Testing Data Score: {clf_logistic.score(X_dummies_test, y_label_test)}");

Training Data Score: 0.9920037144036319
Testing Data Score: 0.9920037144036319


In [36]:
# Train a Logistic Regression model print the model score
from sklearn.ensemble import RandomForestClassifier
clf_forest = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_dummies_train, y_label_train)

print(f'Training Score: {clf_forest.score(X_dummies_train, y_label_train)}')

Training Score: 0.9969304581097813


In [110]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
clf_forest = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_dummies_train, y_label_train)

print(f'Testing Score: {clf_forest.score(X_dummies_test, y_label_test)}')

0.9910751134956666

## Results - Unscaled

###### LR Unscaled: 
Training Data Score: 0.9920037144036319, Testing Data Score: 0.9920037144036319

###### RF Unscaled: 
Training Score: 0.9920037144036319, Testing Score: 0.9920037144036319

On unscaled data, both logistic regression and the Random Forest Classifier performed the same.

In [44]:
# Scale the data using StandardScaler()
scaler = StandardScaler().fit(X_dummies_train)
X_train_scaled = scaler.transform(X_dummies_train)
X_test_scaled = scaler.transform(X_dummies_test)


## Prediction - Scaled

##### Prediction - Scaled
Prediction: The score for Logistic Regression will remazin the same due to scaling whereas the score for Random Forest will remain the same. Graphical-model classifiers like Random Forest are invariant to feature scaling.


In [45]:
# Train the Logistic Regression model on the scaled data and print the model score
clf_lr = LogisticRegression().fit(X_train_scaled, y_label_train)

print(f'Testing Score: {clf_lr.score(X_test_scaled, y_label_test)}')

Testing Score: 0.9941317581510524


In [46]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf_rf = RandomForestClassifier(random_state=42, n_estimators=500).fit(X_train_scaled, y_label_train)

print(f'Testing Score: {clf_rf.score(X_test_scaled, y_label_test)}')

Testing Score: 0.9969304581097813


## Results - Scaled

###### LR Scaled: Testing Score: 0.9941317581510524

###### RF Scaled: Testing Score: 0.9969304581097813

Overall, scaling barely improved the score of the random forestier model to 0.0028 it outperformed the logistic regression model. This shows that sometimes a simple model with scaled data can be a better fit than one with more complexity.