I predict that the Random Forest Classifier will perform better on the UNSCALED data

I predict the Logistic Regression will perform better on the SCALED data

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

# models to observe
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# reporting
from sklearn.metrics import confusion_matrix, classification_report

# for scaling data
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the 2019 loans as the training set and the 2020 Q1 loans as the testing set
train_df, test_df = (
    pd.read_csv(Path(csv_name))
    for csv_name in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

# EDA

In [None]:
# Preview the first rows of the training data (2019 loans)
train_df.head()

In [None]:
# Preview the first rows of the testing data (2020 Q1 loans)
test_df.head()

In [None]:
# Column names, dtypes and non-null counts of the training data
train_df.info()

In [None]:
# Column names, dtypes and non-null counts of the testing data
test_df.info()

In [None]:
# Fraction of all loaded rows (train + test) that belong to the training set
n_train, n_test = len(train_df), len(test_df)
n_train / (n_train + n_test)

In [None]:
# Class balance of the target column in the training set
train_df["loan_status"].value_counts()

In [None]:
# Class balance of the target column in the testing set
test_df["loan_status"].value_counts()

# Data Cleaning and Conversion

In [None]:
# Encode the training target in place: "high_risk" -> 1, any other value -> 0
train_df["loan_status"] = (train_df["loan_status"] == "high_risk").astype("int64")

# Text (object-dtype) columns that still need encoding; the target is numeric
# after the line above, so it is not part of this selection
train_categorical_df = train_df.select_dtypes(include=["object"])
train_categorical_df.head()

In [None]:
# Print the frequency of each category, one text column at a time,
# with a blank line after every column
for _, column_values in train_categorical_df.items():
    print(column_values.value_counts(), end="\n\n")

In [None]:
# One-hot encode the training text columns (one indicator column per category)
train_categorical_df2 = pd.get_dummies(data=train_categorical_df)
train_categorical_df2.head()

In [None]:
# Final training frame: the non-text columns joined, row index to row index,
# with the one-hot indicator columns
numeric_train_df = train_df.select_dtypes(exclude=["object"])
train_df_final = numeric_train_df.merge(
    train_categorical_df2, left_index=True, right_index=True
)
train_df_final.head()

In [None]:
# Encode the testing target in place: "high_risk" -> 1, any other value -> 0
test_df["loan_status"] = (test_df["loan_status"] == "high_risk").astype("int64")

In [None]:
# Text (object-dtype) columns of the testing set that still need encoding
test_categorical_df = test_df.select_dtypes(include=["object"])
test_categorical_df.head()

In [None]:
# Print the frequency of each category, one text column at a time,
# with a blank line after every column
for _, column_values in test_categorical_df.items():
    print(column_values.value_counts(), end="\n\n")

In [None]:
# One-hot encode the testing text columns (one indicator column per category)
test_categorical_df2 = pd.get_dummies(data=test_categorical_df)
test_categorical_df2.head()

In [None]:
# Final testing frame: the non-text columns joined, row index to row index,
# with the one-hot indicator columns
numeric_test_df = test_df.select_dtypes(exclude=["object"])
test_df_final = numeric_test_df.merge(
    test_categorical_df2, left_index=True, right_index=True
)
test_df_final.head()

In [None]:
# List (without adding anything yet) the training columns that are absent
# from the testing set, in training-column order
missing_in_test = [
    name for name in train_df_final.columns if name not in test_df_final.columns
]
for name in missing_in_test:
    print(name)

In [None]:
# Give the testing set every column the training set has but it lacks, instead
# of hard-coding a single column name copied from a previous cell's output.
# Each added column is filled with 0: for a dummy column this means the
# category never occurs in the testing data.
# The original cell added exactly one such column, "debt_settlement_flag_Y".
for missing_col in [
    c for c in train_df_final.columns if c not in test_df_final.columns
]:
    test_df_final[missing_col] = 0
test_df_final.head()

In [None]:
# Final checks: column names, dtypes and non-null counts of the merged training frame
train_df_final.info()

In [None]:
# Final checks: column names, dtypes and non-null counts of the merged testing frame
test_df_final.info()

In [None]:
# Columns excluded from modelling, dropped in place from both frames:
#   'Unnamed: 0', 'index' - presumably leftover row identifiers (confirm)
#   'pymnt_plan_n'        - has no corresponding 'y' column
remove_cols = ['Unnamed: 0', 'index', 'pymnt_plan_n']

for frame in (train_df_final, test_df_final):
    frame.drop(columns=remove_cols, inplace=True)

In [None]:
# Split each final frame into features (X) and the encoded target (y)
X_train = train_df_final.drop(columns="loan_status")
y_train = train_df_final["loan_status"]

# Force the test features into exactly the training columns, in the same order.
# scikit-learn checks DataFrame feature names (and their order) against those
# seen during fit, and the dummy columns of the two sets are built
# independently, so nothing else guarantees they line up. Columns missing from
# the testing set are filled with 0; columns unknown to the training set are
# discarded.
X_test = test_df_final.drop(columns="loan_status").reindex(
    columns=X_train.columns, fill_value=0
)
y_test = test_df_final["loan_status"]

# Training and testing models

In [None]:
# Logistic Regression on the unscaled features: fit on the training set, then
# report accuracy on both splits plus per-class metrics for the testing set.
# NOTE(review): default solver settings on unscaled features may emit a
# ConvergenceWarning - confirm when running.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

print(f'Training Score: {lr.score(X_train, y_train)}')
print(f'Testing Score: {lr.score(X_test, y_test)}')

# Each report is preceded by a blank line and its title
for title, metric in (
    ("CLASSIFICATION REPORT:", classification_report),
    ("CONFUSION MATRIX:", confusion_matrix),
):
    print()
    print(title)
    print(metric(y_test, y_pred))

In [None]:
# Random Forest (25 trees, fixed seed for reproducibility) on the unscaled
# features: fit on the training set, then report accuracy on both splits plus
# per-class metrics for the testing set
rf = RandomForestClassifier(random_state=42, n_estimators=25).fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(f'Training Score: {rf.score(X_train, y_train)}')
print(f'Testing Score: {rf.score(X_test, y_test)}')

# Each report is preceded by a blank line and its title
for title, metric in (
    ("CLASSIFICATION REPORT:", classification_report),
    ("CONFUSION MATRIX:", confusion_matrix),
):
    print()
    print(title)
    print(metric(y_test, y_pred))

In [None]:
# Fit the scaler on the training features only, so the testing set does not
# influence the scaling parameters
scaler = StandardScaler().fit(X_train)

# Apply that single fitted transformation to both splits
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))

In [None]:
# Logistic Regression on the scaled features: fit on the training set, then
# report accuracy on both splits plus per-class metrics for the testing set
lr_sc = LogisticRegression().fit(X_train_scaled, y_train)
y_pred = lr_sc.predict(X_test_scaled)

print(f'Training Score: {lr_sc.score(X_train_scaled, y_train)}')
print(f'Testing Score: {lr_sc.score(X_test_scaled, y_test)}')

# Each report is preceded by a blank line and its title
for title, metric in (
    ("CLASSIFICATION REPORT:", classification_report),
    ("CONFUSION MATRIX:", confusion_matrix),
):
    print()
    print(title)
    print(metric(y_test, y_pred))

In [None]:
# Random Forest (25 trees, fixed seed for reproducibility) on the scaled
# features: fit on the training set, then report accuracy on both splits plus
# per-class metrics for the testing set
rf_sc = RandomForestClassifier(random_state=42, n_estimators=25).fit(
    X_train_scaled, y_train
)
y_pred = rf_sc.predict(X_test_scaled)

print(f'Training Score: {rf_sc.score(X_train_scaled, y_train)}')
print(f'Testing Score: {rf_sc.score(X_test_scaled, y_test)}')

# Each report is preceded by a blank line and its title
for title, metric in (
    ("CLASSIFICATION REPORT:", classification_report),
    ("CONFUSION MATRIX:", confusion_matrix),
):
    print()
    print(title)
    print(metric(y_test, y_pred))

On the unscaled data, the Random Forest Classifier performs better, with a testing score of 0.63207, while the Logistic Regression reaches only 0.51595. The Logistic Regression's f1-score for predicting "high risk" loans is very low. The prediction that the Random Forest Classifier would perform better on the unscaled data is therefore correct.

On the scaled data, the Logistic Regression performs better, with a testing score of 0.76733, while the Random Forest Classifier reaches only 0.63270. The Random Forest Classifier's f1-score for predicting "low risk" loans is low compared with the Logistic Regression's. The prediction that the Logistic Regression would perform better on the scaled data is therefore correct.

Additional Observation: Scaling the data only slightly improved the Random Forest Classifier's testing score (0.63207 to 0.63270), while it greatly improved the Logistic Regression's testing score (0.51595 to 0.76733).