In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

In [None]:
# Load the 2019 loans as the training set and the 2020 Q1 loans as the testing set.
# Both paths are relative to the notebook's working directory.
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [None]:
# Preview the first rows of the raw training data
train_df.head()

In [None]:
# Drop every training row that contains a missing value.
# NOTE: this rebinds train_df, so the raw frame is no longer available after this cell.
train_df = train_df.dropna()

In [None]:
# Keep the training column names as a plain list and show how many there are
train_cols = list(train_df.columns)
print(len(train_cols), train_cols)

In [None]:
# Class balance of the target column in the training data
train_df["loan_status"].value_counts()

In [None]:
# Preview the first rows of the raw testing data
test_df.head()

In [None]:
# Drop every testing row that contains a missing value (same cleaning as the training data).
# NOTE: this rebinds test_df, so the raw frame is no longer available after this cell.
test_df = test_df.dropna()

In [None]:
# Keep the testing column names as a plain list and show how many there are
test_cols = list(test_df.columns)
print(len(test_cols), test_cols)

### Comparing columns in training and testing data

In [None]:
# Training columns that do not appear in the testing data (kept in training order)
compare = list(filter(lambda name: name not in test_cols, train_cols))
compare

### Convert categorical data to numeric and separate target feature for training data

In [None]:
# Training features: every column except the loan_status target
X = train_df.drop(columns='loan_status')
X.head()

In [None]:
# One-hot encode the categorical feature columns; numeric columns pass through unchanged
X_train = pd.get_dummies(X)
X_train.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert output labels to 0 and 1
# (LabelEncoder assigns integer codes to the sorted unique loan_status values.)
y_train = LabelEncoder().fit_transform(train_df['loan_status'])
y_train

### Convert categorical data to numeric and separate target feature for testing data

In [None]:
# Testing features: every column except the loan_status target
X_1 = test_df.drop(columns='loan_status')
X_1.head()

In [None]:
# One-hot encode the categorical feature columns of the testing data.
# A category that never occurs in the testing data produces no dummy column here,
# so X_test can end up with fewer columns than X_train.
X_test = pd.get_dummies(X_1)
X_test.head()

In [None]:
# Convert output labels to 0 and 1.
# The encoder is fitted on the *training* labels and only applied to the test
# labels, so every class maps to the same integer in y_train and y_test.
# transform() raises a ValueError if the test data contains a label that was
# never seen in training, instead of silently assigning it a new code.
y_test = LabelEncoder().fit(train_df['loan_status']).transform(test_df['loan_status'])
y_test

In [None]:
# Background: scoring X_test raised
# "ValueError: X has 93 features per sample; expecting 94".
# List the training feature columns that X_test lacks (in X_train order) so
# they can be added and filled with 0.
present_in_test = X_train.columns.isin(X_test.columns)
missing_col = X_train.columns[~present_in_test].tolist()
missing_col

In [None]:
# Align the test features with the training features instead of hard-coding a
# single column name: every column that exists only in X_train is added and
# filled with 0 (the category never occurs in the test data), and the columns
# are put in the same order as X_train, which is what the fitted models expect.
# Columns that exist only in X_test are dropped because the models were never
# trained on them. Re-running this cell is a no-op.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Check the testing features after the missing dummy column has been added
X_test.head()

### Create Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default hyperparameters. This same
# object is fitted on the unscaled features next and re-fitted on the scaled
# features later in the notebook.
classifier = LogisticRegression()
classifier

In [None]:
# Fit the logistic regression on the unscaled, one-hot encoded training features
classifier.fit(X_train, y_train)

In [None]:
# Mean accuracy of the logistic regression on the unscaled training and testing data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

### Create a confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Compare the true test labels with the logistic regression's predictions on
# the unscaled test features. In the resulting matrix, rows are true classes
# and columns are predicted classes.
y_true = y_test
y_pred = classifier.predict(X_test)
array = confusion_matrix(y_true, y_pred)
array

In [None]:
# Per-class precision, recall and F1 for the predictions computed in the previous cell
print(classification_report(y_true, y_pred))

### Visualizing the confusion matrix

In [None]:
import seaborn as sn
import matplotlib.pyplot as plt

# Plot the confusion matrix of the logistic regression on the unscaled data
confusion_df = pd.DataFrame(array)
sn.set(font_scale=1.4) # for label size
# fmt="d" prints the integer counts in full; the default ".2g" format would
# abbreviate large counts to scientific notation (e.g. 1.2e+03).
heatmap_ax = sn.heatmap(confusion_df, annot=True, fmt="d", annot_kws={"size": 14})
# confusion_matrix puts the true classes on the rows and the predicted classes on the columns
heatmap_ax.set(
    xlabel="Predicted label",
    ylabel="True label",
    title="Logistic Regression (unscaled data)",
)
plt.show()

### Create RandomForestClassifier model for scaled data

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features only, then apply
# that same transformation to both the training and the testing features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(frame) for frame in (X_train, X_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees (fixed seed for reproducibility) trained on the scaled features
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

In [None]:
# Importances of the forest trained on the scaled data: one value per feature
# column it was fitted on (the columns of X_train).
feature_importances = clf.feature_importances_

In [None]:
# The forest was trained on X_train_scaled, whose columns are the one-hot
# encoded columns of X_train (same order). The importances must therefore be
# paired with X_train.columns, not with the pre-encoding X.columns: the latter
# is shorter, so zip() would silently truncate and mislabel the bars.
feature_names = X_train.columns
assert len(feature_names) == len(clf.feature_importances_), \
    "feature names and importances are out of sync"

# Sort ascending so the most important feature ends up at the top of the chart
features = sorted(zip(feature_names, clf.feature_importances_), key = lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

fig, ax = plt.subplots()

# Very tall figure so that each feature gets its own readable bar
fig.set_size_inches(10,200)
plt.margins(y=0.001)

ax.barh(y=cols, width=width)
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("Random Forest feature importances (scaled data)")

plt.show()

### Create RandomForestClassifier model for unscaled data

In [None]:
# Same random forest configuration as before, this time trained on the unscaled features
clf_1 = RandomForestClassifier(n_estimators=500, random_state=1)
clf_1.fit(X_train, y_train)
print(f'Training Score: {clf_1.score(X_train, y_train)}')
print(f'Testing Score: {clf_1.score(X_test, y_test)}')

### Logistic Regression model on the scaled data

In [None]:
# Re-fit the Logistic Regression model (the same `classifier` object) on the scaled training data.
# NOTE: this replaces the fit on the unscaled data, so earlier cells that use
# `classifier` with X_train / X_test give different results if re-run after this cell.
classifier.fit(X_train_scaled, y_train)

In [None]:
# Mean accuracy of the logistic regression on the scaled training and testing data
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

In [None]:
# Compare the true test labels with the logistic regression's predictions on
# the scaled test features. y_true and y_pred are rebound here, replacing the
# values from the unscaled evaluation.
y_true = y_test
y_pred = classifier.predict(X_test_scaled)
array_1 = confusion_matrix(y_true, y_pred)
array_1

In [None]:
# Per-class precision, recall and F1 for the predictions computed in the previous cell
print(classification_report(y_true, y_pred))

### Visualize confusion matrix for Logistic Regression for scaled data

In [None]:
# Plot the confusion matrix of the logistic regression on the scaled data
confusion_df1 = pd.DataFrame(array_1)
sn.set(font_scale=1.4) # for label size
# fmt="d" prints the integer counts in full; the default ".2g" format would
# abbreviate large counts to scientific notation (e.g. 1.2e+03).
heatmap_ax = sn.heatmap(confusion_df1, annot=True, fmt="d", annot_kws={"size": 14})
# confusion_matrix puts the true classes on the rows and the predicted classes on the columns
heatmap_ax.set(
    xlabel="Predicted label",
    ylabel="True label",
    title="Logistic Regression (scaled data)",
)
plt.show()

# Conclusion (Logistic Regression vs Random Forest Classifier)

## Logistic Regression Model 
<hr>

### For the unscaled data, the scores for the Logistic Regression:
- Training Data Score: 0.6485221674876848
- Testing Data Score: 0.5253083794130158

### For the scaled data, the scores for the Logistic Regression:
- Training Data Score: 0.713136288998358
- Testing Data Score: 0.7201190982560612

### As one can see from the scores above, the best model to predict the Credit Risk is the Logistic Regression on the scaled data. Its training and testing scores are both higher and much closer to each other than on the unscaled data, which suggests the model generalizes better and allows us to make more accurate predictions.

## Random Forest Classifier Model
<hr>

### For unscaled data, the scores for the Random Forest Classifier Model: 
- Training Score: 1.0
- Testing Score: 0.6180348787749894

### For scaled data, the scores for the Random Forest Classifier Model: 
- Training Score: 1.0
- Testing Score: 0.6193109315185028

### As one can see from the scores above, the Random Forest Classifier won't be very effective in predicting the Credit Risks: a training score of 1.0 combined with a testing score of about 0.62 indicates that the model overfits the training data. The scores on the scaled and unscaled data are almost identical (the difference is negligible).

### In conclusion, for this exercise, Logistic Regression will be a better model to use. 