In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Source files: 2019 loans are used for training, 2020 Q1 loans for testing.
TRAIN_CSV = 'Resources/2019loans.csv'
TEST_CSV = 'Resources/2020Q1loans.csv'

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Training data: pull out the label column, then one-hot encode the
# remaining categorical columns so every feature is numeric.
y_19 = train_df['target']
X_19 = pd.get_dummies(train_df.drop(columns=['target']))

X_19.head()

In [None]:
# Testing data: pull out the label column, then one-hot encode the
# remaining categorical columns so every feature is numeric.
y_20 = test_df['target']
X_20 = pd.get_dummies(test_df.drop(columns=['target']))

X_20.head()

In [None]:
# Align the testing features with the training features.
# get_dummies only creates a column for a category that actually occurs, so the
# two frames can end up with different column sets. reindex() against the
# training columns:
#   * adds any dummy column missing from the testing set, filled with 0,
#   * drops any dummy column that appears only in the testing set, and
#   * puts the columns in the same order as X_19.
# scikit-learn estimators fit on a DataFrame expect the same feature names in
# the same order at predict/score/transform time, so all three matter.
# Rebinding X_20 (instead of inserting columns in place) also keeps this cell
# idempotent when it is re-run.
X_20 = X_20.reindex(columns=X_19.columns, fill_value=0)

In [None]:
# Logistic regression baseline fit on the raw (unscaled) 2019 features.
# max_iter is raised well above the library default to give lbfgs room to
# converge on unscaled inputs.
classifier = LogisticRegression(max_iter=15000, solver='lbfgs')

# fit() returns the estimator, so the fitted model is this cell's output.
classifier.fit(X_19, y_19)

In [None]:

# Mean accuracy of the unscaled logistic regression on the 2019 (train)
# and 2020 Q1 (test) data.
lr_train_score = classifier.score(X_19, y_19)
print(f"Train Data Score: {lr_train_score}")
lr_test_score = classifier.score(X_20, y_20)
print(f"Test Data Score: {lr_test_score}")

In [None]:
# Random forest fit on the raw (unscaled) 2019 features.
# random_state is fixed so the result is reproducible across runs.
rfc19 = RandomForestClassifier(n_estimators=50, random_state=1)
rfc19.fit(X_19, y_19);

In [None]:
# Random forest fit directly on the 2020 Q1 features and labels.
# NOTE(review): this model is trained on the data used as the test set and is
# not referenced by the scoring cells visible here - confirm it is still needed.
rfc20 = RandomForestClassifier(n_estimators=50, random_state=1)
rfc20.fit(X_20, y_20);

In [None]:
# Mean accuracy of the unscaled random forest on the 2019 (train)
# and 2020 Q1 (test) data.
rf_train_score = rfc19.score(X_19, y_19)
print(f'Train Score: {rf_train_score}')
rf_test_score = rfc19.score(X_20, y_20)
print(f'Test Score: {rf_test_score}')

In [None]:
# Standardize the features.
# The scaler is fit on the 2019 (training) features only and then applied to
# both sets, so the 2020 data does not influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_19)

X_19_scaled = scaler.transform(X_19)
X_20_scaled = scaler.transform(X_20)

In [None]:
# Logistic regression intended for the scaled-data experiment.
# Same hyperparameters as the unscaled model so the two runs are comparable.
classifier2 = LogisticRegression(solver='lbfgs', max_iter=15000)

# Display the estimator created in this cell (previously this showed
# `classifier`, the earlier model fit on unscaled data).
classifier2

In [None]:

classifier2.fit(X_19, y_19)

In [None]:
print(f"Training Data Score: {classifier2.score(X_19, y_19)}")
print(f"Testing Data Score: {classifier2.score(X_20, y_20)}")

In [None]:
# Train a Random Forest Classifier model on the scaled data.
# Use the same hyperparameters as the unscaled forest (rfc19: 50 trees,
# random_state=1) so that feature scaling is the only difference between the
# two experiments; leaving n_estimators at its default would confound the
# comparison with a change in forest size.
scaled_rfc = RandomForestClassifier(random_state=1, n_estimators=50).fit(X_19_scaled, y_19)

# Predicted labels for the scaled 2020 Q1 features, used by the report below.
y_pred = scaled_rfc.predict(X_20_scaled)

In [None]:
# Classification report comparing the true 2020 Q1 labels with the scaled
# random forest's predictions.
print(classification_report(y_20, y_pred))

# Mean accuracy of the scaled random forest on the train and test sets.
scaled_rf_train_score = scaled_rfc.score(X_19_scaled, y_19)
print(f'Training Score: {scaled_rf_train_score}')
scaled_rf_test_score = scaled_rfc.score(X_20_scaled, y_20)
print(f'Testing Score: {scaled_rf_test_score}')