In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import numpy as np

In [10]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [11]:
#checking the data to validate non-numeric values of targets
print(train_df["target"].value_counts())
print(test_df["target"].value_counts())

high_risk    6090
low_risk     6090
Name: target, dtype: int64
high_risk    2351
low_risk     2351
Name: target, dtype: int64


In [24]:
# Convert categorical data to numeric and separate target feature for training data
# Drop the label to create the X data
train_X_step = train_df.drop('target', axis=1)
test_X_step = test_df.drop('target', axis=1)

# categorical conversion
train_X = pd.get_dummies(train_X_step)
test_X = pd.get_dummies(test_X_step)

#create the targets
train_Y = train_df["target"]
test_Y = test_df["target"]

In [27]:
# add missing dummy variables to testing set
for X in train_X.columns:
    if X not in test_X.columns:
        test_X[X]=0

### Model Consideration

Considering this data is pruned for the purposes of this data set, I would predict that the Logistic Regression would be a better fit. Random Forest would be probably better for the very large, unprocessed data set (I think?).

In [38]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()

classifier.fit(train_X, train_Y)

print(f"Training Data Score: {classifier.score(train_X, train_Y)}")
print(f"Testing Data Score: {classifier.score(test_X, test_Y)}")

Training Data Score: 0.6532019704433497
Testing Data Score: 0.5082943428328371


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state=1, n_estimators=500)

forest.fit(train_X, train_Y)

print(f'Training Score: {forest.score(train_X, train_Y)}')
print(f'Testing Score: {forest.score(test_X, test_Y)}')

Training Score: 1.0
Testing Score: 0.646958740961293


In [39]:
# Scale the data
scaler = StandardScaler()

scaler.fit(train_X)

train_X_scaled = scaler.transform(train_X)
test_X_scaled = scaler.transform(test_X)

In [40]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier = LogisticRegression()

classifier.fit(train_X_scaled, train_Y)

print(f"Training Data Score: {classifier.score(train_X_scaled, train_Y)}")
print(f"Testing Data Score: {classifier.score(test_X_scaled, test_Y)}")

Training Data Score: 0.710919540229885
Testing Data Score: 0.7598894087622289


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
# Train a Random Forest Classifier model on the scaled data and print the model score

forest = RandomForestClassifier(random_state=1, n_estimators=500)

forest.fit(train_X_scaled, train_Y)

print(f'Training Score: {forest.score(train_X_scaled, train_Y)}')
print(f'Testing Score: {forest.score(test_X_scaled, test_Y)}')

Training Score: 1.0
Testing Score: 0.6480221182475542
