In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))


In [3]:
del train_df['Unnamed: 0']

In [4]:
del train_df['index']

In [5]:
del test_df['Unnamed: 0']

In [6]:
del test_df['index']

In [7]:
# Convert categorical data to numeric and separate target feature for training data
train_df = pd.get_dummies(train_df, columns=['home_ownership','verification_status',
                                             'application_type','hardship_flag',
                                             'debt_settlement_flag','initial_list_status',
                                             'pymnt_plan','loan_status'])

In [8]:
# Convert categorical data to numeric and separate target feature for testing data
test_df = pd.get_dummies(test_df, columns=['home_ownership','verification_status',
                                           'application_type','hardship_flag',
                                           'debt_settlement_flag','initial_list_status',
                                           'pymnt_plan','loan_status'])

In [9]:
del train_df['loan_status_low_risk']
del test_df['loan_status_low_risk']

In [10]:
train_df=train_df.rename(columns={'loan_status_high_risk':'loan_status'})

In [11]:
test_df =test_df.rename(columns={'loan_status_high_risk':'loan_status'})

In [12]:
# add missing dummy variables to testing set
test_df['debt_settlement_flag_Y'] = 0

In [13]:
y_train = train_df['loan_status']
X_train = train_df.drop('loan_status', axis=1)

In [14]:

y_test = test_df['loan_status']
X_test = test_df.drop('loan_status', axis=1)

In [15]:
# Train the Logistic Regression model on the unscaled data and print the model score
unscaled_lr_classifier = LogisticRegression()
unscaled_lr_classifier.fit(X_train, y_train)


# Train a Random Forest Classifier model and print the model score
unscaled_rf_clf = RandomForestClassifier(random_state=1, n_estimators=3).fit(X_train, y_train)


print(f"Testing Data Score for unscaled LR: {unscaled_lr_classifier.score(X_test, y_test)}")
print(f'Testing Score for unscaled RF: {unscaled_rf_clf.score(X_test, y_test)}')

Testing Data Score for unscaled LR: 0.5165886856656742
Testing Score for unscaled RF: 0.5857082092726499


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Logistic Regression model on the scaled data and print the model score
scaled_lr_clf = LogisticRegression().fit(X_train_scaled, y_train)

# Train a Random Forest Classifier model on the scaled data and print the model score
scaled_rf_2 = RandomForestClassifier(random_state=1, n_estimators=3).fit(X_train_scaled, y_train)


print(f'Testing Score for scaled LR: {scaled_lr_clf.score(X_test_scaled, y_test)}')
print(f'Testing Score for scaled RF: {scaled_rf_2.score(X_test_scaled, y_test)}')

Testing Score for scaled LR: 0.7501063377286261
Testing Score for scaled RF: 0.5850701829008932


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:

# Use SelectFromModel to only get the important features
sel = SelectFromModel(scaled_rf_2)
sel.fit(X_train_scaled, y_train)

# Scale the data
X_selected_train = sel.transform(X_train)
X_selected_test = sel.transform(X_test)
scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)

# Train the Logistic Regression model on the selected scaled data and print the model score
selected_lr = LogisticRegression().fit(X_selected_train_scaled, y_train)

# Train a Random Forest Classifier model on the selected scaled data and print the model score
selected_rf = RandomForestClassifier(random_state=1,n_estimators=3).fit(X_selected_train_scaled, y_train)

print(f'Testing Score for selected scaled LR: {selected_lr.score(X_selected_test_scaled, y_test)}')
print(f'Testing Score for selected scaled RF: {selected_rf.score(X_selected_test_scaled, y_test)}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Testing Score for selected scaled LR: 0.7822203317737133
Testing Score for selected scaled RF: 0.5580603998298597


Logistic Regression is a better model fit for the data and is also sensitive when the data is scaled and when important features are selected. For Random Forest there is no visible change once the data is scaled and improtant features are selected.
