# Import dependencies and data

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

# Preprocess data

In [26]:
# Convert categorical data to numeric and separate target feature for training data
from sklearn.preprocessing import LabelEncoder
X_train = train_df.drop('loan_status',axis=1)
y_train = train_df['loan_status']

X_train_dummies = pd.get_dummies(X_train)
y_train_label = LabelEncoder().fit_transform(y_train)
X_train_dummies.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,321143,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,11778,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,0,1,0,1,1,0,1,0,1,0


In [17]:
# Convert categorical data to numeric and separate target feature for testing data
X_test = test_df.drop('loan_status',axis=1)
y_test = test_df['loan_status']

X_test_dummies = pd.get_dummies(X_test)
y_test_label = LabelEncoder().fit_transform(y_test)

In [28]:
# add missing dummy variables to testing set
X_test_dummies['debt_settlement_flag_Y']=0
X_test_dummies.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,...,0,1,0,1,1,0,1,0,1,0
1,25429,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,...,0,1,0,1,1,0,1,0,1,0
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,0,1,0,1,1,0,1,0,1,0
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,0,1,0,1,1,0,1,0,1,0
4,37505,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,...,0,1,0,1,1,0,1,0,1,0


# Unscaled Models

### Prediction:
I believe the Random Forests classifier will do a better job because of the large number of features, as it is a tree-based algorithm. Logistic Regression is better with fewer features because it is still attempting to draw a single separation between a binary result.

### Logistic Regression Model

In [29]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train_dummies,y_train_label)
classifier.score(X_train_dummies,y_train_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6485221674876848

### Random Forest Model

In [30]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1,n_estimators=50).fit(X_train_dummies,y_train_label)
clf.score(X_train_dummies,y_train_label)

0.9999178981937603

### Conclusion:
The Logistic Regression model did not do a good job at sorting the data, probably due to the large amount of features. The Random Forest Model did an almost perfect job of sorting the data, due to its nature of creating many simple decision trees rather than one split of a high deminsionality dataset.

# Scale

In [31]:
# Scale the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train_dummies)
X_train_scaled = scaler.transform(X_train_dummies)

# Scaled Models

### Prediction:
The scaling will affect Logistic Regression because it is distance based. Random Forest is not. It is based on multiple decision trees, which don't have a deminsionality component.

### Logistic Regression Model

In [32]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier = LogisticRegression()
classifier.fit(X_train_scaled,y_train_label)
classifier.score(X_train_scaled,y_train_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.713136288998358

### Random Forest Model

In [33]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf = RandomForestClassifier(random_state=1,n_estimators=50).fit(X_train_scaled,y_train_label)
clf.score(X_train_scaled,y_train_label)

0.9999178981937603

### Conclusion:
The prediction turned out to be correct, with the Logistic Regression model improving and the Random Forest Model staying the same. The Logistic Regression Model still is not very accurate, while the Random Forest Model was almost perfect to begin with, and stayed that way after the scaling. PCA would help immensely with the Logistic Regression Model.