In [625]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

## Read the .csv data from the export folder into a Pandas DataFrame.

In [626]:
# Paths to the cleaned yearly loan CSV files in the export folder (2018-2022).
data1 = Path('export/2018_Chase_Loan_Cleaned.csv')
data2 = Path('export/2019_Chase_Loan_Cleaned.csv')
data3 = Path('export/2020_Chase_Loan_Cleaned.csv')
data4 = Path('export/2021_Chase_Loan_Cleaned.csv')
data5 = Path('export/2022_Chase_Loan_Cleaned.csv')

# Only the 2022 file (data5) is read in this cell; data1-data4 are defined but not loaded here.
df = pd.read_csv(data5)

# FINAL VERSION (over 75%):

# Author's notes on the upstream cleaning: uses action_taken results 2, 3, 5, 6, 7, 8
# and drops 1 and 4; only home loan applications - no refinance or cash-out.
# Originally noted as 85% accuracy; the run recorded in the evaluation cells of this
# notebook (2022 file) shows ~84% (0.841).
additional_drops_for_ML = ['activity_year','loan_type','loan_purpose', 'business_or_commercial_purpose', 'state_code']

# Previous attempts (alternative drop lists kept for reference, with the accuracy noted for each):
# 67% / 68%
#additional_drops_for_ML = ['activity_year','loan_type','loan_purpose', 'business_or_commercial_purpose', 'state_code', 'construction_method', 'occupancy_type', 'total_units']

# 66% / 67%
#additional_drops_for_ML = ['activity_year','loan_type','loan_purpose', 'business_or_commercial_purpose', 'state_code', 'construction_method', 'occupancy_type', 'total_units', 'county_code', 'property_value']

# Using the original data source only
# 59%
#additional_drops_for_ML = ['intro_rate_period', 'multifamily_affordable_units', 'prepayment_penalty_term','combined_loan_to_value_ratio','rate_spread', 'total_points_and_fees','co_applicant_race_2','co_applicant_race_3','co_applicant_race_4','co_applicant_race_5','applicant_race_2','applicant_race_3','applicant_race_4','applicant_race_5','co_applicant_ethnicity_2','co_applicant_ethnicity_3','co_applicant_ethnicity_4',
#'co_applicant_ethnicity_5','applicant_ethnicity_2','applicant_ethnicity_3','applicant_ethnicity_4','applicant_ethnicity_5','denial_reason_2', 'denial_reason_3', 'denial_reason_4', 'aus_1', 'aus_2', 'aus_3', 'aus_4', 'aus_5', 'lei', 'activity_year', 'state_code', 'debt_to_income_ratio', 'applicant_age', 'co_applicant_age', 'total_units', 'applicant_age_above_62', 'co_applicant_age_above_62']


# Drop the columns listed in additional_drops_for_ML
# (described by the author as columns that do not have at least 2 values).
df = df.drop(columns=additional_drops_for_ML)

# Review the first rows of the resulting DataFrame
df.head()

Unnamed: 0,action_taken,construction_method,occupancy_type,county_code,loan_amount,income,loan_term,property_value,total_units
0,1,1,1,26099,415000,119000,360,435000,1
1,1,1,1,48113,405000,97000,360,425000,1
2,1,1,1,36047,375000,152000,360,505000,1
3,1,1,1,48201,265000,138000,360,335000,1
4,1,1,1,29183,225000,89000,360,255000,1


In [627]:
# Testing the data to make sure it is not garbage in, garbage out.

#export to CSV
#df.to_csv('export/where_is_bad_data.csv', index=False)

## Create the labels set (y) from the "action_taken" column, and then create the features (X) DataFrame from the remaining columns.

In [628]:
#y is the action taken results approved or denied applications
y = df['action_taken']

#X is the remaining columns
X = df.drop(columns='action_taken')

In [629]:
# Display the labels Series (pandas truncates the repr for long Series)
y

0        1
1        1
2        1
3        1
4        1
        ..
52866    1
52867    1
52868    1
52869    1
52870    1
Name: action_taken, Length: 52871, dtype: int64

In [630]:
# Display the features DataFrame (pandas truncates the repr for long frames)
X

Unnamed: 0,construction_method,occupancy_type,county_code,loan_amount,income,loan_term,property_value,total_units
0,1,1,26099,415000,119000,360,435000,1
1,1,1,48113,405000,97000,360,425000,1
2,1,1,36047,375000,152000,360,505000,1
3,1,1,48201,265000,138000,360,335000,1
4,1,1,29183,225000,89000,360,255000,1
...,...,...,...,...,...,...,...,...
52866,1,1,12057,295000,58000,360,375000,1
52867,1,1,13121,205000,258000,360,385000,1
52868,1,1,4013,545000,310000,360,685000,1
52869,1,1,6085,355000,155000,360,975000,1


## Check the balance of the labels variable (y).

In [631]:
# Confirm that y contains at least two classes and count the rows in each.
# The recorded output shows an imbalance of roughly 80/20 (42306 vs 10565).
y.value_counts()


1    42306
0    10565
Name: action_taken, dtype: int64

## Split the data into training and testing datasets by using train_test_split.

In [632]:
# train_test_split lives in sklearn's model_selection module
from sklearn.model_selection import train_test_split

# Stratify on y so the train and test partitions keep the same label proportions;
# random_state=1 makes the split reproducible. No test_size is passed, so the
# library default applies (the recorded shapes show about 75% of rows in training).
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

# Rows and columns in the training features
X_train.shape

(39653, 8)

## Create a Logistic Regression Model with the Original Data
* Fit a logistic regression model by using the training data (X_train and y_train).

In [633]:
# LogisticRegression comes from sklearn's linear_model module
from sklearn.linear_model import LogisticRegression

# Logistic regression with the lbfgs solver, at most 200 iterations, and a fixed seed.
# NOTE(review): the features are passed unscaled (loan_amount, income and property_value
# are in the hundreds of thousands) and county_code is used as a plain number; this may
# hurt lbfgs convergence - consider checking for a ConvergenceWarning and trying a scaler.
classifier = LogisticRegression(random_state=1, max_iter=200, solver='lbfgs')

# Show the configured (not yet fitted) estimator
classifier

In [634]:
# Fit the logistic regression model on the training features and labels
classifier.fit(X_train, y_train)

## Save the predictions on the testing data labels by using the testing feature data (X_test) and the fitted model.

In [635]:
# Predict a label for every row of the held-out test features
predictions = classifier.predict(X_test)

# Pair each prediction with its true label, row by row. Using the raw values of
# y_test yields a fresh 0..n-1 index, so the original row labels are discarded.
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test.to_numpy()})

# Author's note: keeping vs. dropping the original index was tried to get a plot
# to work - it did not help.
#results = pd.DataFrame({"Prediction": predictions, "Actual": y_test})

# Preview the first ten prediction/actual pairs
results.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


## Evaluate the model’s performance 

In [636]:
# accuracy_score is not in the top-of-notebook import cell, so import it here.
from sklearn.metrics import accuracy_score

# Balanced accuracy = mean of the per-class recalls. With an ~80/20 label split,
# plain accuracy is dominated by the majority class, so report this first.
# (The recorded confusion matrix/report implies a value of roughly 0.61.)
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, predictions)}")

# Plain accuracy on the test set (fraction of correct predictions); kept as the
# cell's displayed value. Compare against the ~0.80 majority-class baseline.
accuracy_score(y_test, predictions)

0.8409744288091996

In [637]:
# Mean accuracy of the fitted classifier on each partition; similar values
# suggest the model is not overfitting the training data.
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.8405921367866239
Testing Data Score: 0.8409744288091996


In [638]:
# Confusion matrix for the test set (confusion_matrix is imported in the first cell).
# Rows are the actual classes and columns the predicted classes, both in sorted
# label order (0, then 1).
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[  564,  2077],
       [   25, 10552]], dtype=int64)

In [639]:
# Per-class precision/recall/F1 for the test set
# (classification_report is imported in the first cell).
# target_names are applied to the sorted labels, so "Approved" labels class 0 and
# "Denied" labels class 1.
# NOTE(review): class 1 is the ~80% majority in y.value_counts(); confirm against the
# data-cleaning step that 0 really encodes "approved" - if 1 means approved, these
# two names are swapped.
target_names = ["Approved", "Denied"]
report = classification_report(y_test, predictions, target_names=target_names)
print(report)

              precision    recall  f1-score   support

    Approved       0.96      0.21      0.35      2641
      Denied       0.84      1.00      0.91     10577

    accuracy                           0.84     13218
   macro avg       0.90      0.61      0.63     13218
weighted avg       0.86      0.84      0.80     13218



## Conclusions here
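* After multiple attempts, the model reached an accuracy score of about 84% (0.841 on the 2022 test split) for predicting the approved/denied status of loans based on the exported, cleaned CSV data files. Note that the majority class makes up roughly 80% of the applications and the recall for the minority class is only 0.21, so the accuracy figure should be read alongside the confusion matrix and classification report above.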

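* Because each data set contains tens of thousands of rows, we did not run any resampled-data tests. Caveat: resampling addresses class imbalance rather than sample size, and the labels here are split roughly 80/20, so oversampling, undersampling, or class weighting may still be worth testing.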