In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('processed_dataframe.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 41 columns):
 #   Column                                    Non-Null Count  Dtype
---  ------                                    --------------  -----
 0   loan_type                                 1100 non-null   int64
 1   loan_amount                               1100 non-null   int64
 2   action_taken                              1100 non-null   int64
 3   occupancy_type                            1100 non-null   int64
 4   census_tract                              1100 non-null   int64
 5   applicant_ethnicity_1                     1100 non-null   int64
 6   co_applicant_ethnicity_1                  1100 non-null   int64
 7   applicant_race_1                          1100 non-null   int64
 8   applicant_race_2                          1100 non-null   int64
 9   co_applicant_race_1                       1100 non-null   int64
 10  co_applicant_race_2                       1100 non-null   in

In [4]:
df.corr()

Unnamed: 0,loan_type,loan_amount,action_taken,occupancy_type,census_tract,applicant_ethnicity_1,co_applicant_ethnicity_1,applicant_race_1,applicant_race_2,co_applicant_race_1,...,open_end_line_of_credit,manufactured_home_land_property_interest,total_loan_costs,total_points_and_fees,prepayment_penalty_term,negative_amortization,interest_only_payment,balloon_payment,other_nonamortizing_features,multifamily_affordable_units
loan_type,1.0,-0.00946,0.075778,-0.193442,0.029962,0.012631,-0.031599,-0.001935,-0.048259,-0.025998,...,,-0.049147,0.342217,,,,0.03572,,,
loan_amount,-0.00946,1.0,0.076808,-0.017805,-0.081538,0.093086,-0.127134,0.063186,0.019717,-0.130315,...,,0.099371,0.193817,,,,0.070583,,,
action_taken,0.075778,0.076808,1.0,0.030551,-0.00941,0.139385,-0.050211,0.10114,0.000476,-0.042892,...,,-0.047341,0.144069,,,,0.057705,,,
occupancy_type,-0.193442,-0.017805,0.030551,1.0,-0.035416,0.060862,0.021492,0.019558,0.02689,-0.004697,...,,0.030427,-0.085401,,,,0.015218,,,
census_tract,0.029962,-0.081538,-0.00941,-0.035416,1.0,0.028195,0.010626,-0.060126,0.007328,-0.011765,...,,-0.031881,0.108621,,,,-0.033061,,,
applicant_ethnicity_1,0.012631,0.093086,0.139385,0.060862,0.028195,1.0,0.154155,0.458343,-0.007964,0.095576,...,,-0.002711,0.026247,,,,0.009585,,,
co_applicant_ethnicity_1,-0.031599,-0.127134,-0.050211,0.021492,0.010626,0.154155,1.0,0.02978,0.002217,0.932704,...,,-0.027208,-0.126272,,,,-0.006718,,,
applicant_race_1,-0.001935,0.063186,0.10114,0.019558,-0.060126,0.458343,0.02978,1.0,-0.390415,0.150484,...,,-0.002796,0.07376,,,,0.046403,,,
applicant_race_2,-0.048259,0.019717,0.000476,0.02689,0.007328,-0.007964,0.002217,-0.390415,1.0,-0.107949,...,,0.025485,-0.004094,,,,0.007993,,,
co_applicant_race_1,-0.025998,-0.130315,-0.042892,-0.004697,-0.011765,0.095576,0.932704,0.150484,-0.107949,1.0,...,,-0.0234,-0.085753,,,,-0.018818,,,


In [5]:
target_variable = "action_taken"

In [6]:
X = df.drop(columns=[target_variable])
y = df[target_variable]

In [7]:
print(X.shape, y.shape)

(1100, 40) (1100,)


In [8]:
seed = 123

In [9]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs')
cv = RepeatedStratifiedKFold(n_splits = 6, n_repeats = 3, random_state = seed)
n_scores = cross_val_score(model, X, y, scoring = 'accuracy', cv=cv)

print('Mean accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Mean accuracy: 0.560 (0.002)


In [10]:
model.fit(X, y)

In [15]:
import joblib

# Save the model to a file
filename = 'logistic_regression_model.pkl'
joblib.dump(model, filename)
print('Model saved to', filename)

Model saved to logistic_regression_model.pkl


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=seed)

In [12]:
X_test.shape

(220, 40)

In [13]:
X_train.shape

(880, 40)

In [14]:
# 1- loan originated
# 2- application approved but not accepted
# 3- application denied
# 4 - application withdrawn by applicant
# 5 - file closed for incompleteness
# 6 - purchased loan
# 7 - Pre-approval request denied
# 8 - pre-approval request approved but not accepted

y_train.value_counts()

action_taken
6    493
4    160
1    122
5     46
3     44
2     15
Name: count, dtype: int64