# HMDA Logistic Classifier
## Allen Church
### PPOL 565
#### May 4, 2020

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

In [5]:
# Read only the columns used in this analysis from the HMDA 2018 subset,
# then show how many missing values each of those columns contains.
df = pd.read_csv(
    "hmda_2018_subset.csv",
    usecols=[
        'action_taken',
        'state_code',
        'derived_race',
        'derived_sex',
        'loan_amount',
        'loan_term',
        'income',
        'debt_to_income_ratio',
        'applicant_age',
        'tract_minority_population_percent',
        'ffiec_msa_md_median_family_income',
    ],
)
df.isna().sum()

state_code                             727
derived_race                             0
derived_sex                              0
action_taken                             0
loan_amount                              0
loan_term                              736
income                                5852
debt_to_income_ratio                 16815
applicant_age                            0
tract_minority_population_percent        0
ffiec_msa_md_median_family_income        0
dtype: int64

# Data Cleaning

In [6]:
# Drop every row that has a missing value in any of the loaded columns.
df = df.dropna(axis=0, how='any')

In [7]:
# Keep only applications whose action_taken is 1 (approved) or 3 (denied).
df_apr_deny = df.loc[df["action_taken"].isin([1, 3])]

# Restrict the sample to applicants whose derived race is White or
# Black or African American.
df_apr_deny = df_apr_deny.loc[
    df_apr_deny["derived_race"].isin(['White', 'Black or African American'])
]

# Remove rows marked 'Exempt' in debt_to_income_ratio or loan_term, as these
# are essentially NAs.
# The columns are cast to str before matching: on a mixed-type column,
# `.str.contains(...) == False` evaluates to False for non-string entries
# (their match result is NaN) and would silently drop those rows as well.
# regex=False matches 'Exempt' as a literal substring.
df_apr_deny = df_apr_deny.loc[
    ~df_apr_deny["debt_to_income_ratio"].astype(str).str.contains('Exempt', regex=False)
]
df_apr_deny = df_apr_deny.loc[
    ~df_apr_deny["loan_term"].astype(str).str.contains('Exempt', regex=False)
]

# Label Encoding so that $0$ == Approved, $1$ == Denied

In [8]:
# Fit a label encoder on the outcome column and print the classes it learned;
# the position of each class in that list is the integer code it is mapped to.
lb = LabelEncoder().fit(df_apr_deny['action_taken'])
print(lb.classes_)

# Encoded target vector used by the model.
y = lb.transform(df_apr_deny['action_taken'])

[1 3]


## Creating Feature Matrix

In [9]:
# Feature matrix: every remaining column except the outcome.
X = df_apr_deny.drop(columns=['action_taken'])

In [10]:
# Encode derived_race as integers so that white = 1 and African American = 0.
# Note: this refits the same `lb` encoder that was fitted on action_taken,
# so afterwards `lb.classes_` holds the race labels.
X["derived_race"] = lb.fit_transform(X["derived_race"])

# Expand the multi-class categorical columns into dummy (indicator) variables.
X = pd.get_dummies(
    X,
    columns=[
        "state_code",
        "derived_sex",
        "applicant_age",
    ],
)
X.head()

Unnamed: 0,derived_race,loan_amount,loan_term,income,debt_to_income_ratio,tract_minority_population_percent,ffiec_msa_md_median_family_income,state_code_AK,state_code_AL,state_code_AR,...,derived_sex_Male,derived_sex_Sex Not Available,applicant_age_25-34,applicant_age_35-44,applicant_age_45-54,applicant_age_55-64,applicant_age_65-74,applicant_age_8888,applicant_age_<25,applicant_age_>74
0,1,195000,360,78.0,60,14.04,82400,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1,1,205000,360,64.0,45,11.8,76600,0,0,0,...,1,0,1,0,0,0,0,0,0,0
2,1,105000,361,41.0,55,8.64,74800,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,125000,20,142.0,37,7.71,60500,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,0,275000,360,85.0,41,77.77,75600,0,0,0,...,0,0,0,0,1,0,0,0,0,0


# Logistic Regression

In [11]:
# max_iter is raised from its default of 100 because the fit produced a
# warning message at the default setting.
log_reg = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
)

In [12]:
# Fit the classifier on the complete feature matrix and target.
# NOTE(review): this uses all rows of X and y; no rows are held out from
# this fit.
log_reg.fit(X=X, y=y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
# Split the data: 70% training and 30% test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,  # fixed seed so the same split is produced on every run
)

In [14]:
# 10-fold cross-validation scores computed on the training split.
cross_val_score(estimator=log_reg, X=X_train, y=y_train, cv=10)

array([0.79373952, 0.79541643, 0.79362416, 0.79474273, 0.799217  ,
       0.7852349 , 0.80246223, 0.77224398, 0.79798545, 0.80246223])

# Feature Importance and Coefficient Estimates

In [15]:
# Pair every feature name with its fitted coefficient, then list the ten
# largest estimates. The features are not rescaled in the cells above, so the
# estimates are expressed on each feature's raw scale.
coef = pd.DataFrame(
    {
        'var': X.columns,
        'estimate': log_reg.coef_.ravel(),
    }
)
coef.nlargest(10, 'estimate')

Unnamed: 0,var,estimate
4,debt_to_income_ratio,0.046198
5,tract_minority_population_percent,0.003107
66,applicant_age_55-64,0.000227
59,derived_sex_Female,0.000213
11,state_code_CA,0.000208
61,derived_sex_Male,0.000196
70,applicant_age_>74,0.000185
65,applicant_age_45-54,0.000179
67,applicant_age_65-74,0.000174
16,state_code_FL,0.00013


# Validation

In [16]:
# Mean accuracy of the fitted model, rounded to three decimals.
# NOTE(review): X and y are the same data log_reg.fit was called with, so
# this is an in-sample score rather than a held-out one.
round(log_reg.score(X, y), ndigits=3)

0.792

In [17]:
# Confusion matrix of actual vs. predicted classes (rows: actual, columns: predicted).
# NOTE(review): predictions are made on the same X the model was fit on.
confusion_matrix(y_true=y, y_pred=log_reg.predict(X))

array([[19253,   446],
       [ 4864,   977]])

In [18]:
# Per-class precision, recall and F1 for the fitted model.
# NOTE(review): predictions are made on the same X the model was fit on.
print(classification_report(y_true=y, y_pred=log_reg.predict(X)))

              precision    recall  f1-score   support

           0       0.80      0.98      0.88     19699
           1       0.69      0.17      0.27      5841

    accuracy                           0.79     25540
   macro avg       0.74      0.57      0.57     25540
weighted avg       0.77      0.79      0.74     25540



The confusion matrix shows how often each class was predicted correctly and how often it was confused for the other. On the diagonal of the confusion matrix are the True Negative and True Positive counts, respectively. The off-diagonal entries are misclassification counts. In this case, the overall fraction of correct predictions is given by the sum of correct predictions over the sum of all predictions, $(19253 + 977) / 25540 \approx 0.792$, or $79.2\%$. The confusion matrix above displays $19253$ True Negatives, $4864$ False Negatives, $446$ False Positives, and $977$ True Positives. The results above indicate that the logistic regression model is most likely to make a True Negative classification. Note that these counts come from predictions on the same data the model was fit on, so they describe in-sample performance.