In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Colab-only: mount Google Drive so files under /content/drive are readable/writable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Problem Statement

## Background

"The Home Mortgage Disclosure Act (HMDA) requires many financial institutions to maintain, report, and publicly disclose loan-level information about mortgages. These data help show whether lenders are serving the housing needs of their communities; they give public officials information that helps them make decisions and policies; and they shed light on lending patterns that could be discriminatory. The public data are modified to protect applicant and borrower privacy.

HMDA was originally enacted by Congress in 1975 and is implemented by Regulation C." - [source](https://www.consumerfinance.gov/data-research/hmda/)

## Resources

[Download the Dataset](https://ffiec.cfpb.gov/data-browser/data/2021?category=states&items=OR)

[2021 HMDA Documentation](https://ffiec.cfpb.gov/documentation/2021/)

[2021 data-field specification](https://ffiec.cfpb.gov/documentation/2021/lar-data-fields/)

[2021 HMDA Guide](https://www.ffiec.gov/hmda/pdf/2021Guide.pdf)

## Design

Classifier 1: predict the "action taken" column, which is labeled either positive (approved) or negative (denied) as follows:


* Labels
    * Positive
        * code 1 - Loan originated 
        * code 2 - Approved but not accepted 
          (Note: this counts, because we are focused on loan /approval/, more than loans approved-and-accepted)
    * Negative
        * code 3 - Loan Denied 
          (See denial-reason for more information)


* Feature columns
    * Business_or_commercial_purpose
    * loan_to_value_ratio (aka Combined_loan_to_value_ratio)
    * interest_rate
    * hoepa_status
    * loan_term
    * property_value
    * construction_method
    * occupancy_type
    * income
    * debt_to_income_ratio
    * submission_of_application
    * aus-1


* protected classes:
    * race
    * ethnicity
    * gender
    * age
    * tract_minority_population_percent
    * other "tract" columns

# Data Loading

In [None]:
# Base Drive folder; every CSV read or written by this notebook is addressed as `path + <file name>`.
path = '/content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/'

In [None]:
# Load the full dataset from the Drive folder configured in `path`.
filename = path + 'df.csv'
df = pd.read_csv(filename)
print('Data Loaded from:', filename)

Data Loaded from: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/df.csv


# Data Splitting

In [None]:
# Columns offered to the classifier as model inputs.
input_features = [
    'loan_type',
    'loan_purpose',
    'business_or_commercial_purpose',
    'loan_to_value_ratio',
    'interest_rate',
    'hoepa_status',
    'loan_term',
    'property_value',
    'construction_method',
    'occupancy_type',
    'income',
    'debt_to_income_ratio',
    'submission_of_application',
    'aus-1',
]

# Column holding the target the classifier predicts.
label_features = ['action_taken']

# Protected-attribute columns; kept out of the model inputs and carried along
# separately so predictions can be audited per group afterwards.
audit_features = ['derived_sex', 'derived_race', 'derived_ethnicity']

In [None]:
# Carve the loaded frame into model inputs, prediction target, and audit-only attributes.
df_input = df.loc[:, input_features]
df_label = df.loc[:, label_features]
df_audit = df.loc[:, audit_features]

In [None]:
# Persist each column subset as its own CSV (no index column), reporting every file written.
for csv_name, frame in (
    ('df-input_with-bias-mitigation.csv', df_input),
    ('df-label_with-bias-mitigation.csv', df_label),
    ('df-audit_with-bias-mitigation.csv', df_audit),
):
    filename = path + csv_name
    frame.to_csv(filename, index=False)
    print('Data Saved to:', filename)

Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/df-input_with-bias-mitigation.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/df-label_with-bias-mitigation.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/df-audit_with-bias-mitigation.csv


In [None]:
# Sequential (unshuffled) 75/25 split: the first 75% of rows train, the rest test.
# The test slices must start AT split_index; starting one row later would silently
# drop the row at split_index from both partitions.
n = len(df)
split_index = int(0.75 * n)

x_train, x_test = df_input.iloc[:split_index], df_input.iloc[split_index:]
y_train, y_test = df_label.iloc[:split_index], df_label.iloc[split_index:]
z_train, z_test = df_audit.iloc[:split_index], df_audit.iloc[split_index:]

# Every row must land in exactly one partition.
assert len(x_train) + len(x_test) == n
assert len(y_train) + len(y_test) == n
assert len(z_train) + len(z_test) == n

print(f"x_train = {x_train.shape}")
print(f"y_train = {y_train.shape}")
print(f"z_train = {z_train.shape}")
print(f"x_test  = {x_test.shape}")
print(f"y_test  = {y_test.shape}")
print(f"z_test  = {z_test.shape}")

x_train = (565132, 14)
y_train = (565132, 1)
z_train = (565132, 3)
x_test  = (188377, 14)
y_test  = (188377, 1)
z_test  = (188377, 3)


In [None]:
# Persist every train/test partition as its own CSV (no index column), reporting each file written.
for csv_name, frame in (
    ('x_train_with-bias-mitigation.csv', x_train),
    ('x_test_with-bias-mitigation.csv', x_test),
    ('y_train_with-bias-mitigation.csv', y_train),
    ('y_test_with-bias-mitigation.csv', y_test),
    ('z_train_with-bias-mitigation.csv', z_train),
    ('z_test_with-bias-mitigation.csv', z_test),
):
    filename = path + csv_name
    frame.to_csv(filename, index=False)
    print('Data Saved to:', filename)

Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/x_train_with-bias-mitigation.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/x_test_with-bias-mitigation.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/y_train_with-bias-mitigation.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/y_test_with-bias-mitigation.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/z_train_with-bias-mitigation.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/z_test_with-bias-mitigation.csv


# Define the Model

In [None]:
def model_training(x, y, feature_selection, epochs, batch_size):
    """Train a small binary classifier and print its test-set confusion-matrix report.

    Parameters
    ----------
    x : sequence of two DataFrames
        ``(x_train, x_test)``; only the columns in ``feature_selection`` are used.
    y : sequence of two DataFrames
        ``(y_train, y_test)`` holding the target. The model ends in a sigmoid trained
        with binary cross-entropy, so the labels are expected to be 0/1.
    feature_selection : list of str
        Names of the input columns to train on.
    epochs, batch_size : int
        Passed straight through to ``model.fit``.

    Returns
    -------
    (model, y_pred)
        The fitted Keras model and the test predictions thresholded at 0.5,
        as an int32 array with one column.
    """
    x_train, x_test = x[0], x[1]
    y_train, y_test = y[0], y[1]

    # 'balanced' weights counteract class imbalance in the training labels.
    classes = np.unique(y_train)
    weights = compute_class_weight('balanced', classes=classes, y=y_train.to_numpy().flatten())
    class_weights = dict(zip(classes, weights))
    print(f"class_weights = {class_weights}")

    x_train = x_train[feature_selection].to_numpy()
    x_test  = x_test[feature_selection].to_numpy()
    y_train = y_train.to_numpy()
    y_test  = y_test.to_numpy()

    feature_count = len(feature_selection)

    # define the keras model: one hidden layer, sigmoid output for a binary decision
    model = Sequential()
    model.add(Dense(8, input_shape=(feature_count,), activation='tanh'))
    model.add(Dense(1, activation='sigmoid'))

    metrics = ['accuracy']

    # compile the keras model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)

    # print the model summary
    print()
    model.summary()
    print()

    # fit the keras model on the dataset
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, class_weight=class_weights)

    print()
    y_pred = (model.predict(x_test) > 0.5).astype("int32")
    print(f"y_pred.shape = {y_pred.shape}")
    print()
    # Pin the label order so the matrix is always 2x2, even when the test labels or
    # the predictions happen to contain a single class; otherwise the 4-way unpack
    # below would fail.
    cm = confusion_matrix(y_test.ravel(), y_pred.ravel(), labels=[0, 1])
    print(cm)

    print()
    tn, fp, fn, tp = cm.ravel() # row-major order of a binary matrix: tn, fp, fn, tp
    total = cm.sum()
    print(f"tp = {tp:8} ~ {tp/total*100:06.3f}%")
    print(f"tn = {tn:8} ~ {tn/total*100:06.3f}%")
    print(f"fp = {fp:8} ~ {fp/total*100:06.3f}%")
    print(f"fn = {fn:8} ~ {fn/total*100:06.3f}%")
    print()
    print("Sensitivity (true positive rate) refers to the probability of a positive test, conditioned on truly being positive.")
    print(f"Sensitivity (tpr) = {tp:8} / {tp+fn:8} ~ {tp/(tp+fn)*100:06.3f}%")
    print("Specificity (true negative rate) refers to the probability of a negative test, conditioned on truly being negative.")
    print(f"Specificity (tnr) = {tn:8} / {tn+fp:8} ~ {tn/(tn+fp)*100:06.3f}%")

    return model, y_pred

# Execution

In [None]:
# Features used for this training run: `input_features` minus the two entries commented out below.
# NOTE(review): the reason 'interest_rate' and 'loan_term' are excluded is not recorded here — confirm with the author.
feature_list = [
    'loan_type',
    'loan_purpose',
    'business_or_commercial_purpose',
    'loan_to_value_ratio',
    #'interest_rate',
    'hoepa_status',
    #'loan_term',
    'property_value',
    'construction_method', 
    'occupancy_type',
    'income',
    'debt_to_income_ratio',
    'submission_of_application',
    'aus-1'
]

# Train on the selected features; `prediction` holds the thresholded test-set predictions.
model, prediction = model_training((x_train, x_test), (y_train, y_test), feature_list, epochs=10, batch_size=256)

class_weights = {0: 3.388528462986725, 1: 0.586549259667499}

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 104       
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 113
Trainable params: 113
Non-trainable params: 0
_________________________________________________________________

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

y_pred.shape = (188377, 1)

[[ 27330    558]
 [  9838 150651]]

tp =   150651 ~ 79.973%
tn =    27330 ~ 14.508%
fp =      558 ~ 00.296%
fn =     9838 ~ 05.223%

Sensitivity (true positive rate) refers to the probability of a positive test, conditioned on truly being positive.
Se

# Post Processing

In [None]:
# Assemble the audit table in column order: model inputs, true label, predicted
# label, then the protected attributes; finally renumber the rows from zero.
df_output = x_test.copy()
df_output['label_value'] = y_test
df_output['score'] = prediction
df_output = df_output.join(z_test).reset_index(drop=True)

In [None]:
# Write the assembled audit table to Drive (no index column).
filename = f"{path}df_output_with-bias-mitigation.csv"
df_output.to_csv(filename, index=False)
print('Data Saved to:', filename)

Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/df_output_with-bias-mitigation.csv


# Bias Measurements

In [None]:
# Share of all applications whose recorded outcome is 1, across the full dataset.
approved_list = df['action_taken'].eq(1)
approval_count = int(approved_list.sum())
approval_rate = approval_count / len(df) * 100
print(f"the overall approval rate is {approval_rate:05.2f}%, which is {approval_count} approvals over {len(df)} applications")

the overall approval rate is 85.23%, which is 642233 approvals over 753510 applications


In [None]:
def approval_rate_by_chosen_field(df, chosen_field, ref):
    """Print per-group prediction metrics for ``chosen_field`` and their ratios against ``ref``.

    ``df`` must contain the columns ``chosen_field``, ``label_value`` (true label)
    and ``score`` (predicted label), with labels encoded as 0/1.

    For every distinct value of ``chosen_field`` the report shows:

    * GSR  - group rows / all rows of ``df``
    * PPR  - group's predicted positives / predicted positives summed over all groups
    * PPGR - group's predicted positives / group rows
    * FDR  - FP / predicted positives
    * FPR  - FP / actual negatives
    * FOR  - FN / predicted negatives
    * FNR  - FN / actual positives

    followed by each of PPR, PPGR, FDR, FPR, FOR and FNR divided by the same
    metric of the reference group ``ref``. Ratios with a zero denominator are
    printed as nan/inf rather than raising.

    Raises
    ------
    ValueError
        If ``ref`` is not one of the values found in ``chosen_field``.
    """
    group_names = list(df.groupby(by=chosen_field).groups)

    # Fail fast with a clear message instead of crashing halfway through the
    # report when the reference group does not exist.
    if ref not in group_names:
        raise ValueError(
            f"reference group {ref!r} not found in '{chosen_field}'; "
            f"available groups: {group_names}"
        )
    ref_idx = group_names.index(ref)

    group_count = len(group_names)
    print(f"We have {group_count} groups of values in '{chosen_field}'")
    print()

    total_rows = len(df)
    group_sizes = []
    PP, FPR, FNR, FDR, FOR = [], [], [], [], []

    # Small or single-class groups can produce 0/0 or x/0; report those as nan/inf
    # without emitting numpy warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        for group_name in group_names:
            in_group = df[chosen_field] == group_name
            # Pin the label order so the matrix is always 2x2, even when a group
            # contains only one class in its labels or predictions.
            cm = confusion_matrix(
                df.loc[in_group, 'label_value'],
                df.loc[in_group, 'score'],
                labels=[0, 1],
            )
            tn, fp, fn, tp = cm.ravel()  # row-major order: tn, fp, fn, tp

            predicted_pos = fp + tp
            predicted_neg = fn + tn
            actual_pos = tp + fn
            actual_neg = fp + tn

            group_sizes.append(int(in_group.sum()))
            PP.append(predicted_pos)
            FPR.append(fp / actual_neg)
            FNR.append(fn / actual_pos)
            FDR.append(fp / predicted_pos)
            FOR.append(fn / predicted_neg)

        total_pp = sum(PP)
        GSR = [size / total_rows for size in group_sizes]
        PPR = [pp / total_pp for pp in PP]
        PPGR = [pp / size for pp, size in zip(PP, group_sizes)]

        # Each metric relative to the reference group's value of the same metric.
        disparity = [
            [metric[i] / metric[ref_idx] for metric in (PPR, PPGR, FDR, FPR, FOR, FNR)]
            for i in range(group_count)
        ]

    print('Group Metrics---------------------------------------------------------------------------------------------------------------------------')
    for i, group_name in enumerate(group_names):
        print(
            f"{group_name:50}  "
            f"GSR = {GSR[i]:04.2f}  "
            f"PPR = {PPR[i]:04.2f}  "
            f"PPGR = {PPGR[i]:04.2f}  "
            f"FDR = {FDR[i]:04.2f}  "
            f"FPR = {FPR[i]:04.2f}  "
            f"FOR = {FOR[i]:04.2f}  "
            f"FNR = {FNR[i]:04.2f}  "
        )

    print('Disparity/Bias Metrics------------------------------------------------------------------------------------------------------------------')
    for group_name, (ppr_d, ppgr_d, fdr_d, fpr_d, for_d, fnr_d) in zip(group_names, disparity):
        print(
            f"{group_name:50}  "
            f"PPR_d  = {ppr_d:04.2f}  "
            f"PPGR_d = {ppgr_d:04.2f}  "
            f"FDR_d = {fdr_d:04.2f}  "
            f"FPR_d = {fpr_d:04.2f}  "
            f"FOR_d = {for_d:04.2f}  "
            f"FNR_d = {fnr_d:04.2f}  "
        )


In [None]:
# Audit predictions by sex; disparity ratios are taken relative to the 'Male' group.
approval_rate_by_chosen_field(df_output, chosen_field = 'derived_sex', ref='Male')

We have 3 groups of values in 'derived_sex'

Group Metrics---------------------------------------------------------------------------------------------------------------------------
Female                                              GSR = 0.22  PPR = 0.21  PPGR = 0.78  FDR = 0.00  FPR = 0.02  FOR = 0.24  FNR = 0.06  
Joint                                               GSR = 0.49  PPR = 0.51  PPGR = 0.83  FDR = 0.00  FPR = 0.02  FOR = 0.30  FNR = 0.06  
Male                                                GSR = 0.30  PPR = 0.28  PPGR = 0.77  FDR = 0.00  FPR = 0.02  FOR = 0.24  FNR = 0.07  
Disparity/Bias Metrics------------------------------------------------------------------------------------------------------------------
Female                                              PPR_d  = 0.74  PPGR_d = 1.01  FDR_d = 0.88  FPR_d = 0.92  FOR_d = 1.00  FNR_d = 0.96  
Joint                                               PPR_d  = 1.78  PPGR_d = 1.08  FDR_d = 0.79  FPR_d = 1.25  FOR_d = 1.22  FNR_

In [None]:
# Audit predictions by race; disparity ratios are taken relative to the 'White' group.
approval_rate_by_chosen_field(df_output, chosen_field = 'derived_race', ref='White')

We have 7 groups of values in 'derived_race'

Group Metrics---------------------------------------------------------------------------------------------------------------------------
2 or more minority races                            GSR = 0.00  PPR = 0.00  PPGR = 0.73  FDR = 0.01  FPR = 0.02  FOR = 0.13  FNR = 0.05  
American Indian or Alaska Native                    GSR = 0.01  PPR = 0.01  PPGR = 0.70  FDR = 0.01  FPR = 0.02  FOR = 0.20  FNR = 0.08  
Asian                                               GSR = 0.05  PPR = 0.05  PPGR = 0.79  FDR = 0.01  FPR = 0.03  FOR = 0.24  FNR = 0.06  
Black or African American                           GSR = 0.01  PPR = 0.01  PPGR = 0.73  FDR = 0.01  FPR = 0.03  FOR = 0.24  FNR = 0.08  
Joint                                               GSR = 0.03  PPR = 0.04  PPGR = 0.83  FDR = 0.00  FPR = 0.02  FOR = 0.25  FNR = 0.05  
Native Hawaiian or Other Pacific Islander           GSR = 0.00  PPR = 0.00  PPGR = 0.71  FDR = 0.00  FPR = 0.01  FOR = 0.22  FN

In [None]:
# Audit predictions by ethnicity; disparity ratios are taken relative to the 'Not Hispanic or Latino' group.
approval_rate_by_chosen_field(df_output, chosen_field = 'derived_ethnicity', ref='Not Hispanic or Latino')

We have 3 groups of values in 'derived_ethnicity'

Group Metrics---------------------------------------------------------------------------------------------------------------------------
Hispanic or Latino                                  GSR = 0.05  PPR = 0.04  PPGR = 0.75  FDR = 0.00  FPR = 0.01  FOR = 0.19  FNR = 0.06  
Joint                                               GSR = 0.03  PPR = 0.03  PPGR = 0.82  FDR = 0.00  FPR = 0.01  FOR = 0.25  FNR = 0.05  
Not Hispanic or Latino                              GSR = 0.92  PPR = 0.93  PPGR = 0.81  FDR = 0.00  FPR = 0.02  FOR = 0.27  FNR = 0.06  
Disparity/Bias Metrics------------------------------------------------------------------------------------------------------------------
Hispanic or Latino                                  PPR_d  = 0.05  PPGR_d = 0.93  FDR_d = 0.58  FPR_d = 0.38  FOR_d = 0.69  FNR_d = 0.96  
Joint                                               PPR_d  = 0.03  PPGR_d = 1.01  FDR_d = 0.57  FPR_d = 0.59  FOR_d = 0.91

# Report

http://aequitas.dssg.io/audit/tf3owqzf/df_output_with-bias-mitigation/report-1.html