In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import confusion_matrix

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Problem Statement

## Background

"The Home Mortgage Disclosure Act (HMDA) requires many financial institutions to maintain, report, and publicly disclose loan-level information about mortgages. These data help show whether lenders are serving the housing needs of their communities; they give public officials information that helps them make decisions and policies; and they shed light on lending patterns that could be discriminatory. The public data are modified to protect applicant and borrower privacy.

HMDA was originally enacted by Congress in 1975 and is implemented by Regulation C." - [source](https://www.consumerfinance.gov/data-research/hmda/)

## Resources

[Download the Dataset](https://ffiec.cfpb.gov/data-browser/data/2021?category=states&items=OR)

[2021 HMDI Documentation](https://ffiec.cfpb.gov/documentation/2021/)

[2021 data-feild specification](https://ffiec.cfpb.gov/documentation/2021/lar-data-fields/)

[2021 HMDA Guide](https://www.ffiec.gov/hmda/pdf/2021Guide.pdf)


## Design

classifier 1: predict "action taken" column which is either


* Labels
    * Positive
        * code 1 - Loan originated 
        * code 2 - Approved but not accepted 
          (Note: this counts, because we are focused on loan /approval/, more than loans approved-and-accepted)
    * Negative
        * code 3 - Loan Denied 
          (See denail-reason for more information)


* Feature columns
    * Business_or_commercial_purpose
    * loan_to_value_ratio (aka Combined_loan_to_value_ratio)
    * interest_rate
    * hoepa_status
    * loan_term
    * property_value
    * construction_method
    * property_value
    * occupancy_type
    * income
    * debt_to_income_ratio
    * submission_of_application
    * aus-1


* protected classes:
    * race
    * ethnicity
    * gender
    * age
    * tract_minority_population_percent
    * other "tract" columns

# Data Loading

In [3]:
path = '/content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/'

In [4]:
df_2021 = pd.read_csv(path + '2021_state_OR.csv')
df_2020 = pd.read_csv(path + '2020_state_OR.csv')
df_2019 = pd.read_csv(path + '2019_state_OR.csv')
df_2018 = pd.read_csv(path + '2018_state_OR.csv')

  df_2021 = pd.read_csv(path + '2021_state_OR.csv')
  df_2020 = pd.read_csv(path + '2020_state_OR.csv')
  df_2019 = pd.read_csv(path + '2019_state_OR.csv')
  df_2018 = pd.read_csv(path + '2018_state_OR.csv')


In [5]:
print(f"df_2021 = {len(df_2021)}")
print(f"df_2020 = {len(df_2020)}")
print(f"df_2019 = {len(df_2019)}")
print(f"df_2018 = {len(df_2018)}")

df_2021 = 391601
df_2020 = 400029
df_2019 = 265274
df_2018 = 229722


In [6]:
for i, col in enumerate(df_2021.columns):
    
    if(df_2020.columns[i] != col):
        print(f"df_2021[{i:02.0f}] = {col}")
        print(f"df_2020[{i:02.0f}] = {df_2020.columns[i]}")
    if(df_2019.columns[i] != col):
        print(f"df_2021[{i:02.0f}] = {col}")
        print(f"df_2019[{i:02.0f}] = {df_2019.columns[i]}")
    if(df_2018.columns[i] != col):
        print(f"df_2021[{i:02.0f}] = {col}")
        print(f"df_2018[{i:02.0f}] = {df_2018.columns[i]}")

In [7]:
frames = [df_2021, df_2020, df_2019, df_2018]
df_read = pd.concat(frames).reset_index(drop=True)

In [8]:
df = df_read[[
        # Label Field
        'action_taken',

        # Input Fields
        'loan_type',
        'loan_purpose',
        'business_or_commercial_purpose',
        'loan_to_value_ratio',
        'interest_rate',
        'hoepa_status',
        'loan_term',
        'property_value',
        'construction_method',
        'occupancy_type',
        'income',
        'debt_to_income_ratio',
        'submission_of_application',
        'aus-1',

        # Audit Feilds
        'derived_sex',
        'derived_race',
        'derived_ethnicity'
       ]]

In [9]:
df = df.sample(frac=1)

# Data Filtering

In [10]:
df = df.drop(df[df.action_taken > 3].index).reset_index(drop=True)

df = df.drop(df[df.derived_race == 'Free Form Text Only'].index).reset_index(drop=True)
df = df.drop(df[df.derived_race == 'Race Not Available'].index).reset_index(drop=True)

df = df.drop(df[df.derived_ethnicity == 'Free Form Text Only'].index).reset_index(drop=True)
df = df.drop(df[df.derived_ethnicity == 'Ethnicity Not Available'].index).reset_index(drop=True)

df = df.drop(df[df.derived_sex == 'Sex Not Available'].index).reset_index(drop=True)

# Data Preprocessor 

In [11]:
input_features = [
        'loan_type',
        'loan_purpose',
        'business_or_commercial_purpose',
        'loan_to_value_ratio',
        'interest_rate',
        'hoepa_status',
        'loan_term',
        'property_value',
        'construction_method',
        'occupancy_type',
        'income',
        'debt_to_income_ratio',
        'submission_of_application',
        'aus-1'
]

label_features = ['action_taken']

audit_features = [
    'derived_sex',
    'derived_race',
    'derived_ethnicity'
]

In [12]:
df['action_taken'] = df['action_taken'].replace(1, 1)
df['action_taken'] = df['action_taken'].replace(2, 1)
df['action_taken'] = df['action_taken'].replace(3, 0)

df['loan_type'] = df['loan_type'].apply(pd.to_numeric)

df['loan_purpose'] = df['loan_purpose'].replace(31, 6)
df['loan_purpose'] = df['loan_purpose'].replace(32, 7)
df['loan_purpose'] = df['loan_purpose'].fillna(0)
df['loan_purpose'] = df['loan_purpose'].apply(pd.to_numeric)

df['business_or_commercial_purpose'] = df['business_or_commercial_purpose'].replace(1111, 0)
df['business_or_commercial_purpose'] = df['business_or_commercial_purpose'].fillna(0)
df['business_or_commercial_purpose'] = df['business_or_commercial_purpose'].apply(pd.to_numeric)

df['loan_to_value_ratio'] = df['loan_to_value_ratio'].replace('Exempt', 0.0)
df['loan_to_value_ratio'] = df['loan_to_value_ratio'].fillna(0.0)
df['loan_to_value_ratio'] = df['loan_to_value_ratio'].apply(pd.to_numeric, errors='coerce')

df['interest_rate'] = df['interest_rate'].replace('Exempt', -1.0)
df['interest_rate'] = df['interest_rate'].fillna(-1)
df['interest_rate'] = df['interest_rate'].apply(pd.to_numeric, errors='coerce')

df['hoepa_status'] = df['hoepa_status'].fillna(0)
df['hoepa_status'] = df['hoepa_status'].apply(pd.to_numeric, errors='coerce')

df['loan_term'] = df['loan_term'].replace('Exempt', 0)
df['loan_term'] = df['loan_term'].fillna(0)
df['loan_term'] = df['loan_term'].apply(pd.to_numeric, errors='coerce')

df['property_value'] = df['property_value'].replace('Exempt', 0)
df['property_value'] = df['property_value'].fillna(0)
df['property_value'] = df['property_value'].apply(pd.to_numeric, errors='coerce')

df['construction_method'] = df['construction_method'].fillna(0)
df['construction_method'] = df['construction_method'].apply(pd.to_numeric, errors='coerce')

df['income'] = df['income'].fillna(0)
df['occupancy_type'] = df['occupancy_type'].apply(pd.to_numeric, errors='coerce')

df['income'] = df['income'].fillna(0)
df['income'] = df['income'].apply(pd.to_numeric, errors='coerce')

df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('Exempt',   0)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('20%-<30%', 1)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('30%-<36%', 2)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('36',       3)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('37',       4)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('38',       5)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('39',       6)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('40',       7)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('41',       8)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('42',       9)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('43',      10)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('44',      11)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('45',      12)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('46',      13)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('47',      14)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('48',      15)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('49',      16)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('50%-60%', 17)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('<20%',    18)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].replace('>60%',    19)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].fillna(0)
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].apply(pd.to_numeric, errors='coerce')

df['submission_of_application'] = df['submission_of_application'].replace(1111,  0)
df['submission_of_application'] = df['submission_of_application'].fillna(0)
df['submission_of_application'] = df['submission_of_application'].apply(pd.to_numeric, errors='coerce')

df['aus-1'] = df['aus-1'].replace(1111,  0)
df['aus-1'] = df['aus-1'].fillna(0)
df['aus-1'] = df['aus-1'].apply(pd.to_numeric, errors='coerce')

In [13]:
df[['loan_type']]                      = StandardScaler().fit_transform(df[['loan_type']])
df[['loan_purpose']]                   = StandardScaler().fit_transform(df[['loan_purpose']])
df[['business_or_commercial_purpose']] = StandardScaler().fit_transform(df[['business_or_commercial_purpose']])
df[['loan_to_value_ratio']]            = StandardScaler().fit_transform(df[['loan_to_value_ratio']])
df[['interest_rate']]                  = StandardScaler().fit_transform(df[['interest_rate']])
df[['hoepa_status']]                   = StandardScaler().fit_transform(df[['hoepa_status']])
df[['loan_term']]                      = StandardScaler().fit_transform(df[['loan_term']])
df[['property_value']]                 = StandardScaler().fit_transform(df[['property_value']])
df[['construction_method']]            = StandardScaler().fit_transform(df[['construction_method']])
df[['occupancy_type']]                 = StandardScaler().fit_transform(df[['occupancy_type']])
df[['income']]                         = StandardScaler().fit_transform(df[['income']])
df[['debt_to_income_ratio']]           = StandardScaler().fit_transform(df[['debt_to_income_ratio']])
df[['submission_of_application']]      = StandardScaler().fit_transform(df[['submission_of_application']])
df[['aus-1']]                          = StandardScaler().fit_transform(df[['aus-1']])

In [14]:
for feature in input_features:
    print(f"feature {feature:32} mean/std = {df[feature].mean():+02.2f} / {df[feature].std():+02.2f}")

feature loan_type                        mean/std = +0.00 / +1.00
feature loan_purpose                     mean/std = -0.00 / +1.00
feature business_or_commercial_purpose   mean/std = +0.00 / +1.00
feature loan_to_value_ratio              mean/std = +0.00 / +1.00
feature interest_rate                    mean/std = +0.00 / +1.00
feature hoepa_status                     mean/std = +0.00 / +1.00
feature loan_term                        mean/std = +0.00 / +1.00
feature property_value                   mean/std = -0.00 / +1.00
feature construction_method              mean/std = -0.00 / +1.00
feature occupancy_type                   mean/std = -0.00 / +1.00
feature income                           mean/std = -0.00 / +1.00
feature debt_to_income_ratio             mean/std = +0.00 / +1.00
feature submission_of_application        mean/std = -0.00 / +1.00
feature aus-1                            mean/std = -0.00 / +1.00


In [15]:
df.dtypes

action_taken                        int64
loan_type                         float64
loan_purpose                      float64
business_or_commercial_purpose    float64
loan_to_value_ratio               float64
interest_rate                     float64
hoepa_status                      float64
loan_term                         float64
property_value                    float64
construction_method               float64
occupancy_type                    float64
income                            float64
debt_to_income_ratio              float64
submission_of_application         float64
aus-1                             float64
derived_sex                        object
derived_race                       object
derived_ethnicity                  object
dtype: object

In [16]:
filename = path + 'df.csv'
df.to_csv(filename, index=False)
print('Data Saved to:', filename)

Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/df.csv


# Data Splitting

In [17]:
df_input = df[input_features]
df_label = df[label_features]
df_audit = df[audit_features]

In [18]:
filename = path + 'df-input.csv'
df_input.to_csv(filename, index=False)
print('Data Saved to:', filename)

filename = path + 'df-label.csv'
df_label.to_csv(filename, index=False)
print('Data Saved to:', filename)

filename = path + 'df-audit.csv'
df_audit.to_csv(filename, index=False)
print('Data Saved to:', filename)

Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/df-input.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/df-label.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/df-audit.csv


In [19]:
n = len(df)
split_index = int(0.75 * n)
x_train = df_input[:split_index]
x_test  = df_input[split_index+1:]
y_train = df_label[:split_index]
y_test  = df_label[split_index+1:]
z_train = df_audit[:split_index]
z_test  = df_audit[split_index+1:]

print(f"x_train = {x_train.shape}")
print(f"y_train = {y_train.shape}")
print(f"z_train = {z_train.shape}")
print(f"x_test  = {x_test.shape}")
print(f"y_test  = {y_test.shape}")
print(f"z_test  = {z_test.shape}")

x_train = (565132, 14)
y_train = (565132, 1)
z_train = (565132, 3)
x_test  = (188377, 14)
y_test  = (188377, 1)
z_test  = (188377, 3)


In [20]:
filename = path + 'x_train.csv'
x_train.to_csv(filename, index=False)
print('Data Saved to:', filename)

filename = path + 'x_test.csv'
x_test.to_csv(filename, index=False)
print('Data Saved to:', filename)


filename = path + 'y_train.csv'
y_train.to_csv(filename, index=False)
print('Data Saved to:', filename)

filename = path + 'y_test.csv'
y_test.to_csv(filename, index=False)
print('Data Saved to:', filename)


filename = path + 'z_train.csv'
z_train.to_csv(filename, index=False)
print('Data Saved to:', filename)

filename = path + 'z_test.csv'
z_test.to_csv(filename, index=False)
print('Data Saved to:', filename)

Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/x_train.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/x_test.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/y_train.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/y_test.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/z_train.csv
Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/z_test.csv


# Define the Model

In [21]:
def model_training(x, y, feature_selection, epochs, batch_size):
    x_train, x_test = x[0], x[1]
    y_train, y_test = y[0], y[1]

    x_train = x_train[feature_selection].to_numpy()
    x_test  = x_test[feature_selection].to_numpy()
    y_train = y_train.to_numpy()
    y_test  = y_test.to_numpy()

    feature_count = len(feature_selection)

    # define the keras model
    model = Sequential()
    model.add(Dense(8, input_shape=(feature_count,), activation='tanh'))
    model.add(Dense(1, activation='sigmoid'))

    metrics = ['accuracy']

    # compile the keras model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)

    # print the model summary
    print()
    model.summary()
    print()

    # fit the keras model on the dataset
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)

    print()
    y_pred = (model.predict(x_test) > 0.5).astype("int32")
    print(f"y_pred.shape = {y_pred.shape}")
    print()
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    print()
    tn, fp, fn, tp = cm.ravel() # from the docs
    print(f"tp = {tp:8} ~ {tp/cm.sum()*100:06.3f}%")
    print(f"tn = {tn:8} ~ {tn/cm.sum()*100:06.3f}%")
    print(f"fp = {fp:8} ~ {fp/cm.sum()*100:06.3f}%")
    print(f"fn = {fn:8} ~ {fn/cm.sum()*100:06.3f}%")
    print()
    print("Sensitivity (true positive rate) refers to the probability of a positive test, conditioned on truly being positive.")
    print(f"Sensitivity (tpr) = {tp:8} / {tp+fn:8} ~ {tp/(tp+fn)*100:06.3f}%")
    print("Specificity (true negative rate) refers to the probability of a negative test, conditioned on truly being negative.")
    print(f"Specificity (tnr) = {tn:8} / {tn+fp:8} ~ {tn/(tn+fp)*100:06.3f}%")

    return model, y_pred


# Execution

In [22]:
feature_list = [
    'loan_type',
    'loan_purpose',
    'business_or_commercial_purpose',
    'loan_to_value_ratio',
    #'interest_rate',
    'hoepa_status',
    #'loan_term',
    'property_value',
    'construction_method', 
    'occupancy_type',
    'income',
    'debt_to_income_ratio',
    'submission_of_application',
    'aus-1'
]

model, prediction = model_training((x_train, x_test), (y_train, y_test), feature_list, epochs=10, batch_size=256)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 104       
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 113
Trainable params: 113
Non-trainable params: 0
_________________________________________________________________

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

y_pred.shape = (188377, 1)

[[ 25047   2830]
 [  5725 154775]]

tp =   154775 ~ 82.162%
tn =    25047 ~ 13.296%
fp =     2830 ~ 01.502%
fn =     5725 ~ 03.039%

Sensitivity (true positive rate) refers to the probability of a positive test, conditioned on truly being positive.
Sensitivity (tpr) =   154775 /   160500 ~ 96.433%
Specificity (

# Post Processing

In [23]:
df_output = pd.DataFrame()

for column in x_test:
    df_output[column] = x_test[column]

df_output['label_value'] = y_test

df_output['score'] = prediction

for column in z_test:
    df_output[column] = z_test[column]

df_output = df_output.reset_index(drop=True)

In [24]:
filename = path + 'df-output.csv'
df_output.to_csv(filename, index=False)
print('Data Saved to:', filename)

Data Saved to: /content/drive/MyDrive/CS510_CulturalCompetenceInComputing/Final submission/df-output.csv


# Bias Measurements

In [25]:
approved_list = df.action_taken == 1
approval_count = len(df[approved_list].reset_index(drop=True))
approval_rate = approval_count / len(df) * 100
print(f"the overall approval rate is {approval_rate:05.2f}%, which is {approval_count} approvals over {len(df)} applications")

the overall approval rate is 85.23%, which is 642233 approvals over 753510 applications


In [26]:
def approval_rate_by_chosen_field(df, chosen_field, ref):
    field_group = df.groupby(by=chosen_field)
    #print(field_group.size())
    #print(field_group.size().sum())

    group_count = len(field_group)
    print(f"We have {group_count} groups of values in '{chosen_field}'")
    print()

    TP = [None] * group_count
    TN = [None] * group_count
    FP = [None] * group_count
    FN = [None] * group_count
    PP = [None] * group_count
    PN = [None] * group_count
    P  = [None] * group_count
    N  = [None] * group_count

    TPR = [None] * group_count
    FPR = [None] * group_count
    TNR = [None] * group_count
    FNR = [None] * group_count
    FDR = [None] * group_count
    FOR = [None] * group_count
    PPV = [None] * group_count
    NPV = [None] * group_count

    GSR  = [None] * group_count
    PPR  = [None] * group_count
    PPGR = [None] * group_count

    for i, group_name in enumerate(field_group.groups):
        # find the index of 'ref' on our first loop through the groups.
        #   used in the next loop to give disparity rates.
        if group_name == ref: j = i

        y = df[chosen_field] == group_name
        y_pred = df[y]['score'].reset_index(drop=True)
        y_test = df[y]['label_value'].reset_index(drop=True)
        cm = confusion_matrix(y_test, y_pred)

        TN[i], FP[i], FN[i], TP[i] = cm.ravel() # from the docs
        PP[i] = FP[i] + TP[i] # used
        PN[i] = FN[i] + TN[i] # used
        P[i]  = TP[i] + FN[i] # used
        N[i]  = FP[i] + TN[i] # used

        #TPR[i] = TP[i] /  P[i]
        FPR[i] = FP[i] /  N[i] # used
        #TNR[i] = TN[i] /  N[i]
        FNR[i] = FN[i] /  P[i] # used
        FDR[i] = FP[i] / PP[i] # used
        FOR[i] = FN[i] / PN[i] # used
        #PPV[i] = TP[i] / PP[i]
        #NPV[i] = TN[i] / PN[i]

    print('Group Metrics---------------------------------------------------------------------------------------------------------------------------')
    for i, group_name in enumerate(field_group.groups):
        group_size = len(df[df[chosen_field] == group_name])
        GSR[i]  = group_size/len(df)
        PPR[i]  = PP[i]/sum(PP)
        PPGR[i] = PP[i]/group_size

        print(f"{group_name:50}  ", end='')
        print(f"GSR = {GSR[i]:04.2f}  ", end='')
        print(f"PPR = {PPR[i]:04.2f}  ", end='')
        print(f"PPGR = {PPGR[i]:04.2f}  ", end='')
        print(f"FDR = {FDR[i]:04.2f}  ", end='')
        print(f"FPR = {FPR[i]:04.2f}  ", end='')
        print(f"FOR = {FOR[i]:04.2f}  ", end='')
        print(f"FNR = {FNR[i]:04.2f}  ", end='')
        print()

    print('Disparity/Bias Metrics------------------------------------------------------------------------------------------------------------------')
    for i, group_name in enumerate(field_group.groups):
        print(f"{group_name:50}  ", end='')
        print(f"PPR_d  = {PPR[i] / PPR[j]:04.2f}  ", end='')
        print(f"PPGR_d = {PPGR[i] / PPGR[j]:04.2f}  ", end='')
        print(f"FDR_d = {FDR[i] / FDR[j]:04.2f}  ", end='')
        print(f"FPR_d = {FPR[i] / FPR[j]:04.2f}  ", end='')
        print(f"FOR_d = {FOR[i] / FOR[j]:04.2f}  ", end='')
        print(f"FNR_d = {FNR[i] / FNR[j]:04.2f}  ", end='')
        print()


In [27]:
approval_rate_by_chosen_field(df_output, chosen_field = 'derived_sex', ref='Male')

We have 3 groups of values in 'derived_sex'

Group Metrics---------------------------------------------------------------------------------------------------------------------------
Female                                              GSR = 0.22  PPR = 0.21  PPGR = 0.81  FDR = 0.02  FPR = 0.09  FOR = 0.17  FNR = 0.04  
Joint                                               GSR = 0.49  PPR = 0.50  PPGR = 0.86  FDR = 0.02  FPR = 0.11  FOR = 0.21  FNR = 0.03  
Male                                                GSR = 0.30  PPR = 0.29  PPGR = 0.81  FDR = 0.02  FPR = 0.10  FOR = 0.17  FNR = 0.04  
Disparity/Bias Metrics------------------------------------------------------------------------------------------------------------------
Female                                              PPR_d  = 0.74  PPGR_d = 1.01  FDR_d = 0.87  FPR_d = 0.90  FOR_d = 0.99  FNR_d = 0.97  
Joint                                               PPR_d  = 1.76  PPGR_d = 1.07  FDR_d = 0.69  FPR_d = 1.08  FOR_d = 1.21  FNR_

In [28]:
approval_rate_by_chosen_field(df_output, chosen_field = 'derived_race', ref='White')

We have 7 groups of values in 'derived_race'

Group Metrics---------------------------------------------------------------------------------------------------------------------------
2 or more minority races                            GSR = 0.00  PPR = 0.00  PPGR = 0.80  FDR = 0.03  FPR = 0.12  FOR = 0.08  FNR = 0.02  
American Indian or Alaska Native                    GSR = 0.01  PPR = 0.01  PPGR = 0.73  FDR = 0.02  FPR = 0.07  FOR = 0.13  FNR = 0.05  
Asian                                               GSR = 0.05  PPR = 0.05  PPGR = 0.82  FDR = 0.03  FPR = 0.14  FOR = 0.16  FNR = 0.03  
Black or African American                           GSR = 0.01  PPR = 0.01  PPGR = 0.78  FDR = 0.03  FPR = 0.13  FOR = 0.17  FNR = 0.05  
Joint                                               GSR = 0.03  PPR = 0.04  PPGR = 0.86  FDR = 0.02  FPR = 0.10  FOR = 0.19  FNR = 0.03  
Native Hawaiian or Other Pacific Islander           GSR = 0.00  PPR = 0.00  PPGR = 0.71  FDR = 0.02  FPR = 0.05  FOR = 0.11  FN

In [29]:
approval_rate_by_chosen_field(df_output, chosen_field = 'derived_ethnicity', ref='Not Hispanic or Latino')

We have 3 groups of values in 'derived_ethnicity'

Group Metrics---------------------------------------------------------------------------------------------------------------------------
Hispanic or Latino                                  GSR = 0.05  PPR = 0.04  PPGR = 0.78  FDR = 0.02  FPR = 0.09  FOR = 0.14  FNR = 0.04  
Joint                                               GSR = 0.03  PPR = 0.03  PPGR = 0.85  FDR = 0.01  FPR = 0.08  FOR = 0.17  FNR = 0.03  
Not Hispanic or Latino                              GSR = 0.92  PPR = 0.93  PPGR = 0.84  FDR = 0.02  FPR = 0.10  FOR = 0.19  FNR = 0.04  
Disparity/Bias Metrics------------------------------------------------------------------------------------------------------------------
Hispanic or Latino                                  PPR_d  = 0.05  PPGR_d = 0.93  FDR_d = 1.40  FPR_d = 0.90  FOR_d = 0.75  FNR_d = 1.12  
Joint                                               PPR_d  = 0.03  PPGR_d = 1.01  FDR_d = 0.78  FPR_d = 0.83  FOR_d = 0.88

# Report

http://aequitas.dssg.io/audit/tf3owqzf/df_output_with-bias-mitigation/report-1.html