In [2]:
import pandas as pd
import numpy as np
%matplotlib inline

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# Fairness metrics
def CohenD(yobs, ypred, gmaj, gmin):
    ### Cohen-D
    SR_min = ypred[gmin==1].mean() # success rate minority
    SR_maj = ypred[gmaj==1].mean() # success rate majority
    
    STD_maj = np.sqrt(SR_maj * (1.0 - SR_maj))
    STD_min = np.sqrt(SR_min * (1.0 - SR_min))
    POOL_STD = STD_maj * ( sum(gmaj==1)/(sum(gmin==1) + sum(gmaj==1)) ) + STD_min * ( sum(gmin==1)/(sum(gmin==1) + sum(gmaj==1)) )
    
    return StatParity(yobs, ypred, gmaj, gmin)/POOL_STD

def DispImpact(yobs, ypred, gmaj, gmin):
    ### Disparate Impact (a.k.a. Adverse Impact Ratio)
    SR_min = ypred[gmin==1].mean() # success rate minority
    SR_maj = ypred[gmaj==1].mean() # success rate majority
    return SR_min/SR_maj

def StatParity(yobs, ypred, gmaj, gmin):
    ### Statistical Parity Difference
    SR_min = ypred[gmin==1].mean() # success rate minority
    SR_maj = ypred[gmaj==1].mean() # success rate majority
    return SR_min - SR_maj

def TwoSDRule(yobs, ypred, gmaj, gmin):
    ### 2-SD Rule
    SR_min = ypred[gmin==1].mean() # success rate minority
    SR_maj = ypred[gmaj==1].mean() # success rate majority
    SR_T = ypred.mean() # success rate total
    P_min = (gmin==1).mean() # minority proportion
    N = len(ypred)
    return (SR_min - SR_maj)/np.sqrt( (SR_T * (1.0 - SR_T))/(N * P_min * (1 - P_min)) )

def EqualOppDiff(yobs, ypred, gmaj, gmin):
    ### Equal Opportunity Difference
    TPR_maj = sum( (yobs[gmaj==1]==1) * (ypred[gmaj==1]==1) )/sum(yobs[gmaj==1]==1)
    TPR_min = sum( (yobs[gmin==1]==1) * (ypred[gmin==1]==1) )/sum(yobs[gmin==1]==1)
    return TPR_min - TPR_maj

def AvgOddsDiff(yobs, ypred, gmaj, gmin):
    ### Average Odds Difference
    return (EqualOppDiff(yobs==0, ypred==0, gmaj, gmin) + EqualOppDiff(yobs, ypred, gmaj, gmin))/2.0


def compute_model_metrics(yobs, model, Xobs, gmaj=None, gmin=None):
    # metrics
    from sklearn import metrics
    perf_metrics = {"Accuracy": metrics.accuracy_score, 
                    "Precision": metrics.precision_score, 
                    "Recall": metrics.recall_score,
                    "AUC": metrics.roc_auc_score, 
                    "F1-Score": metrics.f1_score, 
                    "Brier": metrics.brier_score_loss
                   }
    # fairness metrics
    fair_metrics = {"Cohen-D": CohenD,
                    "2-SD Rule": TwoSDRule,
                    "StatParity": StatParity,
                    "EqualOppDiff": EqualOppDiff,
                    "DispImpact": DispImpact,
                    "AvgOddsDiff": AvgOddsDiff
                   }
    
    # get predictions -- where you would start, after loading the data and model
    ypred_prob = model.predict_proba(Xobs).ravel()[1::2] # get probabilities
    ypred_class = model.predict(Xobs)
    
    # compute performance metrics
    metrics = []
    for pf in perf_metrics.keys():
        if pf in ["AUC", "Brier"]:
            metrics += [[pf, perf_metrics[pf](yobs, ypred_prob)]]
        else:
            metrics += [[pf, perf_metrics[pf](yobs, ypred_class)]]
            
    if (gmaj is not None) and (gmin is not None):
        for ff in fair_metrics.keys():
            metrics += [[ff, fair_metrics[ff](yobs, ypred_class, gmaj, gmin)]]
    
    return pd.DataFrame(metrics, columns=["Metric", "Value"])

### Fetch Data

In [4]:
df = pd.read_pickle("mortgage_data_balanced.pkl.gz")
keep_vars = ['respondent_id', 'as_of_year', 'agency_abbr', 'loan_type_name', 'loan_amount_000s', 'owner_occupancy_name',
             'loan_purpose_name', 'property_type_name', 'preapproval_name', 'msamd_name', 'state_abbr', 'county_name',
             'applicant_ethnicity_name', 'co_applicant_ethnicity_name', 'applicant_race_name_1', 'co_applicant_race_name_1',
             'applicant_sex_name', 'co_applicant_sex_name', 'applicant_income_000s', 'purchaser_type_name', 
             'denial_reason_name_1', 'hoepa_status_name', 'lien_status_name', 'population', 'minority_population',
             'hud_median_family_income', 'tract_to_msamd_income', 'number_of_owner_occupied_units', 
             'number_of_1_to_4_family_units', 'action_taken_name']

df = df[keep_vars].copy()

### Organize Features

In [5]:
# categorical variables
cat_variables = cat_variables = ['applicant_ethnicity_name', 'applicant_race_name_1', 'applicant_sex_name', 'agency_abbr',
                                 'owner_occupancy_name', 'property_type_name', 'loan_purpose_name', 'loan_type_name']

# other integer variables
int_variables = ['loan_amount_000s', 'applicant_income_000s', 'population', 'minority_population', 
                 'hud_median_family_income', 'tract_to_msamd_income', 'number_of_owner_occupied_units', 
                 'number_of_1_to_4_family_units']

# target variable
output_variable = ['action_taken_name']

### Pre-processing
# Mapping categorical variables to one-hot encoding
df_cat = pd.DataFrame(index=df.index)

# one-hot encoding of categorical variables
from sklearn.preprocessing import OneHotEncoder

# I will do a loop for pedagogical reasons, but it is not entirely necessary
for cat in cat_variables:
    # one-hot encoding fitting
    one_hot_func = OneHotEncoder().fit(df[[cat]])
    
    # mapping
    cat_mapped = one_hot_func.transform(df[[cat]]).toarray()
    
    # storing
    for (k, cat_label) in enumerate(one_hot_func.categories_[0]):
        df_cat[cat + "_" + cat_label] = cat_mapped[:, k]

# consolidating a final dataset
X = pd.concat([df[int_variables], df_cat], axis=1)
y = (df[output_variable] == "Application denied by financial institution").copy()

# removing race, ethnicity, sex and minority population from X
X_ub = X[[
    'loan_amount_000s', 'applicant_income_000s', 'population', 'hud_median_family_income', 'tract_to_msamd_income', 
    'number_of_owner_occupied_units', 'number_of_1_to_4_family_units', 'agency_abbr_CFPB', 'agency_abbr_FDIC',
       'agency_abbr_FRS', 'agency_abbr_HUD', 'agency_abbr_NCUA',
       'agency_abbr_OCC', 'owner_occupancy_name_Not applicable',
       'owner_occupancy_name_Not owner-occupied as a principal dwelling',
       'owner_occupancy_name_Owner-occupied as a principal dwelling',
       'property_type_name_Manufactured housing',
       'property_type_name_One-to-four family dwelling (other than manufactured housing)',
       'loan_purpose_name_Home improvement', 'loan_purpose_name_Home purchase',
       'loan_purpose_name_Refinancing', 'loan_type_name_Conventional',
       'loan_type_name_FHA-insured', 'loan_type_name_FSA/RHS-guaranteed',
       'loan_type_name_VA-guaranteed'
]].copy()

# protected attributtes
X_groups = X[[
    'minority_population', 'applicant_ethnicity_name_Hispanic or Latino',
       'applicant_ethnicity_name_Not Hispanic or Latino',
       'applicant_race_name_1_American Indian or Alaska Native',
       'applicant_race_name_1_Asian',
       'applicant_race_name_1_Black or African American',
       'applicant_race_name_1_Native Hawaiian or Other Pacific Islander',
       'applicant_race_name_1_White', 'applicant_sex_name_Female',
       'applicant_sex_name_Male'
]].copy()


In [6]:
X.describe()

Unnamed: 0,loan_amount_000s,applicant_income_000s,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,applicant_ethnicity_name_Hispanic or Latino,applicant_ethnicity_name_Not Hispanic or Latino,...,owner_occupancy_name_Owner-occupied as a principal dwelling,property_type_name_Manufactured housing,property_type_name_One-to-four family dwelling (other than manufactured housing),loan_purpose_name_Home improvement,loan_purpose_name_Home purchase,loan_purpose_name_Refinancing,loan_type_name_Conventional,loan_type_name_FHA-insured,loan_type_name_FSA/RHS-guaranteed,loan_type_name_VA-guaranteed
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,...,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,190.134855,103.01879,5786.08478,21.509732,69383.415,116.387654,1616.858665,2074.20092,0.06659,0.93341,...,0.89245,0.030215,0.969785,0.071035,0.288325,0.64064,0.79747,0.15313,0.01401,0.03539
std,181.118295,176.026034,2955.144734,22.01557,13758.824167,38.340362,911.977366,1113.748324,0.249311,0.249311,...,0.309812,0.171179,0.171179,0.256884,0.452984,0.479814,0.401886,0.360114,0.117532,0.184764
min,1.0,1.0,19.0,0.23,16300.0,12.03,5.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,88.0,45.0,3847.0,6.03,60600.0,92.18,1029.0,1362.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,149.0,73.0,5223.0,13.47,66600.0,109.559998,1452.0,1844.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,244.0,118.0,6994.0,28.219999,77000.0,133.009995,1994.0,2497.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
max,14500.0,9999.0,34055.0,100.0,111900.0,430.029999,9880.0,10724.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Train and test LR

In [25]:
# set metrics
# performance metrics
gmaj = X[['applicant_sex_name_Male']]
gmin = X[['applicant_sex_name_Female']]

# train model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
lr = LogisticRegression(random_state=10, class_weight="balanced", solver="lbfgs")
mv = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
k, i = True, 1

for (train, test) in mv.split(X_ub, y):
    # fit model
    lr = lr.fit(X_ub.iloc[train], y.iloc[train].values.ravel())
    
    # compute metrics
    df_m = compute_model_metrics(y.iloc[test].values.ravel(), lr, X_ub.iloc[test], gmaj.iloc[test].values.ravel(), gmin.iloc[test].values.ravel())
    df_m["Fold"] = i
    i += 1
    if k:
        df_metrics = df_m.copy()
        k=0
    else:
        df_metrics = pd.concat([df_metrics, df_m.copy()], axis=0, ignore_index=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [26]:
df_metrics.pivot_table(index="Metric", aggfunc="mean")

Unnamed: 0_level_0,Fold,Value
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
2-SD Rule,5.5,14.26242
AUC,5.5,0.575311
Accuracy,5.5,0.556135
AvgOddsDiff,5.5,0.001847
Brier,5.5,0.247146
Cohen-D,5.5,0.222482
DispImpact,5.5,1.278646
EqualOppDiff,5.5,0.104052
F1-Score,5.5,0.520108
Precision,5.5,0.566093
