In [None]:
import pandas as pd
from data_loader import data_loader
from model_helper import compute_cost, get_tuned_gamma, train, tune_lambda, plot_lambda_tuning
import scipy.optimize as opt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load the working frame through the project's loader.
# NOTE(review): `num=1000` presumably caps how many records are read — confirm in data_loader.
df = data_loader(
    num=1000,
)

In [None]:
# Columns kept as model inputs; preprocess() selects exactly these, in this order.
features = [
    'loan_amount_000s',
    'loan_type',
    'owner_occupancy',
    'property_type',
    'applicant_income_000s',
    'purchaser_type',
    'hud_median_family_income',
    'tract_to_msamd_income',
    'number_of_owner_occupied_units',
    'number_of_1_to_4_family_units',
    'race_ethnicity',
    'state_code',
    'county_code',
    'applicants_joined_sex',
    'minority_population',
    'lien_status',
]

In [None]:
def preprocess(df):
    """Turn the raw loan frame into standardized train/test matrices.

    Steps, in order:
      1. keep rows whose ``applicant_sex`` is 1 or 2 and whose
         ``race_ethnicity`` is 3 or 5;
      2. recode ``action_taken`` ('Approved' -> 1, 'Denied' -> 0),
         ``applicant_sex`` (2 -> 0) and ``race_ethnicity`` (3 -> 1, 5 -> 0);
      3. split 80/20 with ``random_state=42``;
      4. select the module-level ``features`` columns;
      5. fill NaNs in both splits with the training-set column medians;
      6. standardize with a ``StandardScaler`` fitted on the training split.

    All recoding is done on a copy, so the caller's ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``action_taken``, ``applicant_sex``, ``race_ethnicity``
        and every column listed in ``features``.

    Returns
    -------
    x_train, x_test
        Scaled feature matrices as returned by the scaler.
    y_train, y_test
        The recoded ``action_taken`` column for each split.
    train_groups, test_groups
        Two-column arrays: recoded ``applicant_sex`` and ``race_ethnicity``.
    """
    print(df.shape)

    # Filter first and take an explicit copy: the recoding below then never
    # touches the caller's frame and never writes into a slice (which would
    # raise SettingWithCopyWarning, or silently do nothing under pandas
    # Copy-on-Write).
    keep = df['applicant_sex'].isin([1, 2]) & df['race_ethnicity'].isin([3, 5])
    df = df.loc[keep].copy()

    df['action_taken'] = df['action_taken'].replace({'Approved': 1, 'Denied': 0})
    df['applicant_sex'] = df['applicant_sex'].replace({2: 0})  # men 1, women 0
    df['race_ethnicity'] = df['race_ethnicity'].replace({3: 1, 5: 0})  # black 1, white 0
    print(df.shape)

    x_train, x_test, y_train, y_test = train_test_split(
        df, df['action_taken'], test_size=0.2, random_state=42
    )

    # Capture the group columns before the frames are narrowed to `features`
    # ('applicant_sex' is not one of the feature columns).
    train_groups = np.column_stack([
        x_train['applicant_sex'].to_numpy(),
        x_train['race_ethnicity'].to_numpy()
    ])
    test_groups = np.column_stack([
        x_test['applicant_sex'].to_numpy(),
        x_test['race_ethnicity'].to_numpy()
    ])

    # Keep only the model input columns.
    print(x_train.shape)
    x_train = x_train[features]
    x_test = x_test[features]
    print(x_train.shape)

    # print shapes (the second label used to say "y_train" while printing x_test.shape)
    print("x_train shape: ", x_train.shape)
    print("x_test shape: ", x_test.shape)

    # Impute NaNs in BOTH splits with medians computed once from the training
    # split, so no statistic is derived from the test data.
    train_medians = x_train.median()
    x_train = x_train.fillna(train_medians)
    x_test = x_test.fillna(train_medians)

    # Standardize: fit on train only, then apply the same transform to test.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    return x_train, x_test, y_train, y_test, train_groups, test_groups

# Run the preprocessing pipeline on the loaded frame and unpack its six outputs.
(
    x_train, x_test,
    y_train, y_test,
    train_groups, test_groups,
) = preprocess(df)

In [None]:
# Random vector with one entry per feature column (not passed to the call below).
betas = np.random.rand(x_train.shape[1])
# Candidate gammas: 10 evenly spaced values from 0.1 to 1 inclusive.
gammas = np.linspace(0.1, 1, num=10)
best_gamma = get_tuned_gamma(
    gammas,
    x_train,
    y_train,
    num_folds=5,
    verbose=False,
)

In [None]:
# Baseline run using the 'NO l2' setting and the previously selected gamma.
# NOTE(review): how train() interprets fair_loss_='NO l2' together with
# lambda_val=1 is not visible in this notebook — confirm in model_helper.
betas = np.random.rand(x_train.shape[1])
_lambda = None
fair_loss_ = 'NO l2'

unfair_preds = train(
    x_train, y_train, x_test, y_test,
    train_groups, fair_loss_, best_gamma,
    lambda_val=1,
)


In [None]:
# Unfair baseline: fairness loss switched off, trained with the tuned gamma.
betas = np.random.rand(x_train.shape[1])
_lambda = None
fair_loss_ = False

# Argument order is (..., train_groups, fair_loss_, best_gamma), matching the
# other train() calls in this notebook. Previously the last two were swapped,
# which would hand the gamma value to the fair-loss slot and False to gamma.
# NOTE(review): train()'s signature is not visible here — confirm in model_helper.
unfair_preds = train(x_train, y_train, x_test, y_test, train_groups, fair_loss_, best_gamma)

In [None]:
# Run tune_lambda with the fairness loss enabled and the previously selected
# gamma; the returned metrics are plotted further down.
fair_loss_ = True
performance_metrics = tune_lambda(
    x_train, y_train,
    test_groups, train_groups,
    x_test, y_test,
    fair_loss_, best_gamma,
)

In [None]:
# Fair model: same data as the baseline, but with the fairness loss enabled.
betas = np.random.rand(x_train.shape[1])
_lambda = None
fair_loss_ = True

# Stored under its own name so the baseline's `unfair_preds` is not overwritten
# by the fair model's output.
# NOTE(review): neither best_gamma nor a tuned lambda is passed here, so train()
# falls back to whatever defaults it declares — confirm that is intended.
fair_preds = train(x_train, y_train, x_test, y_test, train_groups, fair_loss_)

In [None]:
# Sanity check: shapes of every array produced by preprocess().
# (`groups` was never defined in this notebook; the group arrays are
# `train_groups` and `test_groups`.)
print("x_train shape: ", x_train.shape)
print("y_train shape: ", y_train.shape)
print("x_test shape: ", x_test.shape)
print("y_test shape: ", y_test.shape)
print("train_groups shape: ", train_groups.shape)
print("test_groups shape: ", test_groups.shape)


In [None]:
# Lambda values handed to the plotting helper alongside the tuning metrics.
# NOTE(review): presumably these must match the grid tune_lambda() searched — verify.
lambda_vals = [
    0.001, 0.005, 0.01,
    0.05, 0.1, 1,
]
plot_lambda_tuning(performance_metrics, lambda_vals)
