# Mortgage Lending Fair Feature Selection

In [1]:
# Using conda environment "fairfs" (Python 3.8)
import pandas as pd
import numpy as np
from tqdm import tqdm
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn import metrics, model_selection, pipeline, preprocessing
from sklearn import linear_model, naive_bayes, tree
from sklearn.ensemble import RandomForestClassifier

#import dataset_loader
#import fairfs
import unfairness_metrics

In [2]:
# Name of the feature column whose values define the protected groups.
# This notebook uses 'derived_race'; the original author's note lists other
# choices: 'group' for simulated data, 'sex_Female' for adult, 'rural' for
# other datasets.
PROTECTED_COLUMN = 'derived_race'
# Number of repeated cross-validation runs per experiment; run_experiment
# seeds its KFold shuffle with the iteration number.
ITERATIONS = 100
# Accuracy function handed to make_scorer and to the combined
# accuracy/unfairness metric used for feature selection.
ACCURACY_METRIC = metrics.roc_auc_score

In [3]:
# Read the HMDA mortgage extract, then separate the target column
# ('action_taken') from the remaining feature columns.
df = pd.read_csv('hmda_mortgage_derived_race_ffs.csv', header=0)
y = df['action_taken']
X = df.drop(columns='action_taken')

In [4]:
# Show the dtype of every feature column (the recorded output lists only
# int64 and float64 columns).
X.dtypes

income                               float64
debt_to_income_ratio                 float64
combined_loan_to_value_ratio         float64
loan_amount                            int64
derived_ethnicity                      int64
derived_race                           int64
derived_sex                            int64
applicant_age                          int64
preapproval                            int64
loan_type                              int64
loan_purpose                           int64
interest_only_payment                  int64
balloon_payment                        int64
loan_term                              int64
property_value                       float64
state_code                             int64
county_code                          float64
tract_minority_population_percent    float64
tract_population                       int64
tract_to_msa_income_percentage       float64
tract_median_age_of_housing_units      int64
ffiec_msa_md_median_family_income      int64
dtype: object

In [5]:
def get_mortgage_data(path='hmda_mortgage_derived_race_ffs.csv', label_column='action_taken'):
    """Load the mortgage CSV and wrap it in the dataset-dict layout used by the experiments.

    Args:
        path: CSV file to read; its first row is used as the header. Defaults to
            the HMDA extract this notebook was written for.
        label_column: Name of the column holding the target labels. Every other
            column in the file is treated as a feature.

    Returns:
        A dict with the single key 'mortgage_data', whose value is a dict with:
            'data': 2-D ndarray of feature values (``X.values``),
            'labels': 1-D ndarray of the label column's values,
            'participant_ids': ``np.arange(len(df))``, one id per row,
            'feature_names': ndarray of feature column names, in the same
                order as the columns of 'data'.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        KeyError: if ``label_column`` is not a column of the file.
    """
    df = pd.read_csv(path, header=0)
    X = df.drop(columns=label_column)
    y = df[label_column]
    return {
        'mortgage_data': {
            'data': X.values,
            'labels': y.values,
            'participant_ids': np.arange(0, len(df)),
            # Taken from X itself so the names cannot drift from the data columns;
            # going through a list keeps the string dtype the experiment code
            # compares against.
            'feature_names': np.array(X.columns.tolist()),
        }
    }

In [6]:
def run_experiment(X, y, clf, protected_groups, unfairness_metric, unfairness_weight,
                   iterations=None):
    """Repeatedly cross-validate a fairness-aware feature-selection pipeline.

    Each iteration builds a 4-fold shuffled KFold seeded with the iteration
    number and cross-validates a pipeline of StandardScaler ->
    SequentialFeatureSelector -> ``clf``. The selector is scored with a
    CombinedMetric built from ACCURACY_METRIC and the unfairness metric
    (per the original comment: accuracy minus weighted unfairness w.r.t. the
    protected groups). The same KFold object is used for the selector's inner
    CV and for the outer cross-validation.

    Args:
        X: 2-D feature matrix (``X.shape[1]`` is the number of features).
        y: Labels, passed straight to ``cross_validate``.
        clf: Estimator used both inside the feature selector and as the final
            model of the pipeline.
        protected_groups: Group membership handed to the unfairness metrics.
            Presumably aligned with the rows of X; confirm against
            ``unfairness_metrics``.
        unfairness_metric: Metric identifier understood by
            ``unfairness_metrics.UnfairnessMetric`` / ``CombinedMetric``.
        unfairness_weight: Weight of the unfairness term in the combined
            feature-selection score.
        iterations: Number of repeated runs. ``None`` (the default) uses the
            module-level ``ITERATIONS``, which is the original behaviour; pass
            a small number for a quick smoke run.

    Returns:
        Tuple ``(unfairness_means, auc_means, selected_feature_props)``:
            unfairness_means: list with one fold-averaged test unfairness per iteration.
            auc_means: list with one fold-averaged test ACCURACY_METRIC score per iteration.
            selected_feature_props: array of shape (iterations, n_features);
                entry [i, j] is the fraction of folds in iteration i whose
                fitted selector kept feature j.
    """
    n_iterations = ITERATIONS if iterations is None else iterations

    # None of the scorers depend on the iteration, so build them once.
    unfairness_scorer = metrics.make_scorer(
        unfairness_metrics.UnfairnessMetric(protected_groups, unfairness_metric))
    # Feature-selection objective: accuracy combined with (weighted) unfairness
    # with respect to the protected groups.
    combined_scorer = metrics.make_scorer(
        unfairness_metrics.CombinedMetric(ACCURACY_METRIC, protected_groups,
                                          unfairness_metric, unfairness_weight))
    # NOTE(review): make_scorer scores the estimator's predict() output by
    # default, so with roc_auc_score this "AUC" is computed from hard class
    # predictions rather than probabilities. Confirm that this is intended
    # before changing it, since earlier saved results used the same scorer.
    auc_scorer = metrics.make_scorer(ACCURACY_METRIC)

    unfairness_means = []
    auc_means = []
    selected_feature_props = np.zeros([n_iterations, X.shape[1]])
    for i in tqdm(range(n_iterations), desc=' Training ' + clf.__class__.__name__):
        xval = model_selection.KFold(4, shuffle=True, random_state=i)
        sfs = SequentialFeatureSelector(clf, 'best', verbose=0, cv=xval, scoring=combined_scorer,
                                        n_jobs=2)
        pipe = pipeline.Pipeline([
            ('standardize', preprocessing.StandardScaler()),
            ('feature_selection', sfs),
            ('model', clf),
        ])
        result = model_selection.cross_validate(pipe, X, y, verbose=0, cv=xval, scoring={
            'unfairness': unfairness_scorer,
            'auc': auc_scorer,
        }, return_estimator=True)
        unfairness_means.append(result['test_unfairness'].mean())
        auc_means.append(result['test_auc'].mean())
        # Each fold's fitted selector contributes 1/n_folds to every feature it kept.
        n_folds = len(result['estimator'])
        for estimator in result['estimator']:
            for feature_i in estimator.named_steps['feature_selection'].k_feature_idx_:
                selected_feature_props[i][feature_i] += 1 / n_folds
    return unfairness_means, auc_means, selected_feature_props

In [7]:
# Unwrap the single dataset entry returned by the loader.
ds = get_mortgage_data()['mortgage_data']
print(ds.keys())  # data, labels, participant_ids, feature_names

dict_keys(['data', 'labels', 'participant_ids', 'feature_names'])


In [8]:
# Echo the whole dataset dict.
# NOTE(review): this writes the repr of every array into the notebook output;
# a slice such as ds['data'][:5] would keep the output short.
ds

{'data': array([[1.6500e+02, 1.0000e+00, 6.3000e+01, ..., 9.0000e+01, 3.4000e+01,
         4.9800e+04],
        [1.1700e+02, 0.0000e+00, 7.9000e+01, ..., 1.0700e+02, 2.9000e+01,
         5.3400e+04],
        [9.3000e+01, 0.0000e+00, 8.7000e+01, ..., 7.4000e+01, 3.8000e+01,
         5.6100e+04],
        ...,
        [6.2000e+01, 4.0000e+00, 5.8214e+01, ..., 8.5000e+01, 3.8000e+01,
         7.2900e+04],
        [5.3000e+01, 3.0000e+00, 8.0000e+01, ..., 9.1000e+01, 6.2000e+01,
         6.6900e+04],
        [8.7000e+01, 4.0000e+00, 3.5556e+01, ..., 1.0600e+02, 4.9000e+01,
         7.5300e+04]]),
 'labels': array([1, 0, 1, ..., 1, 1, 0], dtype=int64),
 'participant_ids': array([    0,     1,     2, ..., 73944, 73945, 73946]),
 'feature_names': array(['income', 'debt_to_income_ratio', 'combined_loan_to_value_ratio',
        'loan_amount', 'derived_ethnicity', 'derived_race', 'derived_sex',
        'applicant_age', 'preapproval', 'loan_type', 'loan_purpose',
        'interest_only_payment', '

In [9]:
# Find where the protected attribute sits among the feature columns, then pull
# that column out of the data matrix as a Series of "protected" group labels.
matching_positions = np.nonzero(ds['feature_names'] == PROTECTED_COLUMN)[0]
protected_col_index = matching_positions[0]
protected_groups = pd.Series(ds['data'][:, protected_col_index])

In [10]:
# Display the protected-group Series (the recorded output shows pandas'
# truncated repr: head, tail, length and dtype).
protected_groups

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
73942    1.0
73943    1.0
73944    1.0
73945    1.0
73946    1.0
Length: 73947, dtype: float64

In [11]:
# Does the method reduce unfairness?
# Seed `dfs` with previously saved results so an interrupted run can resume.
# The experiment loop writes 'fairfs_results_sp.csv', so that file is tried
# first; 'fairfs_results.csv' (the name this cell originally read) is kept as a
# fallback. Only the first file found is loaded, because the loop's skip check
# only consults dfs[0].
dfs = []
for results_path in ('fairfs_results_sp.csv', 'fairfs_results.csv'):
    try:
        dfs.append(pd.read_csv(results_path))
    except FileNotFoundError:
        continue  # no saved results under this name; try the next one
    break

In [None]:
 for m in [
         tree.DecisionTreeClassifier(max_depth=3, random_state=11798),
         naive_bayes.GaussianNB(),
         linear_model.LogisticRegression(random_state=11798),
 ]:
    model_name = m.__class__.__name__
    for unfairness_metric in unfairness_metrics.UNFAIRNESS_METRICS:
        for unfairness_weight in [0, 1, 2, 3, 4]:
            print('Training', model_name)
            print('Unfairness metric:', unfairness_metric)
            print('Unfairness metric weight:', unfairness_weight)
            # Skip any (model, metric, weight) combination that already has rows
            # in the first frame of `dfs`.
            if dfs:
                previous = dfs[0]
                already_done = ((previous['model'] == model_name) &
                                (previous['unfairness_metric'] == unfairness_metric) &
                                (previous['unfairness_weight'] == unfairness_weight))
                if already_done.any():
                    print('Skipping (already done in output file)')
                    continue
            unfairnesses, aucs, feature_selected_props = run_experiment(
                ds['data'], pd.Series(ds['labels']), m, protected_groups, unfairness_metric,
                unfairness_weight)
            # One output row per iteration of this configuration.
            n_rows = len(aucs)
            result_df = pd.DataFrame({
                'model': [model_name] * n_rows,
                'unfairness_metric': [unfairness_metric] * n_rows,
                'unfairness_weight': [unfairness_weight] * n_rows,
                'iteration': range(1, n_rows + 1),
                'unfairness': unfairnesses,
                'auc': aucs,
                'protected_column_selected_prop': feature_selected_props[:, protected_col_index],
            })
            dfs.append(result_df)
            # For synthetic data, also record how often the known fair and
            # unfair features were selected.
            if 'fair_feature' in ds['feature_names']:
                for col in ['fair_feature', 'unfair_feature']:
                    col_index = np.nonzero(ds['feature_names'] == col)[0][0]
                    result_df[col + '_selected_prop'] = feature_selected_props[:, col_index]
            # Rewrite the results file after every configuration so progress
            # is saved as the run goes.
            pd.concat(dfs).to_csv('fairfs_results_sp.csv', index=False)

Training DecisionTreeClassifier
Unfairness metric: statistical_parity
Unfairness metric weight: 0


 Training DecisionTreeClassifier:   3%|█▎                                          | 3/100 [19:21<10:27:28, 388.13s/it]