In [22]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from fairlearn.metrics import equalized_odds_difference, demographic_parity_difference
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import matplotlib.pyplot as plt

In [3]:
# Load the loan records from the CSV next to the notebook, then show the frame as the cell output
loan_data = pd.read_csv(filepath_or_buffer="loan_data.csv")
loan_data

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,27.0,male,Associate,47971.0,6,RENT,15000.0,MEDICAL,15.66,0.31,3.0,645,No,1
44996,37.0,female,Associate,65800.0,17,RENT,9000.0,HOMEIMPROVEMENT,14.07,0.14,11.0,621,No,1
44997,33.0,male,Associate,56942.0,7,RENT,2771.0,DEBTCONSOLIDATION,10.02,0.05,10.0,668,No,1
44998,29.0,male,Bachelor,33164.0,4,RENT,12000.0,EDUCATION,13.23,0.36,6.0,604,No,1


In [4]:
# Count missing entries per column; every count comes back zero, so no cleaning step is needed
missing_per_column = loan_data.isna().sum()
missing_per_column

person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64

In [5]:
# person_gender, person_education, person_home_ownership, loan_intent, and
# previous_loan_defaults_on_file hold string categories, so each one gets its own
# LabelEncoder and a parallel integer-coded "encoded_*" column.

gender_encoder, education_encoder, home_ownership_encoder, intent_encoder, previous_defaults_encoder = (
    LabelEncoder() for _ in range(5)
)

# (encoder, source column, destination column), listed in the order the new columns
# are appended to the frame
encoding_plan = (
    (gender_encoder, "person_gender", "encoded_gender"),
    (education_encoder, "person_education", "encoded_education"),
    (home_ownership_encoder, "person_home_ownership", "encoded_home_ownership"),
    (intent_encoder, "loan_intent", "encoded_intent"),
    (previous_defaults_encoder, "previous_loan_defaults_on_file", "encoded_previous_defaults"),
)

for encoder, source_column, encoded_column in encoding_plan:
    loan_data[encoded_column] = encoder.fit_transform(loan_data[source_column])

In [6]:
# Display the frame again: the five encoded_* columns created by the label-encoding
# cell now appear to the right of the original columns
loan_data

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status,encoded_gender,encoded_education,encoded_home_ownership,encoded_intent,encoded_previous_defaults
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1,0,4,3,4,0
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0,0,3,2,1,1
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1,0,3,0,3,0
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1,0,1,3,3,0
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1,1,4,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,27.0,male,Associate,47971.0,6,RENT,15000.0,MEDICAL,15.66,0.31,3.0,645,No,1,1,0,3,3,0
44996,37.0,female,Associate,65800.0,17,RENT,9000.0,HOMEIMPROVEMENT,14.07,0.14,11.0,621,No,1,0,0,3,2,0
44997,33.0,male,Associate,56942.0,7,RENT,2771.0,DEBTCONSOLIDATION,10.02,0.05,10.0,668,No,1,1,0,3,0,0
44998,29.0,male,Bachelor,33164.0,4,RENT,12000.0,EDUCATION,13.23,0.36,6.0,604,No,1,1,1,3,1,0


In [7]:
# Now, we separate into X and y data, then into the train and test splits.
# Feature order matters: a later cell picks the sensitive attribute out of X_test by
# position (encoded_gender is column index 1).
FEATURE_COLUMNS = [
    "person_age",
    "encoded_gender",
    "encoded_education",
    "person_income",
    "person_emp_exp",
    "encoded_home_ownership",
    "loan_amnt",
    "encoded_intent",
    "loan_int_rate",
    "loan_percent_income",
    "cb_person_cred_hist_length",
    "credit_score",
    "encoded_previous_defaults",
]
# Selecting by a list of names already yields an (n_samples, n_features) array, so no
# reshape with a hard-coded column count is needed.
X = loan_data[FEATURE_COLUMNS].to_numpy()

# Keep the target one-dimensional, shape (n_samples,). scikit-learn classifiers expect a
# 1-D y and emit a DataConversionWarning when handed an (n_samples, 1) column vector.
y = loan_data["loan_status"].to_numpy()

# 70-30 random split; the fixed random_state makes the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)

In [8]:
# Random forest classifier: hyperparameters collected in one mapping, seed fixed so
# repeated fits build the same forest
forest_params = {
    "criterion": "gini",
    "min_samples_leaf": 1,
    "min_samples_split": 5,
    "max_features": "sqrt",
    "random_state": 0,
}
classifier = RandomForestClassifier(**forest_params)

In [9]:
# Train the forest on the training split; the fitted estimator is the cell's output
classifier.fit(X=X_train, y=y_train)

  return fit_method(estimator, *args, **kwargs)


In [10]:
# Score the held-out test rows with the fitted forest
y_pred = classifier.predict(X=X_test)

In [11]:
# Compute metrics on the test split.
# Precision, recall and F1 are support-weighted averages over the classes; with that
# averaging the reported recall coincides with accuracy.
weighted = {"average": "weighted"}

accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, **weighted)
recall = metrics.recall_score(y_test, y_pred, **weighted)
f1_score = metrics.f1_score(y_test, y_pred, **weighted)

report_lines = (
    ("Model accuracy: ", accuracy),
    ("Model precision: ", precision),
    ("Model recall: ", recall),
    ("Model F1 score: ", f1_score),
)
for label, score in report_lines:
    print(label + str(score))

Model accuracy: 0.9277090585882527
Model precision: 0.9262126585627768
Model recall: 0.9277090585882527
Model F1 score: 0.9258870054311077


In [28]:
# Now, we compute fairness violations
# The sensitive attribute is gender, held as encoded_gender in the second feature column (index 1)
gender_test = X_test[:, 1]

eodds_diff = equalized_odds_difference(y_test, y_pred, sensitive_features=gender_test)
dp_diff = demographic_parity_difference(y_test, y_pred, sensitive_features=gender_test)

fairness_lines = (
    ("Equalized odds difference: ", eodds_diff),
    ("Demographic parity difference: ", dp_diff),
)
for label, gap in fairness_lines:
    print(label + str(gap))

Equalized odds difference: 0.002226011099667139
Demographic parity difference: 0.0019039852755155795
