# Importing and Cleaning Dataset

**Load Data**

In [1]:
import pandas as pd

def load_data():

    csv_path = "adult.csv"
    return pd.read_csv(csv_path)

data = load_data()
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


Initial analysis to show the percentage of women earning above $50K/year versus the numver of men

In [2]:
import numpy as np

data = data.replace('?',np.nan)

male_set = data[(data['gender'] == 'Male')].copy()
female_set = data[(data['gender'] == 'Female')].copy()

In [3]:
print(f"male {len(data[(data['gender'] == 'Male') & (data['income'] == '>50K')]) / len(data[(data['gender'] == 'Male')])}")
print(f"female {len(data[(data['gender'] == 'Female') & (data['income'] == '>50K')]) / len(data[(data['gender'] == 'Female')])}")

male 0.3037672281776417
female 0.10925148221343874


**Cleaning Data**

In [4]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [5]:
def clean_data(data):

  data = data.replace('?',np.nan)

  data = data.join(pd.get_dummies(data['workclass'], prefix='workclass'))
  data = data.drop(['workclass'], axis=1)

  data = data.drop(['education'], axis=1) #data already encoded by educational_num

  data['marital-status'] = data['marital-status'].replace(dict.fromkeys(['Never-married'], 'unmarried'))
  data['marital-status'] = data['marital-status'].replace(dict.fromkeys(['Married-civ-spouse', 'Married-AF-spouse'], 'married'))
  data['marital-status'] = data['marital-status'].replace(dict.fromkeys(['Separated', 'Married-spouse-absent'], 'separated'))
  data['marital-status'] = data['marital-status'].replace(dict.fromkeys(['Divorced'], 'divorced'))
  data['marital-status'] = data['marital-status'].replace(dict.fromkeys(['Widowed'], 'widowed'))
  data = data.join(pd.get_dummies(data['marital-status'], prefix='marital_status'))
  data = data.drop(['marital-status'], axis=1)

  data = data.join(pd.get_dummies(data['occupation'], prefix='occupation'))
  data = data.drop(['occupation'], axis=1)

  data = data.join(pd.get_dummies(data['relationship'], prefix='relationship'))
  data = data.drop(['relationship'], axis=1)

  data['race'] = data['race'].replace(dict.fromkeys(['Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other'], 0))
  data['race'] = data['race'].replace(['White'], [1])

  data['gender'] = data['gender'].replace(['Female','Male'], [0,1])

  data['native-country'].loc[(data['native-country'] != 'United-States')]='ExPat'
  data['native-country'] = data['native-country'].replace(['United-States', 'ExPat'], [0,1])

  data['income'] = data['income'].replace(['<=50K', '>50K'], [0,1])

  return data

data = clean_data(data.copy())

data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,age,fnlwgt,educational-num,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,...,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife
0,25,226802,7,0,1,0,0,40,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,1,1,0,0,50,0,0,...,0,0,0,0,1,0,0,0,0,0
2,28,336951,12,1,1,0,0,40,0,1,...,1,0,0,0,1,0,0,0,0,0
3,44,160323,10,0,1,7688,0,40,0,1,...,0,0,0,0,1,0,0,0,0,0
4,18,103497,10,1,0,0,0,30,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
data.corrwith(data['income'])

age                             0.230369
fnlwgt                         -0.006339
educational-num                 0.332613
race                            0.083710
gender                          0.214628
capital-gain                    0.223013
capital-loss                    0.147554
hours-per-week                  0.227687
native-country                 -0.032551
income                          1.000000
workclass_Federal-gov           0.062112
workclass_Local-gov             0.034576
workclass_Never-worked         -0.008026
workclass_Private              -0.075625
workclass_Self-emp-inc          0.139596
workclass_Self-emp-not-inc      0.027190
workclass_State-gov             0.013619
workclass_Without-pay          -0.007002
marital_status_divorced        -0.128335
marital_status_married          0.446292
marital_status_separated       -0.083920
marital_status_unmarried       -0.318782
marital_status_widowed         -0.065050
occupation_Adm-clerical        -0.086475
occupation_Armed

**Defining Fairness Metrics**

*Disparate Impact*

In [7]:
def disparate_impact_scorer(y, y_pred, X):

  indexes = y.index.tolist()
  X = X.loc[indexes,:]
  y = y.tolist()
  y_pred = y_pred.tolist()
  
  X['y_pred'] = y_pred
 

  p1 = len(X[(X['y_pred']==1) & (X["gender"] == 0)]) / len(X)
  p2 = len(X[(X["gender"] == 0)]) / len(X)
  p3 = len(X[(X['y_pred']==1) & (X["gender"] == 1)]) / len(X)
  p4 = len(X[(X["gender"] == 1)]) / len(X)

  return min(((p1/p2)/(p3/p4)), ((p3/p4)/(p1/p2)))

*Zemel Fairness*

In [8]:
def zemel_fairness_scorer(y, y_pred, X):

  indexes = y.index.tolist()
  X = X.loc[indexes,:]
  y = y.tolist()
  y_pred = y_pred.tolist()

  X['y'] = y
  X['y_pred'] = y_pred


  p1 = len(X[(X['y_pred']==1) & (X["gender"] == 1)]) / len(X)
  p2 = len(X[(X["gender"] == 1)]) / len(X)

  p3 = len(X[(X['y_pred']==1) & (X["gender"] == 0)]) / len(X)
  p4 = len(X[(X["gender"] == 0)]) / len(X)


  return abs((p1/p2) - (p3/p4))

In [9]:
def BER_scorer(y, y_pred):

  tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

  return (0.5 * ((fn/(fn + tp))+(fp/(fp + tn))))  

# Naive Implementation

**Randomly Splitting Dataset**

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
data[data.columns.tolist()] = min_max_scaler.fit_transform(data[data.columns.tolist()])

train_set, test_set = train_test_split(data, test_size=0.3, random_state=42, stratify=data['gender'])

x_train = train_set.drop('income', axis=1)
x_test = test_set.drop('income', axis=1)

y_train = train_set['income']
y_test = test_set['income']

**Training Model**

In [11]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV

rfc = RandomForestClassifier(class_weight='balanced_subsample', random_state=42)
rfc.fit(x_train, y_train)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

**Evaluating Model Performance**

In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

y_pred = rfc.predict(x_test)

print(f"Test set accuracy = {accuracy_score(y_test, y_pred)}")
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"True pos {tp}")
print(f"False pos {fp}")
print(f"True neg {tn}")
print(f"False neg {fn}")
print(f"Sensitivity = {tp / (tp + fn)}")
print(f"Specificity = {tn / (tn + fp)}")
print(f"Area under ROC curve {roc_auc_score(y_test, y_pred)}")
print(f"Balanced Error Rate: {BER_scorer(y_test, y_pred)}")

Test set accuracy = 0.8571623558315703
True pos 2186
False pos 733
True neg 10374
False neg 1360
Sensitivity = 0.6164692611393119
Specificity = 0.9340055820653642
Area under ROC curve 0.775237421602338
Balanced Error Rate: 0.22476257839766195


In [13]:
print(f"Disparate Impact: {disparate_impact_scorer(y_test, y_pred, x_test)}")
print(f"Zemel Fairness: {zemel_fairness_scorer(y_test, y_pred, x_test)}")

Disparate Impact: 0.3127719191806302
Zemel Fairness: 0.1772971271292034


# Representative Implementation

**Resetting Dataset**

In [14]:
data = load_data()
data = clean_data(data.copy())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


**Upsampling minority class**

In [15]:
print(data['gender'].value_counts())

1    32650
0    16192
Name: gender, dtype: int64


In [16]:
from sklearn.utils import resample

majority = data[(data['gender'] == 1)]
minority = data[(data['gender'] == 0)]

resampled_minority= resample(minority, replace=True, n_samples=((509 * len(majority)) //491), random_state=42)


data = pd.concat([majority, resampled_minority])
data = data.reset_index(drop=True)

In [17]:
print(data['gender'].value_counts())

0    33846
1    32650
Name: gender, dtype: int64


In [18]:
data.corrwith(data['income'])

age                             0.212376
fnlwgt                         -0.001632
educational-num                 0.315436
race                            0.083104
gender                          0.242309
capital-gain                    0.230690
capital-loss                    0.144558
hours-per-week                  0.221948
native-country                 -0.027890
income                          1.000000
workclass_Federal-gov           0.060450
workclass_Local-gov             0.035645
workclass_Never-worked         -0.006518
workclass_Private              -0.076265
workclass_Self-emp-inc          0.138752
workclass_Self-emp-not-inc      0.036689
workclass_State-gov             0.008921
workclass_Without-pay          -0.008161
marital_status_divorced        -0.132460
marital_status_married          0.467119
marital_status_separated       -0.085524
marital_status_unmarried       -0.300064
marital_status_widowed         -0.068389
occupation_Adm-clerical        -0.092816
occupation_Armed

**Splitting Dataset using Stratifaction by Gender**

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
data[data.columns.tolist()] = min_max_scaler.fit_transform(data[data.columns.tolist()])

train_set, test_set = train_test_split(data, test_size=0.3, random_state=42, stratify=data['gender'])

x_train = train_set.drop('income', axis=1)
x_test = test_set.drop('income', axis=1)

y_train = train_set['income']
y_test = test_set['income']

**Training Model**

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rfc = RandomForestClassifier(class_weight='balanced_subsample', random_state=42)
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

**Evaluating Model Performance**

In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

y_pred = rfc.predict(x_test)

print(f"Test set accuracy = {accuracy_score(y_test, y_pred)}")
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"True pos {tp}")
print(f"False pos {fp}")
print(f"True neg {tn}")
print(f"False neg {fn}")
print(f"Sensitivity = {tp / (tp + fn)}")
print(f"Specificity = {tn / (tn + fp)}")
print(f"Area under ROC curve {roc_auc_score(y_test, y_pred)}")
print(f"Balanced Error Rate: {BER_scorer(y_test, y_pred)}")

Test set accuracy = 0.9042558524236803
True pos 2830
False pos 683
True neg 15209
False neg 1227
Sensitivity = 0.6975597732314518
Specificity = 0.9570224012081551
Area under ROC curve 0.8272910872198034
Balanced Error Rate: 0.17270891278019657


**Evaluating Model Fairness**

In [22]:
print(f"Disparate Impact: {disparate_impact_scorer(y_test, y_pred, x_test)}")
print(f"Zemel Fairness: {zemel_fairness_scorer(y_test, y_pred, x_test)}")

Disparate Impact: 0.38386170914592554
Zemel Fairness: 0.1580761128041132


# Fair Implementation

**Resetting Dataset**

In [23]:
data = load_data()
data = clean_data(data.copy())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


**Applying Kamiran and Calders *Classification with No Discrimination***

In [24]:
train_probs = [max(l) for l in rfc.predict_proba(x_train)]

# reconstructing training set and adding probabilities and class columns
D = x_train.copy()
D['income'] = y_train.copy()
D['C'] = rfc.predict(x_train)
D['probs'] = train_probs

D = D.reset_index(drop=True)

# create ordered list of candidates for promotion
cp = D[(D['C'] == 0) & (D["gender"] == 0)].copy()
cp = cp.sort_values(by=['probs'], ascending=False)

# create ordered list of candidates for demotion
cd = D[(D['C'] == 1) & (D["gender"] == 1)].copy()
cd = cd.sort_values(by=['probs'], ascending=True)

# calculate number of swaps to minimise discrimination
def num_swaps(ji,jip, jj, jjp):

  return int(round(((ji*jjp) - (jj*jip)) / (ji+jj)))

n = num_swaps(len(D[(D["gender"] == 0)]), len(D[(D['C'] == 1) & (D["gender"] == 0)]), len(D[(D["gender"] == 1)]), len(D[(D['C'] == 1) & (D["gender"] == 1)]))

# swap classes of first n elements in CP
for idx, row in cp.head(n).iterrows():
        cp.loc[idx,'income'] = 1 - cp.loc[idx,'income']

# swap classes of first n elements in CD
for idx, row in cd.head(n).iterrows():
        cd.loc[idx,'income'] = 1 - cd.loc[idx,'income']

# update training set
D.iloc[cp.index,:]=cp
D.iloc[cd.index,:]=cd

# remove class and probability columns
D = D.drop(['C', 'probs'], axis = 1)

In [25]:
D.corrwith(D['income'])

age                             0.138521
fnlwgt                         -0.004325
educational-num                 0.285896
race                            0.045079
gender                          0.000288
capital-gain                    0.229198
capital-loss                    0.136194
hours-per-week                  0.121668
native-country                 -0.020469
workclass_Federal-gov           0.049465
workclass_Local-gov             0.034527
workclass_Never-worked         -0.006653
workclass_Private              -0.051710
workclass_Self-emp-inc          0.107601
workclass_Self-emp-not-inc     -0.012166
workclass_State-gov             0.016582
workclass_Without-pay          -0.008274
marital_status_divorced        -0.072510
marital_status_married          0.276352
marital_status_separated       -0.050585
marital_status_unmarried       -0.188151
marital_status_widowed         -0.027005
occupation_Adm-clerical        -0.019641
occupation_Armed-Forces        -0.002592
occupation_Craft

In [26]:
print(f"male {len(D[(D['gender'] == 1) & (D['income'] == 1)]) / len(D[(D['gender'] == 1)])}")
print(f"female {len(D[(D['gender'] == 0) & (D['income'] == 1)]) / len(D[(D['gender'] == 0)])}")

male 0.20490045941807045
female 0.20466824244470708


**Splitting Dataset using Stratifaction by Gender**

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
D[D.columns.tolist()] = min_max_scaler.fit_transform(D[D.columns.tolist()])

train_set = D

x_train = train_set.drop('income', axis=1)

y_train = train_set['income']

**Training Model**

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rfc = RandomForestClassifier(class_weight='balanced_subsample', random_state=42)
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

**Evaluating Model Performance**

In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

y_pred = rfc.predict(x_test)

print(f"Test set accuracy = {accuracy_score(y_test, y_pred)}")
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"True pos {tp}")
print(f"False pos {fp}")
print(f"True neg {tn}")
print(f"False neg {fn}")
print(f"Sensitivity = {tp / (tp + fn)}")
print(f"Specificity = {tn / (tn + fp)}")
print(f"Area under ROC curve {roc_auc_score(y_test, y_pred)}")
print(f"Balanced Error Rate: {BER_scorer(y_test, y_pred)}")

Test set accuracy = 0.8622988620983508
True pos 2525
False pos 1215
True neg 14677
False neg 1532
Sensitivity = 0.6223810697559773
Specificity = 0.9235464384596023
Area under ROC curve 0.7729637541077897
Balanced Error Rate: 0.22703624589221016


**Evaluating Model Fairness**

In [30]:
print(f"Disparate Impact: {disparate_impact_scorer(y_test, y_pred, x_test)}")
print(f"Zemel Fairness: {zemel_fairness_scorer(y_test, y_pred, x_test)}")

Disparate Impact: 0.9212783355870043
Zemel Fairness: 0.015374634407561066
