In [1]:
import pandas as pd
import numpy as np
from os import system

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
import collections
from IPython.display import Image

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree, svm
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score
from sklearn.linear_model import Perceptron
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from matplotlib.legend_handler import HandlerLine2D

In [2]:
colnames=['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country', 'income', 'capital']
df = pd.read_csv('adult.data', names=colnames, header=None)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,capital
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,


In [3]:
# Load dataset 
print('We have in total ',len(df),' samples.')

We have in total  32561  samples.


In [4]:
rows_to_remove = len(df) - len(df[(df.astype(str) != ' ?').all(axis=1)])
print('Removing ',rows_to_remove,'rows, containing \"?\" in one of the columns.') 
df = df[(df.astype(str) != ' ?').all(axis=1)]
print(len(df), 'rows remaining')

Removing  2399 rows, containing "?" in one of the columns.
30162 rows remaining


In [5]:
# Create a new income_bi column
df['income_bi'] = df.apply(lambda row: 1 if '>50K'in row['income'] else 0, axis=1)

#Merge the columns capital gain and capital loss to one column 
df['capital'] = df['capital-gain'] - df['capital-loss']

#reduce the domain of workclass and marital-status fields
#df.loc[(df['workclass'].str.contains('gov')), 'workclass'] = 'gov'
#df.loc[(df['workclass'].str.contains('Self-emp')), 'workclass'] = 'self-emp'
#df.loc[(df['workclass'].str.contains('Private')), 'workclass'] = 'self-emp'

#df.loc[(df['marital-status'].str.contains('spouse')), 'marital-status'] = 'married-spouse'
#df.loc[(df['marital-status'].str.contains('spouse')), 'marital-status'] = 'married-spouse'
#df.loc[(~df['marital-status'].str.contains('spouse')), 'marital-status'] = 'other'


In [6]:
#Remove unnecesary columns
df = df.drop(['occupation','income', 'fnlwgt','capital-gain','capital-loss','native-country', 'relationship', 'education'], axis=1)

df.head()

Unnamed: 0,age,workclass,education-num,marital-status,race,sex,hours-per-week,capital,income_bi
0,39,State-gov,13,Never-married,White,Male,40,2174,0
1,50,Self-emp-not-inc,13,Married-civ-spouse,White,Male,13,0,0
2,38,Private,9,Divorced,White,Male,40,0,0
3,53,Private,7,Married-civ-spouse,Black,Male,40,0,0
4,28,Private,13,Married-civ-spouse,Black,Female,40,0,0


In [7]:
# Use one-hot encoding on categorial columns
df = pd.get_dummies(df, columns=['workclass', 'marital-status', 'race', 'sex'])

df = df.rename(columns={"sex_ Female": "Gender"})
df = df.drop(['sex_ Male'], axis=1)
df.head()

Unnamed: 0,age,education-num,hours-per-week,capital,income_bi,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,...,marital-status_ Married-spouse-absent,marital-status_ Never-married,marital-status_ Separated,marital-status_ Widowed,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,Gender
0,39,13,40,2174,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,50,13,13,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,38,9,40,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,53,7,40,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,28,13,40,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1


In [8]:
# split training and testing data (20/80)
df_train = df[:25000]
df_test = df[25000:]

X_train = df_train.drop(['income_bi'], axis=1)
y_train = df_train['income_bi']
X_test = df_test.drop(['income_bi'], axis=1)
y_test = df_test['income_bi']

print('Total data:',len(df),' Train+validation data:',len(df_train),' Test data: ',len(df_test))

Total data: 30162  Train+validation data: 25000  Test data:  5162


In [9]:
print('Propotion of positive class in train+validation data:', np.sum(y_train == 1) / np.sum(y_train == 0))
print('Propotion of positive class in test data:            ', np.sum(y_test == 1) / np.sum(y_test == 0))

Propotion of positive class in train+validation data: 0.3294336612603031
Propotion of positive class in test data:             0.3411275656014549


In [10]:
#Shuffle df
df = df.sample(frac = 1)

svm_df = df[29662:]

df_train_svm = svm_df[:400]
df_test_svm = svm_df[100:]

X_train_svm = df_train_svm.drop(['income_bi'], axis=1)
y_train_svm = df_train_svm['income_bi']
X_test_svm = df_test_svm.drop(['income_bi'], axis=1)
y_test_svm = df_test_svm['income_bi']

print('Total data for SVM:',len(svm_df),' Train+validation data for SVM:',len(df_train_svm),' Test data for SVM: ',len(df_test_svm))

Total data for SVM: 500  Train+validation data for SVM: 400  Test data for SVM:  400


In [11]:
print('Propotion of positive class in train+validation data for SVM:', np.sum(y_train_svm == 1) / np.sum(y_train_svm == 0))
print('Propotion of positive class in test data for SVM:            ', np.sum(y_test_svm == 1) / np.sum(y_test_svm == 0))

Propotion of positive class in train+validation data for SVM: 0.36054421768707484
Propotion of positive class in test data for SVM:             0.3377926421404682


In [12]:
# Fit a decision tree
dt = tree.DecisionTreeClassifier(criterion='gini', max_depth=10)
dt = dt.fit(X_train, y_train)

knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train, y_train)

nb = GaussianNB()
nb.fit(X_train, y_train)

perc = Perceptron(tol=1e-3, random_state=0)
perc.fit(X_train, y_train)

Perceptron()

In [13]:
#Statistical Parity:
X_test_male = X_test.loc[(df_test['Gender'] == 0)]
X_test_female = X_test.loc[(df_test['Gender'] == 1)]
male_number = len(X_test_male)
female_number = len(X_test_female)
pos_male_number = dt.predict(X_test_male).tolist().count(1)
pos_female_number = dt.predict(X_test_female).tolist().count(1)
predictionsdt = dt.predict(X_test).tolist()
pos_output_number = predictionsdt.count(1)
#predicted number of males, females and people by DT.
print("Number of males, that earn more than 50K:", pos_male_number)
print("Number of females, that earn more than 50K:" ,pos_female_number)
print("Number of people, that earn more than 50K:", pos_output_number)
print("Ratio between +50K/All poeple:", pos_output_number/len(predictionsdt))
print("If a peson is male:", pos_male_number/len(dt.predict(X_test_male)))
print("If a peson is female:", pos_female_number/len(dt.predict(X_test_female)))


Number of males, that earn more than 50K: 922
Number of females, that earn more than 50K: 126
Number of people, that earn more than 50K: 1048
Ratio between +50K/All poeple: 0.2030220844633863
If a peson is male: 0.2637299771167048
If a peson is female: 0.07563025210084033


In [14]:
#Equal Opportunity
predictionsdt = dt.predict(X_test)
df_test['output'] = predictionsdt
same_numbers = len(df_test.loc[(df_test['income_bi'] == df_test['output']) & (df_test['income_bi'] == 1)])
same_numbers_male = len(df_test.loc[(df_test['income_bi'] == df_test['output']) & (df_test['Gender'] == 0) & (df_test['income_bi'] == 1)])
same_numbers_female = len(df_test.loc[(df_test['income_bi'] == df_test['output']) & (df_test['Gender'] == 1)& (df_test['income_bi'] == 1)])
error_ratio_class = (len(df_test.loc[(df_test['income_bi'] == 1)]) - same_numbers) / len(df_test.loc[(df_test['income_bi'] == 1)])
error_ratio_male = (len(df_test.loc[(df_test['Gender'] == 0) & (df_test['income_bi'] == 1)]) - same_numbers_male) / len(df_test.loc[(df_test['Gender'] == 0) & (df_test['income_bi'] == 1)])
error_ratio_female = (len(df_test.loc[(df_test['Gender'] == 1) & (df_test['income_bi'] == 1)]) - same_numbers_female) / len(df_test.loc[(df_test['Gender'] == 1) & (df_test['income_bi'] == 1)])
print("Number of males, that their income and predicted income matches:", same_numbers_male)
print("Number of females, that their income and predicted income matches:" ,same_numbers_female)
print("Number of people, that their income and predicted income matches:", same_numbers)
print("Error ratio among whole dataframe:", error_ratio_class)
print("If a person is male:", error_ratio_male)
print("If a person is female:", error_ratio_female)


Number of males, that their income and predicted income matches: 704
Number of females, that their income and predicted income matches: 96
Number of people, that their income and predicted income matches: 800
Error ratio among whole dataframe: 0.3907083015993907
If a person is male: 0.3719892952720785
If a person is female: 0.5


In [15]:
#Disparate Mistreatment
predictionsdt = dt.predict(X_test)
df_test['output'] = predictionsdt
same_numbers_neg = len(df_test.loc[(df_test['income_bi'] == df_test['output']) & (df_test['income_bi'] == 0)])
same_numbers_male_neg = len(df_test.loc[(df_test['income_bi'] == df_test['output']) & (df_test['Gender'] == 0) & (df_test['income_bi'] == 0)])
same_numbers_female_neg = len(df_test.loc[(df_test['income_bi'] == df_test['output']) & (df_test['Gender'] == 1)& (df_test['income_bi'] == 0)])
error_ratio_class_neg = (len(df_test.loc[(df_test['income_bi'] == 0)]) - same_numbers_neg) / len(df_test.loc[(df_test['income_bi'] == 0)])
error_ratio_male_neg = (len(df_test.loc[(df_test['Gender'] == 0) & (df_test['income_bi'] == 0)]) - same_numbers_male_neg) / len(df_test.loc[(df_test['Gender'] == 0) & (df_test['income_bi'] == 0)])
error_ratio_female_neg = (len(df_test.loc[(df_test['Gender'] == 1) & (df_test['income_bi'] == 0)]) - same_numbers_female_neg) / len(df_test.loc[(df_test['Gender'] == 1) & (df_test['income_bi'] == 0)])
FNR_male = error_ratio_male - error_ratio_class 
FPR_male = error_ratio_male_neg - error_ratio_class_neg
FNR_female = error_ratio_female - error_ratio_class 
FPR_female = error_ratio_female_neg - error_ratio_class_neg
DM_male = np.abs(FNR_male) + np.abs(FNR_male)
DM_female = np.abs(FNR_female) + np.abs(FNR_female)
print("Difference in model’s prediction errors between protected males and non-protected groups:", DM_male)
print("Difference in model’s prediction errors between protected females and non-protected groups:", DM_female)
#



Difference in model’s prediction errors between protected males and non-protected groups: 0.03743801265462432
Difference in model’s prediction errors between protected males and non-protected groups: 0.21858339680121863
