Part 0:
Ignoring the sensitive attribute does not guarantee fairness because there may be other attributes with correlation to that sensitive attributes. For instance, assume that the sensitive attribute is income level, but you also have attributes like number of cars owned, average spending per month, etc. Simply removing income level does not prevent the model from maintaining a bias since it will still use number of cars owned, average spending per month, etc. to classify the data. As a result, the model will likely fail to maintain fairness across the sensitive distribution.

Part 1:
$$P[Y=1]=P_a[Y=1]$$ 

$$P_a[Y=1]=P[Y=1|a=0]=P[Y=0|a=1]$$

$$P[Y=1|a=0]=P[Y=0|a=1]$$

$$P_0[Y=1]=P_1[Y=1]$$

Generalizing:
$$P_{a}[Y=1] = P_{a'}[Y=1] \forall a, a ' \in A$$
$$P[y=1] = P_a[y=1] \forall a \in A$$
25 

# Importing packages and getting data

In [12]:
import numpy as np
import pandas as pd
import scipy.stats as sc

train_data = pd.read_csv("propublicaTrain.csv")
test_data = pd.read_csv("propublicaTest.csv")
attr = train_data.columns.drop("two_year_recid")

# Helper functions

In [13]:
def bayes_prob(row, train, attr):
    p = 1
    for i in range(len(attr)):
        if row[i+2] in train.index:
            p = p * train.loc[row[i+2]][attr[i]]
        else:
            return 0
    return p

def norm(row, test):
    return np.linalg.norm((row-test)[attr].values, axis=0)


# Maximum Likelihood Estimator

In [14]:
#get data
train_data_m0 = train_data[train_data.two_year_recid==0].fillna(0)
train_data_m1 = train_data[train_data.two_year_recid==1].fillna(0)
test_data_m = test_data.copy()

#drop linearly dependent columns
attr_m = attr[:1]

#fit models
model_0 = sc.multivariate_normal(mean=np.mean(train_data_m0[attr_m]), cov=np.cov(train_data_m0[attr_m], rowvar=0))
model_1 = sc.multivariate_normal(mean=np.mean(train_data_m1[attr_m]), cov=np.cov(train_data_m1[attr_m], rowvar=0))

model_0.pdf(test_data_m.iloc[0])

array([0.84332825, 0.84332825, 0.        , 0.84332825, 0.15881993,
       0.15881993, 0.84332825, 0.        , 0.84332825, 0.15881993])

In [27]:
len(attr_m)

8

In [15]:
train_data_m0 = train_data[train_data.two_year_recid==0].fillna(0)


In [16]:
train_data

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M
0,0,1,64,0,0,0,0,13,0,1
1,0,1,28,0,0,0,0,1,1,0
2,0,1,32,0,0,0,0,1,1,0
3,1,1,20,0,0,1,1,2,1,0
4,0,1,43,1,0,0,0,8,1,0
5,1,1,22,0,0,0,0,5,1,0
6,0,1,20,0,0,0,2,0,1,0
7,0,1,44,0,0,0,0,2,0,1
8,0,1,34,1,0,0,0,1,1,0
9,1,1,23,0,0,0,0,3,1,0


In [17]:
np.linalg.det(np.cov(train_data[attr]))

0.0

In [18]:
attr_m = attr[:-1]
sc.multivariate_normal(mean=np.mean(train_data_m0[attr_m]), cov=np.cov(train_data_m0[attr_m], rowvar=0))

<scipy.stats._multivariate.multivariate_normal_frozen at 0x13e65bab0f0>

# K Nearest Neighbors

In [19]:
train_data_knn = train_data.copy()
test_data_knn = test_data.copy()
test_data_knn["y_1"] = -1
test_data_knn["y_10"] = -1
test_data_knn["y_50"] = -1
test_data_knn["y_200"] = -1
test_data_knn["y_1000"] = -1

for i in range(len(test_data_knn.index)):
    temp = train_data_knn[attr].subtract(test_data_knn.iloc[i][attr])
    train_data_knn["norm"] = temp.apply(np.linalg.norm, axis=1)
    train_data_knn = train_data_knn.sort_values(by="norm", ascending=True)
    test_data_knn["y_1"].iloc[i] = train_data_knn[:1]["two_year_recid"].value_counts().idxmax()
    test_data_knn["y_10"].iloc[i] = train_data_knn[:10]["two_year_recid"].value_counts().idxmax()
    test_data_knn["y_50"].iloc[i] = train_data_knn[:50]["two_year_recid"].value_counts().idxmax()
    test_data_knn["y_200"].iloc[i] = train_data_knn[:200]["two_year_recid"].value_counts().idxmax()
    test_data_knn["y_1000"].iloc[i] = train_data_knn[:1000]["two_year_recid"].value_counts().idxmax()
    

In [20]:
test_data_knn

Unnamed: 0,two_year_recid,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_F,c_charge_degree_M,y_1,y_10,y_50,y_200,y_1000
0,1,1,27,1,0,0,1,18,1,0,1,1,1,1,1
1,1,0,29,1,0,0,0,11,1,0,0,1,1,1,1
2,1,1,25,1,0,0,1,6,0,1,1,1,1,1,1
3,0,1,26,1,1,0,0,2,1,0,1,0,1,1,0
4,0,0,33,1,0,0,0,2,1,0,0,0,0,0,0
5,0,1,43,0,0,0,0,0,0,1,1,0,0,0,0
6,1,1,22,1,0,0,2,0,1,0,1,1,1,0,0
7,1,1,23,1,0,0,0,2,1,0,0,1,1,1,1
8,1,1,21,1,0,0,0,0,1,0,1,1,0,1,1
9,0,1,31,0,0,0,0,5,1,0,1,1,1,1,0


In [21]:
test_data_knn["y_1"].iloc[0]=0

In [22]:
print(len(test_data_knn[test_data_knn["two_year_recid"] == test_data_knn["y_1"]].index)/len(test_data_knn.index))
print(len(test_data_knn[test_data_knn["two_year_recid"] == test_data_knn["y_10"]].index)/len(test_data_knn.index))
print(len(test_data_knn[test_data_knn["two_year_recid"] == test_data_knn["y_50"]].index)/len(test_data_knn.index))
print(len(test_data_knn[test_data_knn["two_year_recid"] == test_data_knn["y_200"]].index)/len(test_data_knn.index))
print(len(test_data_knn[test_data_knn["two_year_recid"] == test_data_knn["y_1000"]].index)/len(test_data_knn.index))


0.605
0.6485
0.684
0.6905
0.6785


In [23]:
len(test_data_knn[test_data_knn["two_year_recid"] == test_data_knn["y_50"]].index)/len(test_data_knn.index)

0.684

# Bayes Classifier

In [24]:
#split data 
train_data_b0 = train_data[train_data.two_year_recid==0].apply(pd.Series.value_counts).fillna(0)
train_data_b1 = train_data[train_data.two_year_recid==1].apply(pd.Series.value_counts).fillna(0)

#get counts and adjust for proportion
counts = train_data.two_year_recid.value_counts()
train_data_b0 = train_data_b0/counts[0]
train_data_b1 = train_data_b1/counts[1]
counts = counts/counts.sum()

#get test data
test_data_bayes = test_data.copy()
test_data_bayes["y"] = -1

#classification
for row in test_data.itertuples():
    p_0 = bayes_prob(row, train_data_b0, attr) * counts[0]
    p_1 = bayes_prob(row, train_data_b1, attr) * counts[1]

    #give label
    if p_0 > p_1:
        test_data_bayes["y"].loc[row[0]] = 0
    else:
        test_data_bayes["y"].loc[row[0]] = 1
        


In [25]:
len(test_data_bayes[test_data_bayes["two_year_recid"] == test_data_bayes["y"]].index)/len(test_data_bayes.index)

0.68

In [26]:
(train_data.iloc[0]-test_data.iloc[0])[attr].values


array([ 0, 37, -1,  0,  0, -1, -5, -1,  1], dtype=int64)