In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Experiment identifiers; they form the last two components of save_path.
exp_name = "exp_compass"
exp_number = "exp_debug"

# Project root. The default is the original author's checkout; set the
# BAYESIAN_FAIRNESS_BASE environment variable to run the notebook elsewhere
# without editing this cell.
base_path = os.environ.get(
    "BAYESIAN_FAIRNESS_BASE",
    "/Users/andreasathanasopoulos/Phd/projects/bayesian_fairness/",
)

# os.path.join is used instead of string concatenation so that a trailing
# slash on base_path no longer yields a doubled "//" in the derived paths.
# Both results are plain str, so later `data_path + "/file"` keeps working.
_code_root = os.path.join(base_path, "my_code", "Bayesian-fairness")
data_path = os.path.join(_code_root, "data")
save_path = os.path.join(_code_root, "results", "continuous", exp_name, exp_number)

# load data


In [3]:
def encode_data(data, unique_values):
    """Encode each row of ``data`` as the index of its matching row in ``unique_values``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to encode, shape ``(n_rows, n_features)``.
    unique_values : numpy.ndarray
        2-D array of candidate rows, shape ``(n_unique, n_features)``, with
        columns in the same order as ``data``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``n_rows``; entry ``i`` is the index of the
        first row of ``unique_values`` that equals row ``i`` of ``data``.

    Raises
    ------
    ValueError
        If some row of ``data`` equals no row of ``unique_values``. (Previously
        such rows were silently encoded as 0, because ``argmax`` of an
        all-False mask is 0.)
    """
    rows = data.values
    if len(rows) == 0:
        return np.array([], dtype=int)

    candidates = np.asarray(unique_values)
    # Broadcast to (n_rows, n_unique, n_features) and reduce over features:
    # matches[i, k] is True when data row i equals candidate row k. This
    # replaces the per-row np.append loop, which re-copied the result array
    # on every iteration.
    matches = (rows[:, None, :] == candidates[None, :, :]).all(axis=2)

    unmatched = ~matches.any(axis=1)
    if unmatched.any():
        raise ValueError(
            f"{int(unmatched.sum())} row(s) of data match no row of unique_values"
        )

    # argmax over a boolean row returns the position of the first True.
    return matches.argmax(axis=1).astype(int)

In [4]:
# Set attributes: the column groups used throughout the notebook.
# Z: columns that are jointly encoded into a single index further down.
Z_atr = ["sex", "race"]
# X: feature columns passed to the models as inputs.
X_atr = [
    "age_cat",
    "juv_fel_count",
    "juv_misd_count",
    "juv_other_count",
    "priors_count",
    "c_charge_degree",
]
# Y: outcome column.
Y_atr = "two_year_recid"

# clip_features = ["juv_fel_count", "juv_misd_count", "juv_other_count", "priors_count"]

In [5]:
# Load the COMPAS CSV found under data_path into a DataFrame.
dataset = pd.read_csv(data_path + "/compas.csv")

In [6]:
# Distinct values (distinct rows for the multi-column groups) of each
# column group in the full dataset.
unique_z = np.unique(dataset[Z_atr].values, axis=0)
unique_y = np.unique(dataset[Y_atr].values, axis=0)
unique_x = np.unique(dataset[X_atr].values, axis=0)

# Number of distinct values per group.
n_z, n_y, n_x = len(unique_z), len(unique_y), len(unique_x)

In [7]:
# Report how many distinct values each column group has.
for label, count in (
    ("Unique Z values:", n_z),
    ("Unique X values:", n_x),
    ("Unique Y values:", n_y),
):
    print(label, count)

Unique Z values: 12
Unique X values: 604
Unique Y values: 2


In [8]:
# Encode z for convenience: collapse the Z_atr columns into one integer code
# per row, namely the row's index in unique_z.
# NOTE: this cell must run while Z_atr is still the list of columns; once
# Z_atr is rebound to "z" below, dataset[Z_atr] is a single column and this
# call no longer applies.
dataset["z"] = encode_data(dataset[Z_atr], unique_values=unique_z)

In [9]:
# From here on Z_atr names the single encoded column instead of the original
# list of columns.
# NOTE(review): this changes Z_atr from a list to a str, so earlier cells that
# index dataset[Z_atr] are not safe to re-run after this one.
Z_atr = "z"

In [10]:
# Sequential split (no shuffling): the first N_TRAIN rows go to train_data,
# the remaining rows to test_data.
N_TRAIN = 6000
train_data = dataset.iloc[:N_TRAIN]
test_data = dataset.iloc[N_TRAIN:]

In [11]:
# Report the (rows, columns) shape of each split.
for split_label, split_frame in (
    ("training size:", train_data),
    ("testing size:", test_data),
):
    print(split_label, split_frame.shape)

training size: (6000, 10)
testing size: (1214, 10)


# model

In [12]:
def calc_freq(data, n):
    """Relative frequency of each integer ``0 .. n-1`` in a 1-D array.

    Entry ``k`` of the returned float array is the fraction of elements of
    ``data`` that equal ``k``. Elements outside ``0 .. n-1`` still count
    towards the denominator, so the result need not sum to 1.
    """
    return np.fromiter(
        ((data == k).mean() for k in range(n)),
        dtype=float,
        count=n,
    )

# calc Py

In [21]:
# calc Py: empirical frequency of each label value 0..n_y-1 in test_data.
# NOTE(review): estimated from test_data rather than train_data — confirm
# this is intended.
Py = calc_freq(test_data[Y_atr], n_y)
Py

array([0.54118616, 0.45881384])

# calc Pz_y

In [22]:

# Empirical P(z | y): column y holds the frequency of each encoded z value
# among the test_data rows whose label equals y.
Pz_y = np.zeros((n_z, n_y))
y_labels = test_data[Y_atr]
for y in range(n_y):
    z_given_y = test_data.loc[y_labels == y, Z_atr]
    Pz_y[:, y] = calc_freq(z_given_y, n_z)

In [23]:
# Display the table: rows index the encoded z value, columns index y.
Pz_y

array([[0.0456621 , 0.03052065],
       [0.34855403, 0.50448833],
       [0.29680365, 0.2064632 ],
       [0.08066971, 0.06642729],
       [0.00152207, 0.00359066],
       [0.        , 0.00179533],
       [0.01674277, 0.005386  ],
       [0.09589041, 0.1059246 ],
       [0.0913242 , 0.05924596],
       [0.02283105, 0.01436266],
       [0.        , 0.        ],
       [0.        , 0.00179533]])

# calc P(y | x)

In [61]:
# !pip install -U scikit-learn

In [62]:
# logistic regression to model P(y|x)

from sklearn.linear_model import LogisticRegression
# NOTE(review): the model is fitted on test_data, not train_data — confirm
# this is intended.
# It uses the default max_iter, whereas the P(z | y, x) model further down
# sets max_iter=1000. The name model_y_x is also reassigned in that later
# section, so this fitted model is only reachable until then.
model_y_x = LogisticRegression()
model_y_x.fit(X = test_data[X_atr], y = test_data[Y_atr])

# todo do cross validation for the hyperparameter
# todo check better model?

In [63]:
# some tests

# from sklearn.linear_model import LogisticRegression
# model_y_x = LogisticRegression().fit(X = train_data[X_atr], y = train_data[Y_atr])
# acc = model_y_x.score(test_data[X_atr],test_data[Y_atr])
# acc # not good acc on testset

# from sklearn.ensemble import RandomForestClassifier
# model_y_x = RandomForestClassifier()
# model_y_x.fit(X = train_data[X_atr], y = train_data[Y_atr])
# acc = model_y_x.score(test_data[X_atr],test_data[Y_atr])
# acc # not good acc on testset

# calc P(z | y, x)

In [85]:
# logistic regression to model P(z|x,y)

from sklearn.linear_model import LogisticRegression

# Inputs for P(z | x, y): the feature columns plus the outcome column.
input_features = X_atr + [Y_atr]

# This classifier predicts the encoded z column, so it gets a name that says
# so instead of reusing the P(y | x) model's name.
model_z_xy = LogisticRegression(max_iter=1000)

# Backward-compatible alias: other cells still refer to this model as
# model_y_x. NOTE(review): the alias still replaces the P(y | x) model fitted
# earlier; once callers use model_z_xy, drop this line to keep both models.
model_y_x = model_z_xy

# NOTE(review): fitted on test_data, not train_data — confirm this is intended.
model_z_xy.fit(X=test_data[input_features], y=test_data[Z_atr])

# todo do cross validation for the hyperparameter
# todo check better model?

In [86]:
# Score the model currently bound to model_y_x on train_data, using the same
# input columns (input_features = X_atr + [Y_atr]) it was fitted with.
model_y_x.score(train_data[input_features], train_data[Z_atr])

0.44966666666666666

In [87]:
# #some tests

# from sklearn.linear_model import LogisticRegression
# model_y_x = LogisticRegression().fit(X = train_data[input_features], y = train_data[Z_atr])
# acc = model_y_x.score(test_data[input_features],test_data[Z_atr])
# print("accuracy logistic regression :", acc) # not good acc on testset

# from sklearn.ensemble import RandomForestClassifier
# model_y_x = RandomForestClassifier()
# model_y_x.fit(X = train_data[input_features], y = train_data[Z_atr])
# acc = model_y_x.score(test_data[input_features], test_data[Z_atr])
# print("accuracy RandomForest Classifier :", acc) # not good acc on testset