# Fair Bayesian Network Implementation

In this notebook, a fair bayesian network implementation is performed.

## Imports

In [7]:
import pandas as pd
import sys
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, balanced_accuracy_score, roc_curve
import matplotlib.pyplot as plt
from pgmpy.models import NaiveBayes
pd.set_option('display.float_format', lambda x: '%.5f' % x)
label_encoder = LabelEncoder()

module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

from forseti.datproc import translate_categorical
from forseti.bayesnet import latentLabelClassifier

## Clean Data and Train Bayesian Network

In [8]:
df = pd.read_csv('data/adult.csv')
sensitives = ['gender', 'race']
label = 'income'

train = df[:30000]
test = df[30000:]

clf = latentLabelClassifier(
    train,
    sensitives,
    label,
    atol=0.01
)

clf.load('trained-models/fair_model_30000.sav')

## Predict on new data

In [9]:
tmp, _ = translate_categorical(test);
y = tmp['income'];
tmp = tmp.drop('income', axis=1);

In [14]:
y_pred = clf.predict_probability(tmp)

# Replace NA with prior prediction
naindexes = y_pred.isna().any(axis=1)
y_pred[naindexes] = 0
y_pred.to_csv('results/y_pred_probability.csv')

## Save Predictions to File

In [None]:
y_pred_label_fair = (y_pred['fair_0'] >= 0.5).astype('int')
y_pred_label_fair.to_csv('results/y_pred_label_fair.csv')

y_pred_label_fair = (y_pred['income_1'] >= 0.5).astype('int')
y_pred_label_fair.to_csv('results/y_pred_label_income.csv')

# Naive Bayes with sensitive attributes

In [None]:
tmp_train, codes_train = translate_categorical(train.copy(deep=True))

from pgmpy.models import NaiveBayes

model = NaiveBayes()
model.fit(tmp_train, 'income')

## Predict and save predictions

In [None]:
y_pred_naive = model.predict(tmp)
y_pred_naive.to_csv('results/y_pred_label_naive_bayes_sensitive.csv')
y_pred_prob_naive = model.predict_probability(tmp)
y_pred_prob_naive.to_csv('results/y_pred_probability_naive_bayes_sensitive.csv')

100%|██████████| 10925/10925 [02:03<00:00, 88.56it/s] 
  return data.merge(data_with_results, how="left").loc[:, missing_variables]


# Naive Bayes without sensitive attributes

In [None]:
tmp_train, codes_train = translate_categorical(train.copy(deep=True))
tmp_train = tmp_train.drop(sensitives, axis=1)

from pgmpy.models import NaiveBayes

model = NaiveBayes()
model.fit(tmp_train, 'income')

## Predict and save predictions

In [None]:
tmp = tmp.drop(sensitives, axis=1)
y_pred_naive = model.predict(tmp)
y_pred_naive.to_csv('results/y_pred_label_naive_bayes.csv')
y_pred_prob_naive = model.predict_probability(tmp)
y_pred_prob_naive.to_csv('results/y_pred_probability_naive_bayes.csv')

100%|██████████| 9195/9195 [01:08<00:00, 134.10it/s]
  return data.merge(data_with_results, how="left").loc[:, missing_variables]


## Compas Dataset

In [60]:
scores = pd.read_csv('data/compas-two-yrs-recidivism.csv')
features = [
    'sex',
    'age',
    'race',
    'priors_count',
    'juv_fel_count',
    'juv_misd_count',
    'juv_other_count',
    'c_charge_degree',
    'two_year_recid'
]

dataset = scores[features]
dataset['two_year_recid'] = dataset['two_year_recid'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [61]:
tmp, _ = translate_categorical(dataset)
train = tmp[:6000]
label = 'two_year_recid'
sensitives = ['sex', 'race']
y = tmp[6000:][label]
test = tmp[6000:].drop(label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[obj] = dataframe[obj].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[num] = pd.cut(dataframe[num], 5, duplicates='drop')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[cat] = dataframe[cat].astype("category")
A value is trying to be set on a copy of 

In [62]:
clf = latentLabelClassifier(
    train,
    sensitives,
    label,
    atol=0.01
)

clf.fit()

  0%|          | 0/1000000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

## Predict

In [63]:
test_translate, _ = translate_categorical(test)
predict = clf.predict(test_translate)

  0%|          | 0/114 [00:00<?, ?it/s]

  phi.values = phi.values / phi.values.sum()
  phi.values = phi.values / phi.values.sum()


In [64]:
predictprob = clf.predict_probability(test_translate)
predictprob = predictprob.fillna(0)
predict.to_csv('results/pred_compas.csv')
predictprob.to_csv('results/pred_compas_probability.csv')

  phi.values = phi.values / phi.values.sum()


## Naive Bayes Compas with sensitive

In [58]:
tmp_train, codes_train = translate_categorical(train.copy(deep=True))

model = NaiveBayes()
model.fit(tmp_train, label)

y_pred_naive = model.predict(test)
y_pred_naive.to_csv('results/pred_compas_NB_sensitive.csv')
y_pred_prob_naive = model.predict_probability(test)
y_pred_prob_naive.to_csv('results/pred_compas_probability_NB_sensitive.csv')

  0%|          | 0/114 [00:00<?, ?it/s]

## Naive Bayes Compas Without Sensitive

In [59]:
train = train.drop(sensitives, axis=1)
test = test.drop(sensitives, axis=1)

model = NaiveBayes()
model.fit(train, label)

y_pred_naive = model.predict(test)
y_pred_naive.to_csv('results/pred_compas_NB.csv')
y_pred_prob_naive = model.predict_probability(test)
y_pred_prob_naive.to_csv('results/pred_compas_probability_NB.csv')

  0%|          | 0/35 [00:00<?, ?it/s]