# cGAN Generate Synthetic Data for Compas Dataset

CTGAN model is based on the GAN-based Deep Learning data synthesizer 

In [1]:
from implementation_functions import *

import pandas as pd
import numpy as np

from prince import FAMD #Factor analysis of mixed data
from aif360.metrics import BinaryLabelDatasetMetric
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import skfuzzy as fuzz

In [2]:
data_name = "compas"
dataset_orig, privileged_groups, unprivileged_groups = aif_data(data_name, False)



In [3]:
#assign the sensitive attr and decision labels
sens_attr = ['race', 'sex']
decision_label = 'two_year_recid'
fav_l = 1
unfav_l = 0

In [4]:
orig_df, num_list, cat_list = preprocess(dataset_orig, sens_attr, decision_label)

In [5]:
#switch the dataset labels other way around for easier interpretation
orig_df['transf_labels'] = np.where(orig_df['two_year_recid']== 0, 1, 0)
decision_label = 'transf_labels'
orig_df = orig_df.drop('two_year_recid', axis=1)

orig_df, num_list, cat_list = preprocess(orig_df, sens_attr, decision_label)
orig_df['sub_labels'].value_counts()

0    1458
1    1166
3     968
2     652
5     346
7     310
4     203
6     170
Name: sub_labels, dtype: int64

In [6]:
import time
start_time = time.time()
print("--- %s seconds ---" % (time.time() - start_time))

--- 9.1552734375e-05 seconds ---


In [7]:
# Train-test split WITH stratification
X = orig_df.loc[:, orig_df.columns != decision_label]
y = orig_df.loc[:, orig_df.columns == decision_label].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, 
                                                    shuffle=True,
                                                    stratify=X['sub_labels'])

In [8]:
keep_sub_l = X_train['sub_labels']

In [9]:
X_train_new = X_train.drop(['age', 'sex', 'sub_labels'], axis=1)

In [10]:
X_train_new['sub_labels'] = keep_sub_l

In [11]:
X_train_new['class_labels'] = y_train

In [12]:
X_train_new

Unnamed: 0,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age_cat=25 - 45,age_cat=Greater than 45,age_cat=Less than 25,c_charge_degree=F,c_charge_degree=M,...,c_charge_desc=Viol Injunction Protect Dom Vi,c_charge_desc=Viol Pretrial Release Dom Viol,c_charge_desc=Viol Prot Injunc Repeat Viol,c_charge_desc=Violation License Restrictions,c_charge_desc=Violation Of Boater Safety Id,c_charge_desc=Violation of Injunction Order/Stalking/Cyberstalking,c_charge_desc=Voyeurism,c_charge_desc=arrest case no charge,sub_labels,class_labels
6161,0,0.0,0.0,0.0,6.0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,1,1
9907,0,0.0,0.0,0.0,7.0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4825,0,0.0,0.0,0.0,0.0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
9515,0,0.0,0.0,0.0,1.0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,1
10758,0,0.0,0.0,0.0,4.0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
823,0,0.0,0.0,0.0,14.0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5729,1,0.0,1.0,0.0,4.0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,2,0
2516,0,0.0,0.0,0.0,1.0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,1,1
3331,0,0.0,0.0,0.0,3.0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Here we start the GAN work

In [13]:
from sdv.tabular import CTGAN
model = CTGAN()

In [14]:
loaded = CTGAN.load('my_fariness_Compas_V3.pkl')

In [15]:
# print(X_train.loc[:,"sub_labels"])
available_rows = {}
for row_count in range(8):
    available_rows[row_count] = X_train["sub_labels"].value_counts()[row_count]
        
target_rows = max(available_rows.values())
max_label = max(available_rows, key=available_rows.get)
print(target_rows)
print(max_label)
# print(X_train["sub_labels"].value_counts())


1021
0


In [16]:
main_df = pd.DataFrame()

In [17]:
for key, value in available_rows.items():
    if int(key) != int(max_label):
        conditions = {
            "sub_labels" : int(key),
        }
        needed_rows = target_rows - value
        main_df = pd.concat([main_df, loaded.sample(needed_rows, conditions=conditions)])
        
print(len(main_df.index))

4477


# Extreme Gradient Boosting Classifier

In [18]:
# Type the desired classifier to train the classification models with model obj
xgb= GradientBoostingClassifier()
baseline_stats, cm, ratio_table, preds = baseline_metrics(xgb, X_train, X_test, 
                                                  y_train, y_test, sens_attr, 
                                                  fav_l, unfav_l)

In [19]:
test_sublabels = X_test['sub_labels']
X_test_n = X_test.drop(['race', 'sex','sub_labels'], axis=1)
num_list, cat_list = type_lists(X_test_n)

In [20]:
final_df = pd.concat([main_df, X_train_new])

In [21]:
final_df 

Unnamed: 0,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age_cat=25 - 45,age_cat=Greater than 45,age_cat=Less than 25,c_charge_degree=F,c_charge_degree=M,...,c_charge_desc=Viol Injunction Protect Dom Vi,c_charge_desc=Viol Pretrial Release Dom Viol,c_charge_desc=Viol Prot Injunc Repeat Viol,c_charge_desc=Violation License Restrictions,c_charge_desc=Violation Of Boater Safety Id,c_charge_desc=Violation of Injunction Order/Stalking/Cyberstalking,c_charge_desc=Voyeurism,c_charge_desc=arrest case no charge,sub_labels,class_labels
0,1,0.0,0.0,0.0,0.0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1,1
1,0,0.0,0.0,0.0,4.0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1,1
2,1,0.0,0.0,0.0,2.0,1,0,0,1,1,...,0,0,0,0,0,0,0,1,1,1
3,0,0.0,0.0,0.0,4.0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1,1
4,0,0.0,0.0,1.0,6.0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
823,0,0.0,0.0,0.0,14.0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5729,1,0.0,1.0,0.0,4.0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,2,0
2516,0,0.0,0.0,0.0,1.0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,1,1
3331,0,0.0,0.0,0.0,3.0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
final_df['sub_labels'].value_counts()

7    1021
6    1021
5    1021
4    1021
3    1021
2    1021
1    1021
0    1021
Name: sub_labels, dtype: int64

In [23]:
print(ratio_table)

                       Base Ratio  Positive Ratio  Negative Ratio
Index                                                            
{'race': 0, 'sex': 0}    0.497472        0.204804        0.292668
{'race': 1, 'sex': 0}    0.307206        0.213021        0.094185
{'race': 0, 'sex': 1}    0.104298        0.067636        0.036662
{'race': 1, 'sex': 1}    0.091024        0.069532        0.021492


In [24]:
# Predicting the test sets using the extreme gradient boosting 
X_test_pred_xgb = predict_whole_set(xgb, final_df, X_test_n)

In [25]:
metrics_table1, cm1, ratio_t1 = metrics_calculate(X_test, X_test_pred_xgb, y_test,
                                                  sens_attr, fav_l, unfav_l)


In [26]:
#outputs from strategy 1
print(metrics_table1)
print("Confusion Matrix:", cm1)
print(ratio_t1)

                                                AEO Difference  \
[{'race': 0, 'sex': 0}][{'race': 1, 'sex': 0}]       -0.119925   
[{'race': 1, 'sex': 0}][{'race': 0, 'sex': 1}]       -0.034355   
[{'race': 0, 'sex': 1}][{'race': 1, 'sex': 1}]        0.007313   
[{'race': 0, 'sex': 0}][{'race': 0, 'sex': 1}]       -0.154280   
[{'race': 1, 'sex': 0}][{'race': 1, 'sex': 1}]       -0.027042   
[{'race': 0, 'sex': 0}][{'race': 1, 'sex': 1}]       -0.146967   

                                                Disparate Impact Ratio  \
[{'race': 0, 'sex': 0}][{'race': 1, 'sex': 0}]                0.824351   
[{'race': 1, 'sex': 0}][{'race': 0, 'sex': 1}]                0.972639   
[{'race': 0, 'sex': 1}][{'race': 1, 'sex': 1}]                0.993566   
[{'race': 0, 'sex': 0}][{'race': 0, 'sex': 1}]                0.801796   
[{'race': 1, 'sex': 0}][{'race': 1, 'sex': 1}]                0.966382   
[{'race': 0, 'sex': 0}][{'race': 1, 'sex': 1}]                0.796638   

                  

# Random Forest Classifer

In [29]:
# Type the desired classifier to train the classification models with model obj
RF= GradientBoostingClassifier()
baseline_stats, cm, ratio_table, preds = baseline_metrics(xgb, X_train, X_test, 
                                                  y_train, y_test, sens_attr, 
                                                  fav_l, unfav_l)

In [32]:
print(ratio_table)

                       Base Ratio  Positive Ratio  Negative Ratio
Index                                                            
{'race': 0, 'sex': 0}    0.497472        0.205436        0.292035
{'race': 1, 'sex': 0}    0.307206        0.211125        0.096081
{'race': 0, 'sex': 1}    0.104298        0.067636        0.036662
{'race': 1, 'sex': 1}    0.091024        0.069532        0.021492


In [31]:
# Predicting the test sets using the extreme gradient boosting 
X_test_pred_RF = predict_whole_set(RF, final_df, X_test_n)

In [33]:
metrics_table1, cm1, ratio_t1 = metrics_calculate(X_test, X_test_pred_RF, y_test,
                                                  sens_attr, fav_l, unfav_l)

In [34]:
#outputs from strategy 1
print(metrics_table1)
print("Confusion Matrix:", cm1)
print(ratio_t1)

                                                AEO Difference  \
[{'race': 0, 'sex': 0}][{'race': 1, 'sex': 0}]       -0.119925   
[{'race': 1, 'sex': 0}][{'race': 0, 'sex': 1}]       -0.034355   
[{'race': 0, 'sex': 1}][{'race': 1, 'sex': 1}]        0.007313   
[{'race': 0, 'sex': 0}][{'race': 0, 'sex': 1}]       -0.154280   
[{'race': 1, 'sex': 0}][{'race': 1, 'sex': 1}]       -0.027042   
[{'race': 0, 'sex': 0}][{'race': 1, 'sex': 1}]       -0.146967   

                                                Disparate Impact Ratio  \
[{'race': 0, 'sex': 0}][{'race': 1, 'sex': 0}]                0.824351   
[{'race': 1, 'sex': 0}][{'race': 0, 'sex': 1}]                0.972639   
[{'race': 0, 'sex': 1}][{'race': 1, 'sex': 1}]                0.993566   
[{'race': 0, 'sex': 0}][{'race': 0, 'sex': 1}]                0.801796   
[{'race': 1, 'sex': 0}][{'race': 1, 'sex': 1}]                0.966382   
[{'race': 0, 'sex': 0}][{'race': 1, 'sex': 1}]                0.796638   

                  