# cGAN Generate Synthetic Data for German Dataset

The CTGAN model is a GAN-based deep learning data synthesizer.

In [1]:
from implementation_functions import *

import six
import sys
sys.modules['sklearn.externals.six'] = six
import mlrose

import pandas as pd
import numpy as np
from prince import FAMD #Factor analysis of mixed data
from aif360.metrics import BinaryLabelDatasetMetric
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import skfuzzy as fuzz

In [2]:
# Name of the dataset to load.
data_name = "german"
# Load the dataset together with its privileged / unprivileged group definitions.
# NOTE(review): `aif_data` is presumably provided by the star import from
# `implementation_functions`; the meaning of the second positional argument
# (False) is not visible here — confirm against that helper.
dataset_orig, privileged_groups, unprivileged_groups = aif_data(data_name, False)

In [3]:
# Sensitive attribute columns, the target column, and the values used as the
# favourable (`fav_l`) and unfavourable (`unfav_l`) labels.
sens_attr = ["age", "sex"]
decision_label = "credit"
fav_l, unfav_l = 1, 0

In [4]:
# Preprocess the raw dataset. The returned frame carries a 'sub_labels' column
# (used below); num_list / cat_list are presumably the numerical and
# categorical column names — confirm against `preprocess`.
orig_df, num_list, cat_list = preprocess(dataset_orig, sens_attr, decision_label)

# The list of sub-group sizes in the dataset (to monitor the dist. of sub-groups)
orig_df['sub_labels'].value_counts()

7    447
6    158
3    143
2     62
1     58
5     52
0     47
4     33
Name: sub_labels, dtype: int64

In [5]:
import time

# Record a reference timestamp and immediately report the time elapsed since it.
start_time = time.time()
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 3.504753112792969e-05 seconds ---


In [6]:
# Train-test split WITH stratification on the sub-group labels.
# A fixed seed makes the shuffled split, and everything derived from it
# (sub-group counts, number of synthetic rows, metrics), reproducible on
# Restart & Run All.
SPLIT_SEED = 42
X = orig_df.loc[:, orig_df.columns != decision_label]
# The boolean column mask keeps `y` two-dimensional, exactly as before.
y = orig_df.loc[:, orig_df.columns == decision_label].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,
                                                    shuffle=True,
                                                    stratify=X['sub_labels'],
                                                    random_state=SPLIT_SEED)

In [7]:
# Keep the sub-group labels of the training rows so they can be re-attached
# after the sensitive columns are dropped.
keep_sub_l = X_train['sub_labels']

In [8]:
# Remove the sensitive attributes and the sub-group label from the training features.
X_train_new = X_train.drop(columns=['age', 'sex', 'sub_labels'])

In [9]:
# Re-attach the sub-group labels as a new column (pandas aligns the Series on the row index).
X_train_new['sub_labels'] = keep_sub_l

In [10]:
# Attach the training targets as the 'class_labels' column.
X_train_new['class_labels'] = y_train

In [11]:
# Display the prepared training frame (features + sub_labels + class_labels).
X_train_new

Unnamed: 0,month,credit_amount,investment_as_income_percentage,residence_since,number_of_credits,people_liable_for,status=A11,status=A12,status=A13,status=A14,...,skill_level=A171,skill_level=A172,skill_level=A173,skill_level=A174,telephone=A191,telephone=A192,foreign_worker=A201,foreign_worker=A202,sub_labels,class_labels
61,15.0,1537.0,4.0,4.0,2.0,1,0,1,0,0,...,0,0,1,0,0,1,1,0,7,1
252,30.0,2150.0,4.0,2.0,1.0,1,0,1,0,0,...,0,0,1,0,1,0,1,0,0,0
623,12.0,1858.0,4.0,1.0,1.0,1,1,0,0,0,...,0,0,1,0,1,0,1,0,1,1
447,7.0,2576.0,2.0,2.0,1.0,1,0,1,0,0,...,0,0,1,0,1,0,1,0,7,1
145,48.0,3566.0,4.0,2.0,1.0,1,0,1,0,0,...,0,0,1,0,1,0,1,0,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,24.0,11328.0,2.0,3.0,2.0,1,0,1,0,0,...,0,0,0,1,0,1,1,0,6,0
956,30.0,3656.0,4.0,4.0,2.0,1,0,0,1,0,...,0,1,0,0,1,0,1,0,7,1
739,30.0,4280.0,4.0,4.0,2.0,1,0,1,0,0,...,0,1,0,0,1,0,1,0,2,0
98,36.0,2337.0,4.0,4.0,1.0,1,0,1,0,0,...,0,0,1,0,1,0,1,0,7,1


# Here we start the GAN work

In [12]:
from sdv.tabular import CTGAN
# Fresh, unfitted synthesizer.
# NOTE(review): `model` is not referenced by any of the visible cells — the
# pre-trained `loaded` synthesizer is used for sampling instead.
model = CTGAN()

In [13]:
# Load a previously saved CTGAN synthesizer from disk.
# NOTE(review): the .pkl file is presumably a pickle — only load it from a trusted source.
loaded = CTGAN.load('my_fariness_German_V3.pkl')

Generating synthetic rows for `X_train`: the following cell computes how many samples must be generated for each sub-group.

In [14]:
# This cell is responsible for specifying the number of samples that need to be
# generated using the GAN, according to the sub_labels.
# The training rows per sub-group are counted once (instead of recounting on
# every loop iteration), and the sub-groups are taken from the data rather
# than assuming there are exactly 8 of them.
sub_label_counts = X_train["sub_labels"].value_counts().sort_index()
available_rows = {int(label): int(count) for label, count in sub_label_counts.items()}

# Size and label of the largest sub-group; the other groups are topped up to this size.
target_rows = max(available_rows.values())
max_label = max(available_rows, key=available_rows.get)
print(target_rows)
print(max_label)

313
7


In [15]:
# Empty frame that will hold the generated (synthetic) rows.
main_df = pd.DataFrame()

In [16]:
# Generate the missing rows for every sub-group except the largest one and
# combine them with a single concat (instead of re-copying the growing frame on
# every iteration). The result is rebuilt from scratch, so re-running this cell
# does not append the synthetic rows a second time.
synthetic_parts = []
for key, value in available_rows.items():
    if int(key) == int(max_label):
        continue
    needed_rows = int(target_rows - value)
    if needed_rows <= 0:
        # This group is already as large as the largest one; nothing to generate.
        continue
    conditions = {
        "sub_labels" : int(key),
    }
    synthetic_parts.append(loaded.sample(needed_rows, conditions=conditions))

main_df = pd.concat(synthetic_parts) if synthetic_parts else pd.DataFrame()

print(len(main_df.index))

1804


In [17]:
# Display the generated (synthetic) rows.
main_df

Unnamed: 0,month,credit_amount,investment_as_income_percentage,residence_since,number_of_credits,people_liable_for,status=A11,status=A12,status=A13,status=A14,...,skill_level=A171,skill_level=A172,skill_level=A173,skill_level=A174,telephone=A191,telephone=A192,foreign_worker=A201,foreign_worker=A202,class_labels,sub_labels
0,20.0,2967.0,4.0,4.0,1.0,2,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
1,14.0,2454.0,4.0,3.0,2.0,2,0,1,0,0,...,0,0,1,0,1,1,1,0,1,0
2,33.0,2752.0,4.0,2.0,2.0,1,1,0,0,0,...,0,1,1,0,1,1,1,0,1,0
3,14.0,11494.0,4.0,4.0,1.0,1,0,1,0,1,...,0,0,1,0,0,0,1,0,1,0
4,18.0,18208.0,4.0,4.0,1.0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,30.0,1387.0,2.0,2.0,2.0,1,0,0,0,0,...,0,1,0,0,1,0,1,0,1,6
198,22.0,2413.0,2.0,4.0,2.0,1,1,1,0,0,...,0,0,0,0,1,0,1,0,0,6
199,42.0,1941.0,4.0,4.0,1.0,2,0,0,0,1,...,0,0,1,0,1,0,1,1,1,6
200,9.0,18424.0,4.0,1.0,2.0,1,0,0,0,0,...,0,0,1,0,1,0,1,0,1,6


In [18]:
# Sanity check: the sampled data is a pandas DataFrame.
print(type(main_df))

<class 'pandas.core.frame.DataFrame'>


We do not use any of the sensitive attributes: we try to be blind by not using these obvious attributes. We delete the sensitive attributes, but there is still bias. Deleting them will not remove the bias, but as a first step we are trying to remove the traces of bias by deleting the sensitive attributes and applying the technique.

# Extreme Gradient Boosting Classifier

In [19]:
# Type the desired classifier to train the classification models with model obj
# NOTE(review): despite the `xgb` name, this instantiates GradientBoostingClassifier
# (presumably scikit-learn's, brought in by the star import) — not XGBoost. No seed
# is passed, so the fitted model may differ between runs.
xgb = GradientBoostingClassifier()
# Baseline run on the original (non-augmented) training data, which still
# contains the sensitive attributes and sub_labels.
baseline_stats, cm, ratio_table, preds = baseline_metrics(xgb, X_train, X_test, 
                                                  y_train, y_test, sens_attr, 
                                                  fav_l, unfav_l)

In [20]:
# Show the ratio table returned by the baseline run (one row per age/sex sub-group).
print(ratio_table)

                      Base Ratio  Positive Ratio  Negative Ratio
Index                                                           
{'age': 0, 'sex': 0}    0.103333        0.050000        0.053333
{'age': 1, 'sex': 0}    0.206667        0.143333        0.063333
{'age': 0, 'sex': 1}    0.086667        0.053333        0.033333
{'age': 1, 'sex': 1}    0.603333        0.466667        0.136667


In [21]:
# Set the test sub-group labels aside, then strip the sensitive attributes and
# the sub-group label from the test features.
test_sublabels = X_test['sub_labels']
X_test_n = X_test.drop(columns=['age', 'sex', 'sub_labels'])
# Rebinds num_list / cat_list from the reduced test frame.
num_list, cat_list = type_lists(X_test_n)

In [22]:
# Stack the synthetic rows on top of the original training rows. Columns are
# matched by name; the original row indexes of both frames are kept, so index
# values may repeat.
final_df = pd.concat([main_df, X_train_new])

In [23]:
# Display the augmented training frame (synthetic + original rows).
final_df

Unnamed: 0,month,credit_amount,investment_as_income_percentage,residence_since,number_of_credits,people_liable_for,status=A11,status=A12,status=A13,status=A14,...,skill_level=A171,skill_level=A172,skill_level=A173,skill_level=A174,telephone=A191,telephone=A192,foreign_worker=A201,foreign_worker=A202,class_labels,sub_labels
0,20.0,2967.0,4.0,4.0,1.0,2,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
1,14.0,2454.0,4.0,3.0,2.0,2,0,1,0,0,...,0,0,1,0,1,1,1,0,1,0
2,33.0,2752.0,4.0,2.0,2.0,1,1,0,0,0,...,0,1,1,0,1,1,1,0,1,0
3,14.0,11494.0,4.0,4.0,1.0,1,0,1,0,1,...,0,0,1,0,0,0,1,0,1,0
4,18.0,18208.0,4.0,4.0,1.0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,24.0,11328.0,2.0,3.0,2.0,1,0,1,0,0,...,0,0,0,1,0,1,1,0,0,6
956,30.0,3656.0,4.0,4.0,2.0,1,0,0,1,0,...,0,1,0,0,1,0,1,0,1,7
739,30.0,4280.0,4.0,4.0,2.0,1,0,1,0,0,...,0,1,0,0,1,0,1,0,0,2
98,36.0,2337.0,4.0,4.0,1.0,1,0,1,0,0,...,0,0,1,0,1,0,1,0,1,7


In [24]:
# Check the sub-group sizes of the augmented training frame.
final_df["sub_labels"].value_counts()

7    313
6    313
5    313
4    313
3    313
2    313
1    313
0    313
Name: sub_labels, dtype: int64

In [26]:
# Predicting the test set using the gradient boosting classifier.
# NOTE(review): `predict_whole_set` presumably fits `xgb` on `final_df` before
# predicting `X_test_n` — confirm against the helper.
X_test_pred_xgb= predict_whole_set(xgb, final_df, X_test_n)

In [27]:
# Compute the metrics table, confusion matrix and ratio table for the
# gradient boosting predictions on the test set.
metrics_table1, cm1, ratio_t1 = metrics_calculate(X_test, X_test_pred_xgb, y_test,
                                                  sens_attr, fav_l, unfav_l)

In [28]:
# Outputs from strategy 1 (gradient boosting trained on the augmented data)
print(metrics_table1)
print("Confusion Matrix:", cm1)
print(ratio_t1)

                                              AEO Difference  \
[{'age': 0, 'sex': 0}][{'age': 1, 'sex': 0}]        0.000532   
[{'age': 1, 'sex': 0}][{'age': 0, 'sex': 1}]       -0.037087   
[{'age': 0, 'sex': 1}][{'age': 1, 'sex': 1}]        0.014957   
[{'age': 0, 'sex': 0}][{'age': 0, 'sex': 1}]       -0.036555   
[{'age': 1, 'sex': 0}][{'age': 1, 'sex': 1}]       -0.022130   
[{'age': 0, 'sex': 0}][{'age': 1, 'sex': 1}]       -0.021597   

                                              Disparate Impact Ratio  \
[{'age': 0, 'sex': 0}][{'age': 1, 'sex': 0}]                1.000000   
[{'age': 1, 'sex': 0}][{'age': 0, 'sex': 1}]                0.943548   
[{'age': 0, 'sex': 1}][{'age': 1, 'sex': 1}]                1.012587   
[{'age': 0, 'sex': 0}][{'age': 0, 'sex': 1}]                0.943548   
[{'age': 1, 'sex': 0}][{'age': 1, 'sex': 1}]                0.955425   
[{'age': 0, 'sex': 0}][{'age': 1, 'sex': 1}]                0.955425   

                                              

# Random Forest Classifier

In [41]:
# Baseline outputs using the random forest classifier.
# Note: this rebinds baseline_stats, cm, ratio_table and preds from the
# gradient boosting baseline cell.
RF= RandomForestClassifier() 
baseline_stats, cm, ratio_table, preds = baseline_metrics(RF, X_train, X_test, 
                                                  y_train, y_test, sens_attr, 
                                                  fav_l, unfav_l)

In [42]:
# Predicting the test set based on strategy 1, using the random forest
# classifier and the augmented training frame `final_df`.
X_test_pred_RF= predict_whole_set(RF, final_df, X_test_n)

In [43]:
# Compute the metrics for the random forest predictions.
# Note: this rebinds metrics_table1, cm1 and ratio_t1 from the gradient boosting run.
metrics_table1, cm1, ratio_t1 = metrics_calculate(X_test, X_test_pred_RF, y_test,
                                                  sens_attr, fav_l, unfav_l)

In [44]:
# Outputs from the random forest run: metrics table, confusion matrix, ratio table.
print(metrics_table1)
print("Confusion Matrix:", cm1)
print(ratio_t1)

                                              AEO Difference  \
[{'age': 0, 'sex': 0}][{'age': 1, 'sex': 0}]       -0.019664   
[{'age': 1, 'sex': 0}][{'age': 0, 'sex': 1}]       -0.085641   
[{'age': 0, 'sex': 1}][{'age': 1, 'sex': 1}]        0.069337   
[{'age': 0, 'sex': 0}][{'age': 0, 'sex': 1}]       -0.105305   
[{'age': 1, 'sex': 0}][{'age': 1, 'sex': 1}]       -0.016304   
[{'age': 0, 'sex': 0}][{'age': 1, 'sex': 1}]       -0.035967   

                                              Disparate Impact Ratio  \
[{'age': 0, 'sex': 0}][{'age': 1, 'sex': 0}]                0.947368   
[{'age': 1, 'sex': 0}][{'age': 0, 'sex': 1}]                0.956129   
[{'age': 0, 'sex': 1}][{'age': 1, 'sex': 1}]                1.042146   
[{'age': 0, 'sex': 0}][{'age': 0, 'sex': 1}]                0.905806   
[{'age': 1, 'sex': 0}][{'age': 1, 'sex': 1}]                0.996427   
[{'age': 0, 'sex': 0}][{'age': 1, 'sex': 1}]                0.943983   

                                              