In [1]:
import aif360
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import ClassificationMetric

pip install 'aif360[LawSchoolGPA]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'


In [2]:
# protected_attributes = ['age', 'gender', 'bmi']
# metrics = ['Error Rate Difference (ERD)', 'False Positive Rate Difference (FPRD)', 'False Discovery Rate Difference (FDRD)', 'Positive Predicted Value Difference (PPVD)']
protected_attributes = ['age', 'gender', 'bmi']
metrics = ['Error Rate Difference (ERD)', 'False Omission Rate Difference (FORD)', 'Statistical Parity Difference (SPD)', 'Negative Predicted Value Difference (NPVD)']

In [3]:
# create the table to be printed
df_print = pd.DataFrame(columns=protected_attributes, index=[metrics])

## Generic Model

In [4]:
from sklearn.preprocessing import LabelEncoder

# Assuming your dataframe is named df
df = pd.read_csv('Output_Files/LifeSnaps_Generic_Model.csv')

# Instantiate LabelEncoder
label_encoder = LabelEncoder()

# Encode 'age' column
df['age'] = label_encoder.fit_transform(df['age'])

# Encode 'gender' column
df['gender'] = label_encoder.fit_transform(df['gender'])

# Display the modified dataframe
print(df)

                            id  y_true  y_pred  age  gender  bmi
0     621e2e8e67b776a24055b564     0.0       0    0       1    0
1     621e2e8e67b776a24055b564     0.0       0    0       1    0
2     621e2e8e67b776a24055b564     0.0       0    0       1    0
3     621e2e8e67b776a24055b564     0.0       1    0       1    0
4     621e2e8e67b776a24055b564     0.0       0    0       1    0
...                        ...     ...     ...  ...     ...  ...
2266  621e30e267b776a240e5bf90     0.0       0    0       1    3
2267  621e30e267b776a240e5bf90     0.0       0    0       1    3
2268  621e30e267b776a240e5bf90     0.0       0    0       1    3
2269  621e30e267b776a240e5bf90     0.0       0    0       1    3
2270  621e30e267b776a240e5bf90     0.0       0    0       1    3

[2271 rows x 6 columns]


In [5]:
for protected_attribute in protected_attributes:
    # prepare the dataset in the format that aif360 requires
    ytrue_df = df['y_true'].to_frame()
    ytrue_df.rename(columns={0: 'y_true'}, inplace=True)
    attribute_df = df[protected_attribute].to_frame()
    attribute_df.rename(columns={0: protected_attribute}, inplace=True)
    attribute_df = pd.concat([attribute_df, ytrue_df], axis=1)

    # create the aif360 dataset with the ground truth
    dataset = BinaryLabelDataset(
        favorable_label=1,
        unfavorable_label=0,
        df=attribute_df,
        label_names=['y_true'],
        protected_attribute_names=[protected_attribute]
    )

    # create the aif360 dataset with the predictions
    dataset_pred = dataset.copy()
    dataset_pred.labels = df['y_pred']

    # creates an object that computes metrics for binary classification
    index = dataset_pred.protected_attribute_names.index(protected_attribute)
    privileged_groups = [{protected_attribute: dataset_pred.privileged_protected_attributes[index][0]}]
    unprivileged_groups = [{protected_attribute: dataset_pred.unprivileged_protected_attributes[index][0]}]
    classified_metric = ClassificationMetric(dataset, dataset_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)

    # fill in the metrics
    df_print[protected_attribute] = [classified_metric.error_rate_difference(), 
                                     classified_metric.false_omission_rate_difference(), 
                                     classified_metric.statistical_parity_difference(), 
                                     classified_metric.negative_predictive_value(privileged=False) - classified_metric.negative_predictive_value(privileged=True)]
df_print

Unnamed: 0,age,gender,bmi
Error Rate Difference (ERD),-0.05657,-0.044389,-0.017106
False Omission Rate Difference (FORD),-0.013629,0.01797,0.024426
Statistical Parity Difference (SPD),-0.085896,-0.070938,-0.04253
Negative Predicted Value Difference (NPVD),0.013629,-0.01797,-0.024426


In [6]:
for protected_attribute in protected_attributes:
    # prepare the dataset in the format that aif360 requires
    ytrue_df = df['y_true'].to_frame()
    ytrue_df.rename(columns={0: 'y_true'}, inplace=True)
    attribute_df = df[protected_attribute].to_frame()
    attribute_df.rename(columns={0: protected_attribute}, inplace=True)
    attribute_df = pd.concat([attribute_df, ytrue_df], axis=1)

    # create the aif360 dataset with the ground truth
    dataset = BinaryLabelDataset(
        favorable_label=1,
        unfavorable_label=0,
        df=attribute_df,
        label_names=['y_true'],
        protected_attribute_names=[protected_attribute]
    )

    # create the aif360 dataset with the predictions
    dataset_pred = dataset.copy()
    dataset_pred.labels = df['y_pred']

    # creates an object that computes metrics for binary classification
    index = dataset_pred.protected_attribute_names.index(protected_attribute)
    privileged_groups = [{protected_attribute: dataset_pred.privileged_protected_attributes[index][0]}]
    unprivileged_groups = [{protected_attribute: dataset_pred.unprivileged_protected_attributes[index][0]}]
    classified_metric = ClassificationMetric(dataset, dataset_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)

    # Print the privileged and unprivileged groups
    print(f"Protected attribute: {protected_attribute}")
    print(f"Privileged groups: {privileged_groups}")
    print(f"Unprivileged groups: {unprivileged_groups}")

    # fill in the metrics
    df_print[protected_attribute] = [classified_metric.error_rate_difference(), 
                                     classified_metric.false_omission_rate_difference(), 
                                     classified_metric.statistical_parity_difference(), 
                                     classified_metric.negative_predictive_value(privileged=False) - classified_metric.negative_predictive_value(privileged=True)]
df_print

Protected attribute: age
Privileged groups: [{'age': 1.0}]
Unprivileged groups: [{'age': 0.0}]
Protected attribute: gender
Privileged groups: [{'gender': 1.0}]
Unprivileged groups: [{'gender': 0.0}]
Protected attribute: bmi
Privileged groups: [{'bmi': 3.0}]
Unprivileged groups: [{'bmi': 0.0}]


Unnamed: 0,age,gender,bmi
Error Rate Difference (ERD),-0.05657,-0.044389,-0.017106
False Omission Rate Difference (FORD),-0.013629,0.01797,0.024426
Statistical Parity Difference (SPD),-0.085896,-0.070938,-0.04253
Negative Predicted Value Difference (NPVD),0.013629,-0.01797,-0.024426


## Single Attribute Splitting

In [6]:
df = pd.read_csv('Output_Files/LifeSnaps_Single_Attribute_Splitting.csv')
df

Unnamed: 0,id,y_true,y_pred,age,gender,bmi
0,621e34ec67b776a240d60873,0,0.0,0.0,0.0,1
1,621e328667b776a240281372,0,0.0,1.0,0.0,3
2,621e36c267b776a240ba2756,0,0.0,0.0,0.0,1
3,621e34ec67b776a240d60873,0,0.0,0.0,0.0,1
4,621e34ec67b776a240d60873,0,0.0,0.0,0.0,1
...,...,...,...,...,...,...
1864,621e30e467b776a240e817c7,0,0.0,0.0,0.0,3
1865,621e30e467b776a240e817c7,1,0.0,0.0,0.0,3
1866,621e30e467b776a240e817c7,0,0.0,0.0,0.0,3
1867,621e30e467b776a240e817c7,1,0.0,0.0,0.0,3


In [7]:
# Check for NaN values
nan_values = df.isna().sum()

# Display the NaN values count
print(nan_values)

id          0
y_true      0
y_pred    119
age         2
gender      2
bmi         0
dtype: int64


In [8]:
# Drop rows with NaN values in 'age' or 'gender' columns
df.dropna(subset=['age', 'gender'], inplace=True)

# Display the modified dataframe
print(df)

                            id  y_true  y_pred  age  gender  bmi
0     621e34ec67b776a240d60873       0     0.0  0.0     0.0    1
1     621e328667b776a240281372       0     0.0  1.0     0.0    3
2     621e36c267b776a240ba2756       0     0.0  0.0     0.0    1
3     621e34ec67b776a240d60873       0     0.0  0.0     0.0    1
4     621e34ec67b776a240d60873       0     0.0  0.0     0.0    1
...                        ...     ...     ...  ...     ...  ...
1864  621e30e467b776a240e817c7       0     0.0  0.0     0.0    3
1865  621e30e467b776a240e817c7       1     0.0  0.0     0.0    3
1866  621e30e467b776a240e817c7       0     0.0  0.0     0.0    3
1867  621e30e467b776a240e817c7       1     0.0  0.0     0.0    3
1868  621e30e467b776a240e817c7       0     0.0  0.0     0.0    3

[1867 rows x 6 columns]


In [9]:
for protected_attribute in protected_attributes:
    # prepare the dataset in the format that aif360 requires
    ytrue_df = df['y_true'].to_frame()
    ytrue_df.rename(columns={0: 'y_true'}, inplace=True)
    attribute_df = df[protected_attribute].to_frame()
    attribute_df.rename(columns={0: protected_attribute}, inplace=True)
    attribute_df = pd.concat([attribute_df, ytrue_df], axis=1)

    # create the aif360 dataset with the ground truth
    dataset = BinaryLabelDataset(
        favorable_label=1,
        unfavorable_label=0,
        df=attribute_df,
        label_names=['y_true'],
        protected_attribute_names=[protected_attribute]
    )

    # create the aif360 dataset with the predictions
    dataset_pred = dataset.copy()
    dataset_pred.labels = df['y_pred']

    # creates an object that computes metrics for binary classification
    index = dataset_pred.protected_attribute_names.index(protected_attribute)
    privileged_groups = [{protected_attribute: dataset_pred.privileged_protected_attributes[index][0]}]
    unprivileged_groups = [{protected_attribute: dataset_pred.unprivileged_protected_attributes[index][0]}]
    classified_metric = ClassificationMetric(dataset, dataset_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)

    # fill in the metrics
    df_print[protected_attribute] = [classified_metric.error_rate_difference(), 
                                     classified_metric.false_omission_rate_difference(), 
                                     classified_metric.statistical_parity_difference(), 
                                     classified_metric.negative_predictive_value(privileged=False) - classified_metric.negative_predictive_value(privileged=True)]
df_print

Unnamed: 0,age,gender,bmi
Error Rate Difference (ERD),0.160422,-0.071209,0.002979
False Omission Rate Difference (FORD),-0.042148,-0.038495,-0.073856
Statistical Parity Difference (SPD),0.053589,0.091709,0.100629
Negative Predicted Value Difference (NPVD),0.042148,0.038495,0.073856


## Multi Attribute Splitting

In [10]:
df = pd.read_csv('Output_Files/LifeSnaps_Multi_Attribute_Splitting.csv')
df

Unnamed: 0,id,y_true,y_pred,age,gender,bmi
0,621e312a67b776a240164d59,0,0.0,1.0,0.0,2
1,621e30f467b776a240f22944,0,0.0,0.0,0.0,1
2,621e339967b776a240e502de,0,0.0,0.0,1.0,1
3,621e312a67b776a240164d59,1,0.0,1.0,0.0,2
4,621e312a67b776a240164d59,1,1.0,1.0,0.0,2
...,...,...,...,...,...,...
1283,621e30e267b776a240e5bf90,0,,0.0,0.0,3
1284,621e30e267b776a240e5bf90,1,,0.0,0.0,3
1285,621e30e267b776a240e5bf90,0,,0.0,0.0,3
1286,621e30e267b776a240e5bf90,0,,0.0,0.0,3


In [11]:
# Check for NaN values
nan_values = df.isna().sum()

# Display the NaN values count
print(nan_values)

id          0
y_true      0
y_pred     27
age       132
gender     80
bmi         0
dtype: int64


In [12]:
# Drop rows with NaN values in 'age' or 'gender' columns
df.dropna(subset=['age', 'gender'], inplace=True)

# Display the modified dataframe
print(df)

                            id  y_true  y_pred  age  gender  bmi
0     621e312a67b776a240164d59       0     0.0  1.0     0.0    2
1     621e30f467b776a240f22944       0     0.0  0.0     0.0    1
2     621e339967b776a240e502de       0     0.0  0.0     1.0    1
3     621e312a67b776a240164d59       1     0.0  1.0     0.0    2
4     621e312a67b776a240164d59       1     1.0  1.0     0.0    2
...                        ...     ...     ...  ...     ...  ...
1283  621e30e267b776a240e5bf90       0     NaN  0.0     0.0    3
1284  621e30e267b776a240e5bf90       1     NaN  0.0     0.0    3
1285  621e30e267b776a240e5bf90       0     NaN  0.0     0.0    3
1286  621e30e267b776a240e5bf90       0     NaN  0.0     0.0    3
1287  621e30e267b776a240e5bf90       0     NaN  0.0     0.0    3

[1156 rows x 6 columns]


In [13]:
for protected_attribute in protected_attributes:
    # prepare the dataset in the format that aif360 requires
    ytrue_df = df['y_true'].to_frame()
    ytrue_df.rename(columns={0: 'y_true'}, inplace=True)
    attribute_df = df[protected_attribute].to_frame()
    attribute_df.rename(columns={0: protected_attribute}, inplace=True)
    attribute_df = pd.concat([attribute_df, ytrue_df], axis=1)

    # create the aif360 dataset with the ground truth
    dataset = BinaryLabelDataset(
        favorable_label=1,
        unfavorable_label=0,
        df=attribute_df,
        label_names=['y_true'],
        protected_attribute_names=[protected_attribute]
    )

    # create the aif360 dataset with the predictions
    dataset_pred = dataset.copy()
    dataset_pred.labels = df['y_pred']

    # creates an object that computes metrics for binary classification
    index = dataset_pred.protected_attribute_names.index(protected_attribute)
    privileged_groups = [{protected_attribute: dataset_pred.privileged_protected_attributes[index][0]}]
    unprivileged_groups = [{protected_attribute: dataset_pred.unprivileged_protected_attributes[index][0]}]
    classified_metric = ClassificationMetric(dataset, dataset_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)

    # fill in the metrics
    df_print[protected_attribute] = [classified_metric.error_rate_difference(), 
                                     classified_metric.false_omission_rate_difference(), 
                                     classified_metric.statistical_parity_difference(), 
                                     classified_metric.negative_predictive_value(privileged=False) - classified_metric.negative_predictive_value(privileged=True)]
df_print

Unnamed: 0,age,gender,bmi
Error Rate Difference (ERD),-0.020544,0.044928,-0.196544
False Omission Rate Difference (FORD),0.006762,0.028415,-0.013653
Statistical Parity Difference (SPD),-0.166201,-0.019003,0.008756
Negative Predicted Value Difference (NPVD),-0.006762,-0.028415,0.013653


## Fuzzy Splitting

In [14]:
df = pd.read_csv('Output_Files/LifeSnaps_Fuzzy_Splitting.csv')
df

Unnamed: 0,id,y_true,y_pred,age,gender,bmi
0,621e2e8e67b776a24055b564,0,0,0.0,0.0,0
1,621e2ed667b776a24085d8d1,0,1,0.0,1.0,1
2,621e2f6167b776a240e082a9,0,0,1.0,1.0,2
3,621e2f9167b776a240011ccb,0,0,1.0,1.0,1
4,621e2f9167b776a240011ccb,1,0,1.0,1.0,1
...,...,...,...,...,...,...
1629,621e30f467b776a240f22944,0,0,0.0,0.0,1
1630,621e30f467b776a240f22944,0,0,0.0,0.0,1
1631,621e30f467b776a240f22944,0,0,0.0,0.0,1
1632,621e30f467b776a240f22944,0,0,0.0,0.0,1


In [15]:
for protected_attribute in protected_attributes:
    # prepare the dataset in the format that aif360 requires
    ytrue_df = df['y_true'].to_frame()
    ytrue_df.rename(columns={0: 'y_true'}, inplace=True)
    attribute_df = df[protected_attribute].to_frame()
    attribute_df.rename(columns={0: protected_attribute}, inplace=True)
    attribute_df = pd.concat([attribute_df, ytrue_df], axis=1)

    # create the aif360 dataset with the ground truth
    dataset = BinaryLabelDataset(
        favorable_label=1,
        unfavorable_label=0,
        df=attribute_df,
        label_names=['y_true'],
        protected_attribute_names=[protected_attribute]
    )

    # create the aif360 dataset with the predictions
    dataset_pred = dataset.copy()
    dataset_pred.labels = df['y_pred']

    # creates an object that computes metrics for binary classification
    index = dataset_pred.protected_attribute_names.index(protected_attribute)
    privileged_groups = [{protected_attribute: dataset_pred.privileged_protected_attributes[index][0]}]
    unprivileged_groups = [{protected_attribute: dataset_pred.unprivileged_protected_attributes[index][0]}]
    classified_metric = ClassificationMetric(dataset, dataset_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)

    # fill in the metrics
    df_print[protected_attribute] = [classified_metric.error_rate_difference(), 
                                     classified_metric.false_omission_rate_difference(), 
                                     classified_metric.statistical_parity_difference(), 
                                     classified_metric.negative_predictive_value(privileged=False) - classified_metric.negative_predictive_value(privileged=True)]
df_print

Unnamed: 0,age,gender,bmi
Error Rate Difference (ERD),0.039421,0.086097,-0.056072
False Omission Rate Difference (FORD),-0.019609,0.057459,-0.035566
Statistical Parity Difference (SPD),0.034156,0.01669,-0.027727
Negative Predicted Value Difference (NPVD),0.019609,-0.057459,0.035566
