<a href="https://colab.research.google.com/github/cclaire0325/cclaire0325.github.io/blob/main/AIES_finalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import the 'drive' module from the 'google.colab' library.
# mount Google Drive to the virtual machine running the Colab notebook.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
file_path = '/content/drive/MyDrive/Colab Notebooks/archive/Data.csv'

import numpy as np

# Load the dataset into a pandas DataFrame
import pandas as pd
data = pd.read_csv(file_path)
# Creates a deep copy of the data DataFrame and stores it in 'a new DataFrame' called 'original_data'.
# a deep copy ensures that any changes made to 'data' will not affect original_data, and vice versa.
# This is useful to preserve the original data for reference or comparison while working with a copy of the data.
original_data = data.copy(deep=True)


# **STEP 2**


In [None]:
# First, check the unique values to understand the categories
print(data['Applicant_Gender'].unique())
print(sorted(data['Applicant_Age'].unique()))

# Count the number of males and females
gender_counts = data['Applicant_Gender'].value_counts()
print(gender_counts)

# Count the number of individuals <45 years old and >=45
age_below_45 = (data['Applicant_Age'] < 45).sum()
age_45_and_above = (data['Applicant_Age'] >= 45).sum()

print(f"Number of individuals < 45 years old: {age_below_45}")
print(f"Number of individuals >= 45 years old: {age_45_and_above}")

['M      ' 'F      ']
[21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68]
F          15627
M           9501
Name: Applicant_Gender, dtype: int64
Number of individuals < 45 years old: 16166
Number of individuals >= 45 years old: 8962


Step 2.2: Discretize Dependent Variables

'Total_Income' and 'Years_of_Working' are continuous and bin them



In [None]:
# First, check the unique values to understand the categories
print(sorted(data['Total_Income'].unique()))
print(sorted(data['Years_of_Working'].unique()))

# Count the number of people with 'Total_Income' less than 180,000 and larger than or equal to 180,000
# .sum() function counts the number of True values in each boolean condition to give the total counts for each category.
income_below_180000 = (data['Total_Income'] < 180000).sum()
income_180000_and_above = (data['Total_Income'] >= 180000).sum()

print(f"Number of people with 'Total_Income' < 180000: {income_below_180000}")
print(f"Number of people with 'Total_Income' >= 180000: {income_180000_and_above}")

# Count the number of people with 'Years_of_Working' less than 7 years and 7 years or more
working_less_than_7_years = (data['Years_of_Working'] < 7).sum()
working_7_years_and_more = (data['Years_of_Working'] >= 7).sum()

print(f"People with 'Years_of_Working' < 7 years: {working_less_than_7_years}")
print(f"People with 'Years_of_Working' >= 7 years: {working_7_years_and_more}")




[27000, 31500, 32400, 36000, 36900, 40500, 45000, 47250, 49500, 51750, 54000, 58500, 60377, 60750, 62654, 63000, 67500, 69372, 69750, 72000, 73350, 74061, 74250, 76500, 78750, 81000, 82350, 85500, 87750, 90000, 90900, 91530, 94500, 95850, 99000, 101250, 103500, 105750, 108000, 112500, 114750, 115200, 116100, 116654, 117000, 118350, 119250, 120150, 121500, 122400, 123750, 124200, 126000, 128543, 129150, 130500, 132372, 132750, 134996, 135000, 139500, 140400, 140850, 143100, 144000, 145350, 146250, 147150, 148500, 153000, 155250, 157500, 159750, 160200, 162000, 164250, 166500, 168750, 170303, 171000, 171261, 173250, 175500, 176175, 176400, 178200, 179100, 179271, 180000, 184136, 184500, 185400, 189000, 190400, 191250, 191700, 193500, 195750, 196650, 198000, 202500, 207000, 211500, 213750, 215100, 215550, 216000, 220500, 222750, 225000, 227250, 229500, 234000, 234135, 238500, 240750, 243000, 247500, 252000, 253800, 254700, 256500, 257625, 261000, 265500, 265581, 267750, 270000, 274500, 27

Step 2.4  Cross-tabulation of each protected class variable that shows the frequency values of its membership categories as a function of the dependent variables

In [None]:
# 2.4.1 Frequency of 'Applicant_Age' (< 45 years and >= 45 years) with 'Total_Income' (< 180000 and >= 180000):
# Create a new column for age categories
data['Age_Category'] = data['Applicant_Age'].apply(lambda x: '< 45' if x < 45 else '>= 45')
print(data['Age_Category'].head())

# Create a new column for income categories
data['Income_Category'] = data['Total_Income'].apply(lambda x: '< 180000' if x < 180000 else '>= 180000')
print(data['Income_Category'].head())

# Cross-tabulation of Age Category and Income Category
# the cells of the table will contain the counts of how many times each combination of 'Age_Category' and 'Income_Category' occurs in dataset.
age_income_frequency = pd.crosstab(data['Age_Category'], data['Income_Category'])
print(age_income_frequency)

#2.4.2 Frequency of 'Applicant_Age' (< 45 years and >= 45 years) with 'Years_of_Working' (< 7 years and >= 7 years):
# Create a new column for working years categories
data['Working_Years_Category'] = data['Years_of_Working'].apply(lambda x: '< 7' if x < 7 else '>= 7')

# Cross-tabulation of Age Category and Working Years Category
age_working_frequency = pd.crosstab(data['Age_Category'], data['Working_Years_Category'])
print(age_working_frequency)

# 2.4.3 Frequency of 'Applicant_Gender' (female and male) with 'Total_Income' (< 180000 and >= 180000):
gender_income_frequency = pd.crosstab(data['Applicant_Gender'], data['Income_Category'])
print(gender_income_frequency)

# 2.4.4 Frequency of 'Applicant_Gender' (female and male) with 'Years_of_Working' (< 7 years and >= 7 years):
# Cross-tabulation of Gender and Working Years Category
gender_working_frequency = pd.crosstab(data['Applicant_Gender'], data['Working_Years_Category'])
print(gender_working_frequency)


0    >= 45
1    >= 45
2    >= 45
3    >= 45
4    >= 45
Name: Age_Category, dtype: object
0     < 180000
1    >= 180000
2    >= 180000
3    >= 180000
4    >= 180000
Name: Income_Category, dtype: object
Income_Category  < 180000  >= 180000
Age_Category                        
< 45                 8205       7961
>= 45                4143       4819
Working_Years_Category   < 7  >= 7
Age_Category                      
< 45                    9636  6530
>= 45                   4061  4901
Income_Category   < 180000  >= 180000
Applicant_Gender                     
F                     8975       6652
M                     3373       6128
Working_Years_Category   < 7  >= 7
Applicant_Gender                  
F                       8044  7583
M                       5653  3848


# **STEP 3**

Step 3.1

Identify Privileged/Unprivileged Groups

*   Privileged group(1) : Male; >= 45 years old
*   Unprivileged group(0): Female; < 45 years old

Convert the 'Total_Income' and 'Years_of_Working' into binary values of 0 and 1

*   1 indicates the favorable outcome (e.g., 'Total_Income' >= 180000 'Years_of_Working' >= 7)
*   0 indicates the unfavorable outcome





In [None]:
# Add new columns to 'data' DataFrame that represent the binary versions of the dependent variables and protected attributes.
# Convert 'Total_Income' and 'Years_of_Working' into binary values
# .astype(int) part of the code converts the True/False values to 1/0.
data['Total_Income_Binary'] = (data['Total_Income'] >= 180000).astype(int)  # 1 for >= 180000, otherwise 0
data['Years_of_Working_Binary'] = (data['Years_of_Working'] >= 7).astype(int)  # 1 for >= 7 years, otherwise 0

# Defining binary variables for protected attributes
print(data['Applicant_Gender'].unique()) # Check unique values in the 'Applicant_Gender' column
data['Applicant_Gender_Binary'] = (data['Applicant_Gender'] == 'M      ').astype(int)  # 1 for Male, otherwise 0
data['Applicant_Age_Binary'] = (data['Applicant_Age'] >= 45).astype(int)  # 1 for age >= 45, otherwise 0

# Verify the binary conversion
print(data[['Applicant_Gender_Binary', 'Total_Income_Binary', 'Applicant_Age_Binary', 'Years_of_Working_Binary']].head(10))


['M      ' 'F      ']
   Applicant_Gender_Binary  Total_Income_Binary  Applicant_Age_Binary  \
0                        1                    0                     1   
1                        0                    1                     1   
2                        0                    1                     1   
3                        0                    1                     1   
4                        0                    1                     1   
5                        1                    1                     1   
6                        1                    0                     1   
7                        1                    0                     1   
8                        1                    0                     1   
9                        1                    0                     1   

   Years_of_Working_Binary  
0                        0  
1                        1  
2                        1  
3                        1  
4                        1  


Step 3.2 Selecting and Computing Fairness Metrics

*   Statistical Parity Difference: Measures the difference in the rate of favorable outcomes received by the unprivileged group to the privileged group.

*   Disparate Impact: Measures the ratio of rates of favorable outcome for the unprivileged group to that of the privileged group.



In [None]:
def compute_fairness_metrics(data, protected_attribute, outcome_var):
    # Calculate the probabilities for positive outcomes for privileged and unprivileged groups
    prob_privileged = data[data[protected_attribute] == 1][outcome_var].mean()
    prob_unprivileged = data[data[protected_attribute] == 0][outcome_var].mean()

    # Calculate Statistical Parity Difference
    spd = prob_unprivileged - prob_privileged

    # Calculate Disparate Impact
    di = prob_unprivileged / prob_privileged

    return spd, di

# Define your protected attributes and outcome variables
protected_attributes = ['Applicant_Gender_Binary', 'Applicant_Age_Binary']
outcome_variables = ['Total_Income_Binary', 'Years_of_Working_Binary']

# Create a dictionary to hold the results
fairness_metrics = {}

# Iterate over each combination of protected attributes (gender/age) and outcome variables(years of working/total income)
for protected in protected_attributes:
    for outcome in outcome_variables:
        spd, di = compute_fairness_metrics(data, protected, outcome)
        fairness_metrics[(protected, outcome)] = {'SPD': spd, 'DI': di}

# Now, 'fairness_metrics dictionary' contains all the SPD and DI values for each combination
# Print the results
for keys, values in fairness_metrics.items():
    print(f"For protected attribute '{keys[0]}' and outcome variable '{keys[1]}':")
    print(f"Statistical Parity Difference: {values['SPD']}")
    print(f"Disparate Impact: {values['DI']}\n")



For protected attribute 'Applicant_Gender_Binary' and outcome variable 'Total_Income_Binary':
Statistical Parity Difference: -0.21931122465834957
Disparate Impact: 0.6599745519779734

For protected attribute 'Applicant_Gender_Binary' and outcome variable 'Years_of_Working_Binary':
Statistical Parity Difference: 0.08023988906685497
Disparate Impact: 1.1981182915863278

For protected attribute 'Applicant_Age_Binary' and outcome variable 'Total_Income_Binary':
Statistical Parity Difference: -0.04526149876133084
Disparate Impact: 0.915826197987332

For protected attribute 'Applicant_Age_Binary' and outcome variable 'Years_of_Working_Binary':
Statistical Parity Difference: -0.14293035631246376
Disparate Impact: 0.7386366347128545



Step 3.3: Pre-processing Bias Mitigation Algorithm

Select Reweighing algorithm.

The Reweighing algorithm is a pre-processing technique that assigns weights to the examples in the original dataset to ensure fairness before model training. The algorithm works by adjusting the weights such that the difference in distributions of the protected attribute across the favorable and unfavorable outcomes is reduced.

Step 3.4: Computing Fairness Metrics on the Transformed Dataset

After applying the bias mitigation technique, recompute the fairness metrics SDP/DI to see if the bias has been reduced.

In [None]:
# install the AI Fairness 360 (AIF360) library
!pip install aif360


Collecting aif360
  Downloading aif360-0.5.0-py3-none-any.whl (214 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/214.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m204.8/214.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.1/214.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: aif360
Successfully installed aif360-0.5.0


In [None]:
# print(data.head())
dataStep3 = data[['Total_Income_Binary', 'Years_of_Working_Binary', 'Applicant_Gender_Binary', 'Applicant_Age_Binary']].copy()
print(dataStep3.head())

   Total_Income_Binary  Years_of_Working_Binary  Applicant_Gender_Binary  \
0                    0                        0                        1   
1                    1                        1                        0   
2                    1                        1                        0   
3                    1                        1                        0   
4                    1                        1                        0   

   Applicant_Age_Binary  
0                     1  
1                     1  
2                     1  
3                     1  
4                     1  


In [None]:
# The code below related to fairness in machine learning, specifically using the AI Fairness 360 (AIF360) toolkit from IBM,
# which is a comprehensive set of tools designed to help detect and mitigate bias in machine learning models.
# The specific functionality used here is to reweight instances in a dataset to create fairness with respect to certain protected attributes.

from aif360.algorithms.preprocessing import Reweighing
from aif360.datasets import BinaryLabelDataset # BinaryLabelDataset is a class that provides a structured way to handle datasets with binary labels in the AIF360 toolkit.

# 1. Defining Variables and Groups
# Define the outcome variables and the protected attributes
outcome_variables = ['Total_Income_Binary', 'Years_of_Working_Binary'] # 'outcome_variables' are specific features that the model is ultimately trying to predict or assess the fairness of.
protected_attributes = ['Applicant_Gender_Binary', 'Applicant_Age_Binary'] # 'protected_attributes' are the features that represent sensitive attributes.

# Define unprivileged and privileged groups
unprivileged_groups = [{'Applicant_Gender_Binary': 0}, {'Applicant_Age_Binary': 0}]
privileged_groups = [{'Applicant_Gender_Binary': 1}, {'Applicant_Age_Binary': 1}]

# 2. Initialize reweighing instance
# An instance of the 'Reweighing class' is initialized with the previously defined unprivileged and privileged groups.
# This object 'reweighing' will be used later to adjust the weights of the dataset's instances to compensate for bias.
reweighing = Reweighing(unprivileged_groups=unprivileged_groups,
                         privileged_groups=privileged_groups)

# 3. Define a Function to Apply Reweighing
# Below function takes an 'outcome_var' (one of the outcome variables for assessing for fairness) and a dataset 'data' as input.
# The function uses the BinaryLabelDataset class to structure the data properly,
# applies the reweighing algorithm to adjust the instance weights for fairness,
# and returns the reweighted data as a pandas DataFrame including the new weights.
def apply_reweighing(outcome_var, data):
    # Create a BinaryLabelDataset for the given outcome variable
    binary_label_dataset = BinaryLabelDataset(
        favorable_label=1,
        unfavorable_label=0,
        df=data,
        label_names=[outcome_var],
        protected_attribute_names=protected_attributes
    )

    # Apply reweighing
    reweighed_dataset = reweighing.fit_transform(binary_label_dataset)

    # Convert back to pandas DataFrame
    reweighed_data = pd.DataFrame(np.hstack([reweighed_dataset.features, reweighed_dataset.labels]),
                                  columns=binary_label_dataset.feature_names + binary_label_dataset.label_names)
    reweighed_data['weights'] = reweighed_dataset.instance_weights
    return reweighed_data # return a pandas DataFrame

# 4. Apply the reweighing
reweighed_data_dict = {} # This dictionary will be used to store the reweighed data for each outcome variable.
# The code loops over each outcome_var and applies the reweighing function to the data, storing the results in the 'reweighed_data_dict' dictionary.
for outcome_var in outcome_variables:
    reweighed_data = apply_reweighing(outcome_var, dataStep3)
    reweighed_data_dict[outcome_var] = reweighed_data
    # 'reweighed_data_dict' dictionary has two keys,'Total_Income_Binary' and 'Years_of_Working_Binary'.
    # The values associated with each key are pandas DataFrames that have been processed through the apply_reweighing function.
# print(reweighed_data_dict)

# 5. Define a Function to Compute Fairness Metrics
def compute_fairness_metrics_weighted(data, protected_attribute, outcome_var, weights):
    # Calculate the probabilities for positive outcomes for privileged and unprivileged groups
    weighted_means = data.groupby(protected_attribute).apply(
        lambda x: np.average(x[outcome_var], weights=x[weights])
    )
    prob_privileged = weighted_means.loc[1]
    prob_unprivileged = weighted_means.loc[0]

    # Calculate Statistical Parity Difference
    spd = prob_unprivileged - prob_privileged

    # Calculate Disparate Impact
    di = prob_unprivileged / prob_privileged

    return spd, di

fairness_metrics = {} # keys are tuples consisting of the protected attribute and outcome variable.
for outcome_var in outcome_variables:
    reweighed_data = reweighed_data_dict[outcome_var]
    for protected_attr in protected_attributes:
        spd, di = compute_fairness_metrics_weighted(reweighed_data, protected_attr, outcome_var, 'weights')
        fairness_metrics[(protected_attr, outcome_var)] = {'SPD': spd, 'DI': di}
print(fairness_metrics)

# Print the fairness metrics results
for (protected_attr, outcome_var), metrics in fairness_metrics.items():
    print(f"Outcome Variable: {outcome_var}")
    print(f"Protected Attribute: {protected_attr}")
    print(f"Statistical Parity Difference: {metrics['SPD']}")
    print(f"Disparate Impact: {metrics['DI']}\n")


{('Applicant_Gender_Binary', 'Total_Income_Binary'): {'SPD': -0.1716576795313604, 'DI': 0.7091854519042328}, ('Applicant_Age_Binary', 'Total_Income_Binary'): {'SPD': 0.005121482932355148, 'DI': 1.010681348440465}, ('Applicant_Gender_Binary', 'Years_of_Working_Binary'): {'SPD': 0.09123349213350451, 'DI': 1.2326536472606986}, ('Applicant_Age_Binary', 'Years_of_Working_Binary'): {'SPD': -0.1314732475371248, 'DI': 0.7535717440087893}}
Outcome Variable: Total_Income_Binary
Protected Attribute: Applicant_Gender_Binary
Statistical Parity Difference: -0.1716576795313604
Disparate Impact: 0.7091854519042328

Outcome Variable: Total_Income_Binary
Protected Attribute: Applicant_Age_Binary
Statistical Parity Difference: 0.005121482932355148
Disparate Impact: 1.010681348440465

Outcome Variable: Years_of_Working_Binary
Protected Attribute: Applicant_Gender_Binary
Statistical Parity Difference: 0.09123349213350451
Disparate Impact: 1.2326536472606986

Outcome Variable: Years_of_Working_Binary
Protec