In [173]:
# NOTES
# May need to install sklearn and aif360 libraries
# at CLI
# pip install aif360
# pip install scikit-learn
#
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.datasets import StandardDataset
from aif360.algorithms.preprocessing import Reweighing


In [174]:
# Load dataset
df = pd.read_csv("../data/adult.csv")


In [175]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [176]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object

In [177]:
# Basic preprocessing, note that column names may need to be adjusted depending on
# the datset's structure. 
df = pd.get_dummies(df, columns=["workclass", "education", "marital-status", "occupation", "relationship", "race", "native-country"], drop_first=True)
df['income'] = df['income'].apply(lambda x: 1 if x == " >50K" else 0)
df['sex'] = df['sex'].apply(lambda x: 1 if x == " Male" else 0)


In [193]:
df.head()

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,income,workclass_ Federal-gov,workclass_ Local-gov,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,77516,13,1,2174,0,40,0,False,False,...,False,False,False,False,False,False,False,True,False,False
1,50,83311,13,1,0,0,13,0,False,False,...,False,False,False,False,False,False,False,True,False,False
2,38,215646,9,1,0,0,40,0,False,False,...,False,False,False,False,False,False,False,True,False,False
3,53,234721,7,1,0,0,40,0,False,False,...,False,False,False,False,False,False,False,True,False,False
4,28,338409,13,0,0,0,40,0,False,False,...,False,False,False,False,False,False,False,False,False,False


In [179]:
# Train/test split
X = df.drop("income", axis=1)
y = df["income"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [180]:
# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [181]:
# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8006960794349472


In [182]:
# Calculate fairness metrics (e.g., demographic parity, equalized odds)
# Example of reweighting (mitigation technique)
# Use AIF360 library to define dataset and apply reweighing

dataset = StandardDataset(df, label_name="income", favorable_classes=[1], protected_attribute_names=["sex"], privileged_classes=[[1]])
RW = Reweighing(unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}])
dataset_transf = RW.fit_transform(dataset)


Interpret the results

Disparate impact:

A value of 1 indicates fairness, values less than 1 indicate bias against the unprivileged group, and values greater than 1 indicate bias against the privileged group.

Statistical parity difference:

Measures the difference in positive prediction rates between privileged and unprivileged groups. A value of 0 indicates fairness.

Equal opportunity difference:

Measures the difference in true positive rates between privileged and unprivileged groups. A value of 0 indicates fairness.

In [192]:
metric = BinaryLabelDatasetMetric(dataset,
                                  unprivileged_groups=[{'sex': 0}],
                                  privileged_groups=[{'sex': 1}])

# Compute metrics like mean difference
mean_difference = metric.mean_difference()
print("Mean difference:", mean_difference)

# Disparate impact
disparate_impact = metric.disparate_impact()
print("Disparate impact:", disparate_impact)

# Statistical parity difference
statistical_parity_difference = metric.statistical_parity_difference()
print("Statistical parity difference:", statistical_parity_difference)

# Individual Fairness Consistency
print("Consistency (Individual Fairness)", metric.consistency())


Mean difference: -0.19627598779361352
Disparate impact: 0.3580225496813511
Statistical parity difference: -0.19627598779361352


Consistency (Individual Fairness) [0.76956482]
