<a href="https://colab.research.google.com/github/banerRana/hiringTrendBias/blob/main/trackHiringTrend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Step 1: Install required libraries
!pip install aif360 -q
!pip install fairlearn -q

# Step 2: Import necessary modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import ClassificationMetric
from aif360.algorithms.preprocessing import Reweighing
from fairlearn.metrics import demographic_parity_difference
import matplotlib.pyplot as plt

# Step 3: Create synthetic dataset (replace with real data)
data = {
    'resume_text': [
        'experienced python developer with machine learning background',
        'java expert with five years enterprise experience',
        'recent graduate with data science internship',
        'senior software engineer cloud computing',
        'web developer javascript react node',
        'mobile developer ios android swift'
    ],
    'gender': ['male', 'female', 'male', 'male', 'female', 'male'],  # Protected attribute
    'qualified': [1, 1, 0, 1, 0, 1]  # Target variable
}

df = pd.DataFrame(data)

# Step 4: Preprocess data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['resume_text'])
y = df['qualified']
protected = df['gender']

# Step 5: Train initial model
X_train, X_test, y_train, y_test, prot_train, prot_test = train_test_split(
    X, y, protected, test_size=0.3, random_state=42
)

model = LogisticRegression()
model.fit(X_train, y_train)

# Step 6: Evaluate bias
test_pred = model.predict(X_test)
print(f"Accuracy: {model.score(X_test, y_test):.2f}")

# Convert to AIF360 dataset
privileged_group = [{'gender': 1}]
unprivileged_group = [{'gender': 0}]

# Create a DataFrame with the necessary columns for BinaryLabelDataset
test_df = pd.DataFrame(X_test.toarray())
# Map 'gender' to numerical values: 1 for 'male', 0 for 'female'
gender_mapping = {'male': 1, 'female': 0}
test_df['gender'] = prot_test.map(gender_mapping)  # Add the 'gender' column with numerical values
test_df['qualified'] = y_test  # Add the 'qualified' column

dataset_test = BinaryLabelDataset(
    df=test_df,  # Use the updated DataFrame
    protected_attribute_names=['gender'],
    label_names=['qualified']
)

# Calculate fairness metrics
metric = ClassificationMetric(
    dataset_test,
    dataset_test,  # Remove label_names argument and use original dataset
    unprivileged_groups=unprivileged_group,
    privileged_groups=privileged_group
)

print(f"Disparate Impact: {metric.disparate_impact():.2f}")
print(f"Statistical Parity Difference: {metric.statistical_parity_difference():.2f}")

# Step 7: Apply bias mitigation
# Reweighing algorithm
RW = Reweighing(unprivileged_groups=unprivileged_group,
               privileged_groups=privileged_group)
dataset_train = BinaryLabelDataset(
    df=pd.DataFrame(X_train.toarray()),
    protected_attribute_names=['gender'],
    label_names=['qualified']
)
dataset_trans = RW.fit_transform(dataset_train)

# Train new model with transformed data
model_fair = LogisticRegression()
model_fair.fit(dataset_trans.features, dataset_trans.labels.ravel())

# Evaluate mitigated model
test_pred_fair = model_fair.predict(X_test.toarray())
print("\nAfter Bias Mitigation:")
print(f"Accuracy: {model_fair.score(X_test.toarray(), y_test):.2f}")

# Fairlearn metrics
print(f"Demographic Parity Difference: {demographic_parity_difference(y_test, test_pred_fair, sensitive_features=prot_test):.2f}")

# Step 8: Visualization
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# Before mitigation
ax[0].bar(['Male', 'Female'],
         [test_pred[prot_test == 'male'].mean(),
          test_pred[prot_test == 'female'].mean()])
ax[0].set_title('Selection Rate Before Mitigation')

# After mitigation
ax[1].bar(['Male', 'Female'],
         [test_pred_fair[prot_test == 'male'].mean(),
          test_pred_fair[prot_test == 'female'].mean()])
ax[1].set_title('Selection Rate After Mitigation')

plt.show()


Accuracy: 0.00
Disparate Impact: 1.00
Statistical Parity Difference: 0.00


KeyError: "None of [Index(['qualified'], dtype='object')] are in the [columns]"

In [2]:
# Convert to AIF360 dataset
privileged_group = [{'gender': 'male'}]
unprivileged_group = [{'gender': 'female'}]

# Create a DataFrame with the necessary columns for BinaryLabelDataset
test_df = pd.DataFrame(X_test.toarray())
test_df['gender'] = prot_test  # Add the 'gender' column
test_df['qualified'] = y_test  # Add the 'qualified' column

dataset_test = BinaryLabelDataset(
    df=test_df,  # Use the updated DataFrame
    protected_attribute_names=['gender'],
    label_names=['qualified']
)

ValueError: could not convert string to float: 'male'


ValueError: DataFrame values must be numerical.

In [3]:
# Convert to AIF360 dataset
privileged_group = [{'gender': 1}]
unprivileged_group = [{'gender': 0}]

# Create a DataFrame with the necessary columns for BinaryLabelDataset
test_df = pd.DataFrame(X_test.toarray())
# Map 'gender' to numerical values: 1 for 'male', 0 for 'female'
gender_mapping = {'male': 1, 'female': 0}
test_df['gender'] = prot_test.map(gender_mapping)  # Add the 'gender' column with numerical values
test_df['qualified'] = y_test  # Add the 'qualified' column

dataset_test = BinaryLabelDataset(
    df=test_df,  # Use the updated DataFrame
    protected_attribute_names=['gender'],
    label_names=['qualified']
)