# Dataset Creation

In [2]:
# imports
import json
import pandas as pd
import os

In [15]:
# --- Configuration -----------------------------------------------------------
data_path = "../data/bios/en/bias_in_bios.json"
# occupations = ['surgeon', 'nurse'] #, 'physician']
occupations = []  # empty list means all occupations
balance_sensitive = True

# --- Load --------------------------------------------------------------------
# The file is read as JSON Lines: one JSON record per non-blank line.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (JSON interchange text is UTF-8).
with open(data_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]
print(f"Number of samples: {len(data)}")

# --- Filter for selected occupations -----------------------------------------
# An empty `occupations` list is falsy, so every record is kept in that case.
if occupations:
    data = [d for d in data if d['profession'] in occupations]
    print(f"Number of samples after filtering for occupations {occupations}: {len(data)}")

# One row per record; built once regardless of whether a filter was applied.
df = pd.DataFrame(data)

# # For each profession, print the gender distribution
# print("Gender distribution per profession:")
# print(df.groupby(['profession', 'gender']).size())


# Upper bounds on how many rows are drawn per group.
max_per_gender = 100         # balanced mode: rows per (profession, gender) cell
max_per_profession = 4000    # unbalanced mode: rows per profession

if balance_sensitive:
    # Balance the dataset so that within each occupation, each gender is
    # equally represented: every (profession, gender) cell is down-sampled to
    # the size of that profession's smallest cell, capped at max_per_gender.
    # NOTE: the minimum is taken over the cells that exist, so a profession
    # with rows for only one gender is kept as-is (up to the cap).
    grouped = df.groupby(['profession', 'gender'])
    min_count = grouped.size().groupby(level=0).min()
    df = pd.concat(
        [
            cell.sample(
                n=min(int(min_count[profession]), max_per_gender),
                random_state=42,
            )
            for (profession, _gender), cell in grouped
        ]
    ).reset_index(drop=True)
    print("After balancing sensitive attribute:")
    print(df.groupby(['profession', 'gender']).size())
else:
    # Draw up to max_per_profession rows per profession. The per-group size is
    # clamped to the group's length because DataFrame.sample raises ValueError
    # when asked for more rows than exist (without replacement). Concatenating
    # the sampled groups keeps the 'profession' column in the result without
    # relying on how groupby.apply treats grouping columns.
    df = pd.concat(
        [
            rows.sample(n=min(len(rows), max_per_profession), random_state=42)
            for _profession, rows in df.groupby('profession')
        ]
    ).reset_index(drop=True)

# Hold out 20% of the rows as a test set. The split is stratified on the
# (profession, gender) pair, so both columns drive the stratification rather
# than gender alone. A fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

strata = df[['profession', 'gender']]
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=strata,
)
print(f"Train samples: {len(train_df)}, Test samples: {len(test_df)}")

# Write both splits as CSV files into the directory the source data came from.
# The file names carry a "_balanced" suffix when the balanced variant of the
# dataset was built, so the two variants do not overwrite each other.
data_dir = os.path.dirname(data_path)
if balance_sensitive:
    train_name, test_name = 'train_split_balanced.csv', 'test_split_balanced.csv'
else:
    train_name, test_name = 'train_split.csv', 'test_split.csv'

for split_frame, file_name in ((train_df, train_name), (test_df, test_name)):
    # index=False: the row index is not written as a column.
    split_frame.to_csv(os.path.join(data_dir, file_name), index=False)

# Sanity check: print the row count of every (profession, gender) cell in each
# split, train first and then test.
for heading, split_frame in (
    ("Train set distribution:", train_df),
    ("\nTest set distribution:", test_df),
):
    print(heading)
    print(split_frame.groupby(['profession', 'gender']).size())

Number of samples: 396189
After balancing sensitive attribute:
profession         gender
accountant         Female    100
                   Male      100
architect          Female    100
                   Male      100
attorney           Female    100
                   Male      100
chiropractor       Female    100
                   Male      100
comedian           Female    100
                   Male      100
composer           Female    100
                   Male      100
dentist            Female    100
                   Male      100
dietitian          Female    100
                   Male      100
dj                 Female    100
                   Male      100
filmmaker          Female    100
                   Male      100
interior_designer  Female    100
                   Male      100
journalist         Female    100
                   Male      100
model              Female    100
                   Male      100
nurse              Female    100
                   M