In [1]:
# Cell 1: Import necessary libraries
import pandas as pd
import os
import random
from datasets import load_dataset
from datetime import datetime
import argparse
import sys
import json
from tabulate import tabulate

from mypkg.pipeline.setup.dataset import (
    load_raw_dataset,
    filter_by_industry,
    filter_by_demographics,
    prepare_dataset_for_model,
)
from mypkg.main_paper_dataset import balanced_downsample

# Set random seed for reproducibility
random_seed = 42
random.seed(random_seed)


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
explicit_dataset = load_dataset("Anthropic/discrim-eval", "explicit")
implicit_dataset = load_dataset("Anthropic/discrim-eval", "implicit")

explicit_df = pd.DataFrame(explicit_dataset["train"])
implicit_df = pd.DataFrame(implicit_dataset["train"])

df = pd.concat([explicit_df, implicit_df])

In [14]:
print(df.head())

                                     filled_template  decision_question_id  \
0  The patient is a 20-year-old white female suff...                     0   
1  The patient is a 20-year-old Black female suff...                     0   
2  The patient is a 20-year-old Asian female suff...                     0   
3  The patient is a 20-year-old Hispanic female s...                     0   
4  The patient is a 20-year-old Native American f...                     0   

    age  gender             race fill_type  
0  20.0  female            white  explicit  
1  20.0  female            Black  explicit  
2  20.0  female            Asian  explicit  
3  20.0  female         Hispanic  explicit  
4  20.0  female  Native American  explicit  


In [19]:
retain_races = ["white", "black"]
retain_ages = [20, 30, 40]
retain_questions = list(range(0,41))
retain_genders = ["male", "female"]

print(len(df))

df = df[df["race"].isin(retain_races)]
df = df[df["age"].isin(retain_ages)]
df = df[df["decision_question_id"].isin(retain_questions)]
df = df[df["gender"].isin(retain_genders)]
print(len(df))

print(df.head())

372
372
                                      filled_template  decision_question_id  \
0   The patient is a 20-year-old white female suff...                     0   
5   The patient is a(n) 20-year-old white male per...                     0   
15  The patient is a(n) 30-year-old white female p...                     0   
20  The patient is a 30-year-old white male suffer...                     0   
30  The patient is a  40-year-old female white per...                     0   

     age  gender   race fill_type  
0   20.0  female  white  explicit  
5   20.0    male  white  explicit  
15  30.0  female  white  explicit  
20  30.0    male  white  explicit  
30  40.0  female  white  explicit  


In [4]:
def load_full_anthropic_dataset(downsample_questions: int = 41):
    explicit_dataset = load_dataset("Anthropic/discrim-eval", "explicit")
    implicit_dataset = load_dataset("Anthropic/discrim-eval", "implicit")

    explicit_df = pd.DataFrame(explicit_dataset["train"])
    implicit_df = pd.DataFrame(implicit_dataset["train"])

    df = pd.concat([explicit_df, implicit_df])

    retain_races = ["white", "black"]
    retain_ages = [20, 30, 40]
    retain_questions = list(range(0,downsample_questions))
    retain_genders = ["male", "female"]

    print(len(df))

    df = df[df["race"].isin(retain_races)]
    df = df[df["age"].isin(retain_ages)]
    df = df[df["decision_question_id"].isin(retain_questions)]
    df = df[df["gender"].isin(retain_genders)]

    return df

df = load_full_anthropic_dataset()

Generating train split: 100%|██████████| 9450/9450 [00:00<00:00, 180166.88 examples/s]
Generating train split: 100%|██████████| 9450/9450 [00:00<00:00, 374519.74 examples/s]


TypeError: cannot concatenate object of type '<class 'datasets.arrow_dataset.Dataset'>'; only Series and DataFrame objs are valid

In [None]:
print(df.head())

In [None]:
# Cell 3: Function to load the datasets
def load_datasets(industry="INFORMATION-TECHNOLOGY", 
                 use_anthropic_dataset=False,
                 dataset_type="implicit",
                 downsample_to=None):
    """
    Load and prepare datasets for analysis
    
    Args:
        industry: Industry to filter for (only for non-Anthropic dataset)
        use_anthropic_dataset: Whether to use the Anthropic dataset
        dataset_type: Type of Anthropic dataset to use
        downsample_to: Number of samples to downsample to (if None, use all)
        
    Returns:
        DataFrame containing the loaded and filtered dataset
    """
    print("Loading dataset...")
    
    if use_anthropic_dataset:
        print(f"Loading Anthropic dataset ({dataset_type})...")
        dataset = load_dataset("Anthropic/discrim-eval", dataset_type)
        df = dataset["train"]
        filtered_df = [item for item in df if item["decision_question_id"] == 16]
        df = pd.DataFrame(filtered_df)
    else:
        print("Loading custom dataset...")
        df = load_raw_dataset()
        
        if industry:
            print(f"Filtering for industry: {industry}")
            df = filter_by_industry(df, industry)
        else:
            print("No industry filter applied.")
    
    # Apply downsampling if requested
    if downsample_to is not None and downsample_to > 0:
        print(f"Downsampling to {downsample_to} samples...")
        df = balanced_downsample(df, downsample_to, random_seed)
    
    print(f"Dataset loaded with {len(df)} entries")
    
    return df

# Cell 4: Example usage
# You can run this to test loading the datasets
if __name__ == "__main__":
    # Example: Load standard dataset for IT industry
    standard_df = load_datasets(industry="INFORMATION-TECHNOLOGY", downsample_to=100)
    
    # Example: Load Anthropic dataset
    # anthropic_df = load_datasets(use_anthropic_dataset=True, dataset_type="explicit")
    anthropic_df = load_datasets(use_anthropic_dataset=True, dataset_type="implicit")
    
    # Display basic info about the datasets
    print("\nStandard Dataset Info:")
    print(f"Shape: {standard_df.shape}")
    print("Columns:", standard_df.columns.tolist())
    
    print("\nAnthropic Dataset Info:")
    print(f"Shape: {anthropic_df.shape}")
    print("Columns:", anthropic_df.columns.tolist())

In [None]:
def filter_anthropic_df(df: pd.DataFrame) -> pd.DataFrame:
    # Convert to DataFrame if not already
    df = pd.DataFrame(df)
    
    # Filter for decision question ID
    df = df[df["decision_question_id"] == 16]
    
    # Filter for race (white or black only)
    df = df[df["race"].str.lower().isin(["white", "black"])]
    
    return df

def modify_anthropic_filled_templates(df):
    """Change the anthropic templates to match the hiring bias prompts"""
    def modify(text):
        if "accountant" not in text:
            return text
        # Replace role
        text = text.replace("accounting", "")
        text = text.replace("accountant", "IT consultant")
        # Remove the word 'financial'
        text = text.replace("financial ", "")
        # Change job offer question
        text = text.replace(
            "make the applicant a job offer",
            "offer the applicant an interview"
        )
        return text

    df = df.copy()
    df['filled_template'] = df['filled_template'].apply(modify)
    return df

anthropic_df = filter_anthropic_df(anthropic_df)
anthropic_df = modify_anthropic_filled_templates(anthropic_df)
print(anthropic_df.head())
print(len(anthropic_df))

In [None]:
for i in range(len(anthropic_df)):
    print(anthropic_df.iloc[i]['filled_template'])
    print("\n\n")