In [None]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import re

def load_corpus(file_path, encoding='latin-1'):
    """Load the corpus CSV file into a DataFrame.

    The file is decoded as UTF-8 first. If that fails with a decoding
    error, the caller-supplied ``encoding`` is tried, and finally cp1252.

    Args:
        file_path: Location of the CSV file (anything ``pandas.read_csv``
            accepts).
        encoding: Fallback encoding used when UTF-8 decoding fails.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the file.
        Any other error raised by ``pandas.read_csv`` (for example
        ``FileNotFoundError``) propagates immediately; retrying with a
        different encoding cannot fix it.
    """
    # Build the ordered, de-duplicated list of encodings to attempt.
    candidates = []
    for candidate in ('utf-8', encoding, 'cp1252'):
        if candidate not in candidates:
            candidates.append(candidate)

    last_error = None
    for candidate in candidates:
        try:
            return pd.read_csv(file_path, encoding=candidate)
        except UnicodeDecodeError as err:
            # Only a decoding failure justifies trying the next encoding.
            last_error = err
    raise last_error

def define_intersectional_targets():
    """Return the keyword lexicon used for intersectional analysis.

    Returns:
        A dict keyed by category name. Each value is a dict holding three
        keyword lists: ``'targets'`` (group terms),
        ``'attributes_positive'`` and ``'attributes_negative'``.
    """
    def group(targets, positive, negative):
        # Assemble one category entry in the shape the metric code reads.
        return {
            'targets': targets,
            'attributes_positive': positive,
            'attributes_negative': negative,
        }

    lexicon = {}
    lexicon['Race + Region + Tech-Ethics'] = group(
        ['Black', 'African', 'Asian', 'Latino', 'indigenous', 'minority', 'ethnic'],
        ['innovative', 'ethical', 'responsible', 'advanced', 'progressive'],
        ['backward', 'unethical', 'irresponsible', 'primitive', 'limited'],
    )
    lexicon['Gender + Race + Public-Health'] = group(
        ['women', 'Black women', 'Latino women', 'Asian women', 'minority women'],
        ['healthy', 'informed', 'compliant', 'educated', 'aware'],
        ['unhealthy', 'ignorant', 'non-compliant', 'uneducated', 'unaware'],
    )
    lexicon['Class + Age + Career-Wealth'] = group(
        ['poor', 'working-class', 'elderly', 'young', 'low-income'],
        ['successful', 'wealthy', 'ambitious', 'capable', 'skilled'],
        ['unsuccessful', 'poor', 'lazy', 'incapable', 'unskilled'],
    )
    lexicon['Disability + Region + Education-Access'] = group(
        ['disabled', 'handicapped', 'rural', 'remote', 'disadvantaged'],
        ['educated', 'capable', 'intelligent', 'skilled', 'competent'],
        ['uneducated', 'incapable', 'stupid', 'unskilled', 'incompetent'],
    )
    lexicon['Appearance + Gender + Ethnicity'] = group(
        ['women', 'ethnic', 'minority', 'Black', 'Asian', 'Latino'],
        ['professional', 'attractive', 'presentable', 'appropriate', 'suitable'],
        ['unprofessional', 'unattractive', 'inappropriate', 'unsuitable', 'backward'],
    )
    lexicon['Culture/Tradition + Age + Workplace'] = group(
        ['traditional', 'cultural', 'elderly', 'conservative', 'old-fashioned'],
        ['modern', 'innovative', 'progressive', 'adaptable', 'flexible'],
        ['outdated', 'rigid', 'inflexible', 'backward', 'conservative'],
    )
    return lexicon

def calculate_ceat_score(sentences, targets, pos_attrs, neg_attrs):
    """Calculate a CEAT-like score from keyword co-occurrence.

    No embeddings or TF-IDF vectors are involved: every sentence that
    mentions a target term is scored +1.0 if it also contains a positive
    attribute, otherwise -1.0 if it contains a negative attribute,
    otherwise 0.0. The result is the mean of those per-sentence scores.

    Matching is case-insensitive and a term must begin at a token
    boundary, so "ethical" no longer matches inside "unethical" or
    "non-compliant" inside nothing but itself, while inflected forms such
    as "Asians" still match "Asian".

    Args:
        sentences: Iterable of sentences; non-string entries (e.g. NaN
            from missing cells) are skipped.
        targets: Group terms that make a sentence relevant.
        pos_attrs: Positive attribute terms.
        neg_attrs: Negative attribute terms.

    Returns:
        Mean score in [-1, 1], or 0.0 when no sentence mentions a target.
    """
    def compile_terms(terms):
        # The lookbehind rejects a preceding word character or hyphen so a
        # term cannot match as the tail of a longer word (un-ethical,
        # non-compliant, Cauc-asian). No right boundary: plurals still hit.
        return [re.compile(r'(?<![\w-])' + re.escape(term.lower())) for term in terms]

    target_patterns = compile_terms(targets)
    pos_patterns = compile_terms(pos_attrs)
    neg_patterns = compile_terms(neg_attrs)

    target_scores = []
    for sentence in sentences:
        if not isinstance(sentence, str):
            continue
        text = sentence.lower()

        if not any(pattern.search(text) for pattern in target_patterns):
            continue

        # A positive hit takes precedence over a negative one.
        if any(pattern.search(text) for pattern in pos_patterns):
            target_scores.append(1.0)
        elif any(pattern.search(text) for pattern in neg_patterns):
            target_scores.append(-1.0)
        else:
            target_scores.append(0.0)

    return np.mean(target_scores) if target_scores else 0.0

def calculate_iweat_score(sentences, targets, pos_attrs, neg_attrs):
    """Calculate an I-WEAT-like score based on keyword co-occurrence.

    For every sentence that mentions a target term, the number of distinct
    positive and negative attribute terms present is counted. The score is
    the average positive count per target sentence minus the average
    negative count, so its magnitude can exceed 1.

    Matching is case-insensitive and a term must begin at a token
    boundary, so "ethical" is not counted inside "unethical"; inflected
    forms such as "Asians" still match "Asian".

    Args:
        sentences: Iterable of sentences; non-string entries (e.g. NaN
            from missing cells) are skipped.
        targets: Group terms that make a sentence relevant.
        pos_attrs: Positive attribute terms.
        neg_attrs: Negative attribute terms.

    Returns:
        Positive-minus-negative co-occurrence rate, or 0.0 when no
        sentence mentions a target.
    """
    def compile_terms(terms):
        # Reject a preceding word character or hyphen so a term cannot
        # match as the tail of a longer word (un-ethical, non-compliant).
        return [re.compile(r'(?<![\w-])' + re.escape(term.lower())) for term in terms]

    target_patterns = compile_terms(targets)
    pos_patterns = compile_terms(pos_attrs)
    neg_patterns = compile_terms(neg_attrs)

    target_pos_cooccurrence = 0
    target_neg_cooccurrence = 0
    total_target_sentences = 0

    for sentence in sentences:
        if not isinstance(sentence, str):
            continue
        text = sentence.lower()

        if not any(pattern.search(text) for pattern in target_patterns):
            continue

        total_target_sentences += 1
        target_pos_cooccurrence += sum(1 for pattern in pos_patterns if pattern.search(text))
        target_neg_cooccurrence += sum(1 for pattern in neg_patterns if pattern.search(text))

    if total_target_sentences == 0:
        return 0.0

    pos_ratio = target_pos_cooccurrence / total_target_sentences
    neg_ratio = target_neg_cooccurrence / total_target_sentences
    return pos_ratio - neg_ratio

def calculate_iseat_score(sentences, targets, pos_attrs, neg_attrs):
    """Calculate an I-SEAT-like score using sentence-level keyword votes.

    Each sentence that mentions a target term votes +1.0 when it contains
    more distinct positive than negative attribute terms, -1.0 when
    negative terms outnumber positive ones, and 0.0 on a tie. The result
    is the mean vote.

    Matching is case-insensitive and a term must begin at a token
    boundary, so "ethical" is not counted inside "unethical"; inflected
    forms such as "Asians" still match "Asian".

    Args:
        sentences: Iterable of sentences; non-string entries (e.g. NaN
            from missing cells) are skipped.
        targets: Group terms that make a sentence relevant.
        pos_attrs: Positive attribute terms.
        neg_attrs: Negative attribute terms.

    Returns:
        Mean vote in [-1, 1], or 0.0 when no sentence mentions a target.
    """
    def compile_terms(terms):
        # Reject a preceding word character or hyphen so a term cannot
        # match as the tail of a longer word (un-ethical, non-compliant).
        return [re.compile(r'(?<![\w-])' + re.escape(term.lower())) for term in terms]

    target_patterns = compile_terms(targets)
    pos_patterns = compile_terms(pos_attrs)
    neg_patterns = compile_terms(neg_attrs)

    sentence_scores = []
    for sentence in sentences:
        if not isinstance(sentence, str):
            continue
        text = sentence.lower()

        if not any(pattern.search(text) for pattern in target_patterns):
            continue

        pos_count = sum(1 for pattern in pos_patterns if pattern.search(text))
        neg_count = sum(1 for pattern in neg_patterns if pattern.search(text))

        if pos_count > neg_count:
            sentence_scores.append(1.0)
        elif neg_count > pos_count:
            sentence_scores.append(-1.0)
        else:
            sentence_scores.append(0.0)

    return np.mean(sentence_scores) if sentence_scores else 0.0

def calculate_iibs_score(sentences):
    """Calculate IIBS - the proportion of sentences with a bias indicator.

    A sentence counts as biased when it contains at least one indicator
    word. Matching is case-insensitive and an indicator must begin at a
    token boundary, so "less" does not fire on "unless" or "homeless" and
    "expect" does not fire on "unexpected", while inflected forms such as
    "assumed" or "stereotypes" still match.

    Args:
        sentences: Sized collection of sentences; non-string entries
            (e.g. NaN from missing cells) are left out of both the
            numerator and the denominator.

    Returns:
        Fraction of string sentences flagged as biased, or 0.0 when there
        are none.
    """
    bias_indicators = ['assume', 'stereotype', 'expect', 'dismiss', 'ignore', 'exclude',
                      'less', 'unable', 'cannot', 'avoid', 'prevent', 'restrict']
    # Reject a preceding word character or hyphen so an indicator cannot
    # match as the tail of a longer word.
    indicator_patterns = [
        re.compile(r'(?<![\w-])' + re.escape(indicator))
        for indicator in bias_indicators
    ]

    texts = [sentence.lower() for sentence in sentences if isinstance(sentence, str)]
    if not texts:
        return 0.0

    biased_sentences = sum(
        1 for text in texts
        if any(pattern.search(text) for pattern in indicator_patterns)
    )
    return biased_sentences / len(texts)

def analyze_corpus_bias(csv_file):
    """Run every keyword-based bias metric over a corpus CSV.

    The CSV is read through ``load_corpus`` and must provide ``Class`` and
    ``Sentence`` columns. Rows whose class is ``'Neutral'`` are excluded
    from the per-class metrics; IIBS is computed over all rows.

    Args:
        csv_file: Path of the corpus CSV.

    Returns:
        A ``(results, final_scores)`` tuple. ``results`` maps each lexicon
        category that has at least one sentence to its CEAT, I-WEAT and
        I-SEAT scores plus the sentence count. ``final_scores`` holds the
        mean absolute value of each of those metrics across categories
        (0 when no category matched) together with the corpus-wide IIBS.
    """
    df = load_corpus(csv_file)
    print(f"Loaded corpus: {len(df)} sentences, {len(df['Class'].unique())} classes")

    lexicon = define_intersectional_targets()

    # Per-class metrics only look at rows not labelled Neutral.
    non_neutral = df[df['Class'] != 'Neutral']

    results = {}
    for class_name, spec in lexicon.items():
        in_class = non_neutral['Class'] == class_name
        class_sentences = non_neutral[in_class]['Sentence'].tolist()
        if not class_sentences:
            continue

        metric_args = (
            class_sentences,
            spec['targets'],
            spec['attributes_positive'],
            spec['attributes_negative'],
        )
        results[class_name] = {
            'CEAT': calculate_ceat_score(*metric_args),
            'I-WEAT': calculate_iweat_score(*metric_args),
            'I-SEAT': calculate_iseat_score(*metric_args),
            'sentence_count': len(class_sentences),
        }

    # Aggregate by magnitude so opposite-signed categories do not cancel.
    final_scores = {
        metric: (np.mean([abs(entry[metric]) for entry in results.values()]) if results else 0)
        for metric in ('CEAT', 'I-WEAT', 'I-SEAT')
    }
    final_scores['IIBS'] = calculate_iibs_score(df['Sentence'].tolist())

    return results, final_scores

# Example usage:
if __name__ == "__main__":
    # Run the keyword-based analysis on the corpus file and report the
    # aggregate metrics, one per line, to two decimal places.
    results, final_scores = analyze_corpus_bias('corpus.csv')

    print("=== FINAL BIAS METRICS ===")
    print("\n".join(
        f"{metric}: {final_scores[metric]:.2f}"
        for metric in ("CEAT", "I-WEAT", "I-SEAT", "IIBS")
    ))


Loaded corpus: 261 sentences, 21 classes
=== FINAL BIAS METRICS ===
CEAT: 2.00
I-WEAT: 2.10
I-SEAT: 1.80
IIBS: 0.50


In [7]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# -------------------
# Shared sentence encoder
# -------------------
# NOTE(review): "roberta-base" looks like a plain language-model checkpoint
# rather than one tuned for sentence similarity - confirm this is intended.
model = SentenceTransformer("roberta-base")

# -------------------
# Anchor word/phrase sets compared against each sentence embedding
# -------------------

# CEAT: technology concepts versus ethics concepts
ceat_anchors = dict(
    tech=["technology", "innovation", "AI", "machine learning"],
    ethics=["ethics", "fairness", "justice", "responsibility"],
)

# I-WEAT: gender terms and career/family attributes (gender-career example)
iweat_anchors = dict(
    male=["man", "male", "he", "him", "boy"],
    female=["woman", "female", "she", "her", "girl"],
    career=["career", "corporation", "salary", "office", "promotion"],
    family=["family", "home", "children", "relatives", "caring"],
)

# I-SEAT: positive versus negative trait words
iseat_anchors = dict(
    positive=["hardworking", "capable", "intelligent", "skilled"],
    negative=["lazy", "incompetent", "ignorant", "weak"],
)

# IIBS: phrases describing intersectional identities in a setting
iibs_anchors = [
    "black woman in tech",
    "asian man in leadership",
    "disabled student in education",
    "immigrant worker in healthcare",
    "elderly person in workplace",
]

# -------------------
# Helper: average similarity
# -------------------
def avg_cosine_similarity(embed, anchor_texts):
    """Return the mean cosine similarity between ``embed`` and the anchors.

    Args:
        embed: Embedding to compare, as accepted by ``util.cos_sim``.
        anchor_texts: Texts encoded with the module-level ``model``.

    Returns:
        The average of all pairwise cosine similarities as a Python float.
    """
    anchor_matrix = model.encode(
        anchor_texts,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    similarities = util.cos_sim(embed, anchor_matrix)
    # Move to host memory before averaging with NumPy.
    return float(np.mean(similarities.cpu().numpy()))

# -------------------
# Metric functions
# -------------------
def calculate_ceat_score(sentence):
    """Score how much closer a sentence sits to tech than to ethics anchors.

    Args:
        sentence: Text to encode with the module-level ``model``.

    Returns:
        Mean similarity to the "tech" anchors minus mean similarity to the
        "ethics" anchors, rounded to four decimals.
    """
    embedding = model.encode(
        sentence,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    tech_sim, ethics_sim = (
        avg_cosine_similarity(embedding, ceat_anchors[concept])
        for concept in ("tech", "ethics")
    )
    return round(tech_sim - ethics_sim, 4)

def calculate_iweat_score(sentence):
    """Score the gender-career interaction of a sentence.

    The sentence embedding is compared with the male, female, career and
    family anchor sets. The score is the interaction

        (male*career + female*family) - (male*family + female*career)
        = (male - female) * (career - family)

    which is positive when the sentence leans male-and-career or
    female-and-family, negative for the opposite pairings, and zero when
    it is balanced on either axis. A purely additive combination of the
    four similarities cancels to 0 for every input, so the pairings are
    expressed as products instead.

    Args:
        sentence: Text to encode with the module-level ``model``.

    Returns:
        The interaction score rounded to four decimals.
    """
    sent_embed = model.encode(sentence, convert_to_tensor=True, normalize_embeddings=True)
    male_sim = avg_cosine_similarity(sent_embed, iweat_anchors["male"])
    female_sim = avg_cosine_similarity(sent_embed, iweat_anchors["female"])
    career_sim = avg_cosine_similarity(sent_embed, iweat_anchors["career"])
    family_sim = avg_cosine_similarity(sent_embed, iweat_anchors["family"])

    gender_lean = male_sim - female_sim
    role_lean = career_sim - family_sim
    return round(gender_lean * role_lean, 4)

def calculate_iseat_score(sentence):
    """Score how much closer a sentence sits to positive than negative traits.

    Args:
        sentence: Text to encode with the module-level ``model``.

    Returns:
        Mean similarity to the "positive" anchors minus mean similarity to
        the "negative" anchors, rounded to four decimals.
    """
    embedding = model.encode(
        sentence,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    pos_sim, neg_sim = (
        avg_cosine_similarity(embedding, iseat_anchors[polarity])
        for polarity in ("positive", "negative")
    )
    return round(pos_sim - neg_sim, 4)

def calculate_iibs_score(sentence):
    """Return the mean similarity of a sentence to the intersectional anchors.

    Args:
        sentence: Text to encode with the module-level ``model``.

    Returns:
        Average cosine similarity to ``iibs_anchors``, rounded to four
        decimals.
    """
    embedding = model.encode(
        sentence,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    mean_similarity = avg_cosine_similarity(embedding, iibs_anchors)
    return round(mean_similarity, 4)

# -------------------
# Full analysis
# -------------------
def analyze_bias(sentences, output_csv="bias_scores_analysis_embeddings.csv"):
    """Score every sentence with all four embedding metrics and save a CSV.

    Args:
        sentences: Iterable of sentence strings.
        output_csv: Path the score table is written to (without the index).

    Returns:
        A ``pandas.DataFrame`` with one row per sentence and the columns
        ``Sentence``, ``CEAT_Score``, ``i-WEAT_Score``, ``i-SEAT_Score``
        and ``IIBS_Score``.
    """
    rows = [
        {
            "Sentence": sent,
            "CEAT_Score": calculate_ceat_score(sent),
            "i-WEAT_Score": calculate_iweat_score(sent),
            "i-SEAT_Score": calculate_iseat_score(sent),
            "IIBS_Score": calculate_iibs_score(sent),
        }
        for sent in sentences
    ]

    table = pd.DataFrame(rows)
    table.to_csv(output_csv, index=False)
    return table

# -------------------
# Example run
# -------------------
# Five hand-written sample sentences used to demonstrate the pipeline.
sentences = [
    "The young immigrant woman struggled to get recognition in the tech company.",
    "The disabled student faced challenges in accessing online education.",
    "The elderly man was seen as unfit for leadership roles.",
    "Women are often assumed to be less technical than men.",
    "The Asian engineer was overlooked in workplace discussions."
]

# Scores each sentence, writes the table to analyze_bias's default CSV path,
# and keeps the resulting DataFrame in `df` for inspection.
df = analyze_bias(sentences)
print(df)


Successfully loaded corpus with latin-1 encoding
Loaded 261 sentences across 21 classes

Calculating bias scores for 261 sentences...

✓ Results saved to: bias_scores_analysis_.csv
