In [101]:
# Import statements
import os
import numpy as np
import pandas as pd

from utils.ppmi import build_ppmi
from utils.retrofit import sharded_retrofit, join_shards
from utils.formats import load_hdf, save_hdf

In [102]:
### Test heuristics (Refactored for use with PPMI)

import numpy as np
import pandas as pd

def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors."""
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def same_score(word, attribute_set, ppmi_df):
    """Compute the mean cosine similarity between a word and an attribute set."""
    if word not in ppmi_df.index:
        return 0
    word_vec = ppmi_df.loc[word].values
    similarities = [
        cosine_similarity(word_vec, ppmi_df.loc[attr].values)
        for attr in attribute_set if attr in ppmi_df.index
    ]
    return np.mean(similarities) if similarities else 0

def delta_same(target_set, attribute_set_a, attribute_set_b, ppmi_df):
    """Compute the bias score between two attribute sets using SAME."""
    scores_a = [same_score(word, attribute_set_a, ppmi_df) for word in target_set if word in ppmi_df.index]
    scores_b = [same_score(word, attribute_set_b, ppmi_df) for word in target_set if word in ppmi_df.index]
    # print(scores_a)
    # print(scores_b)
    return np.mean(scores_a) - np.mean(scores_b) if scores_a and scores_b else 0

def compute_gender_direction(gender_pairs, gender_pairs2, ppmi_df):
    """Compute the gender direction based on word pairs."""
    differences = []
       # Try primary gender pairs first (c/en/man, c/en/woman)
    for male, female in gender_pairs:
        if male in ppmi_df.index and female in ppmi_df.index:
            diff = ppmi_df.loc[male].values - ppmi_df.loc[female].values
            differences.append(diff)
        else:
            print(f"Skipping pair ({male}, {female}) — one or both not in PPMI (primary).")

    # If none found, try secondary gender pairs (c/en/men, c/en/women)
    if not differences:
        for male, female in gender_pairs2:
            if male in ppmi_df.index and female in ppmi_df.index:
                diff = ppmi_df.loc[male].values - ppmi_df.loc[female].values
                differences.append(diff)
            else:
                print(f"Skipping pair ({male}, {female}) — one or both not in PPMI (fallback).")
    
    if not differences:
        raise ValueError("No valid gender pairs found in PPMI.")

    return np.mean(differences, axis=0)

def direct_bias(word, gender_direction, ppmi_df):
    """Compute the direct bias of a word with respect to gender direction."""
    if word not in ppmi_df.index:
        return 0
    return abs(cosine_similarity(ppmi_df.loc[word].values, gender_direction))

def direct_bias_wordlist(word_list, gender_dir, ppmi_df, label, output_path):
    results = []
    for word in word_list:
        if word in ppmi_df.index:
            bias = direct_bias(word, gender_dir, ppmi_df)
            results.append({"word": word, "bias": bias, "group": label})
        else:
            results.append({"word": word, "bias": None, "group": None})  # Word not in PPMI

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)
    return df

def to_conceptnet_uri(word):
    return "/c/en/" + word.strip().lower().replace(" ", "_")



In [103]:
### Variable declarations
ppmi_df = pd.read_hdf('data/conceptnet_api/hdf/test.hdf')
ppmi_df.head()
# print(ppmi_df.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
/c/en/help_child,5.379521e-16,-1.3e-05,-2e-06,5.409838e-06,4.0223e-15,-1.130818e-15,5.853341e-17,9.179294000000001e-17,-4.842451e-16,1.788475e-15,2.421133e-15,-2.488438e-16,-1.801727e-16,4.414657e-16,1.054125e-16,-9.243286e-16,-1.304739e-15,1.5e-05,-5.219719e-13,-6.876059e-13,3.019614e-16,2.404599e-16,-7.063852000000001e-17,1.68828e-15,6.626313e-16,1.491595e-16,-5.3268940000000007e-17,1.248755e-16,5.099102e-16,1.274973e-15,5.814638e-17,9.278852e-16,5.973974e-16,9.020796e-16,2.163206e-16,-2.507475e-16,-1.99542e-16,-1.024341e-16,-5e-06,0.000502,...,-0.006798,-0.067762,0.000351,0.117447,0.028399,0.107606,0.028905,0.050568,0.041218,-0.076765,-0.010367,-0.001069,-0.015993,-0.01994,0.120977,7.6e-05,-0.173459,0.001825,0.003134,-0.037326,0.066751,0.000539,4e-06,-4.064832e-07,5e-06,0.000799,0.000156,-0.002607,-0.059551,0.004967,0.002325,-0.004004,0.000241,-0.025836,-0.04182,0.005383,0.001438,-0.000221,-0.000418,-0.005307
/c/en/adult,-5.947637e-16,-3e-05,2e-06,4.268783e-06,4.570441e-15,-2.390922e-15,9.802262e-16,-1.523202e-15,-9.785777e-16,2.679285e-15,2.011361e-15,4.595419e-16,2.993147e-16,-1.408068e-15,-4.691903e-18,-6.144958000000001e-17,-4.335018e-16,2e-05,-6.948072e-13,-9.187887e-13,6.142368e-16,-8.152512e-16,5.585749e-16,3.4769e-15,1.395646e-15,-3.802114e-16,5.13467e-16,1.751211e-16,1.616799e-15,2.437179e-15,4.847232e-16,1.23023e-15,1.860467e-15,2.401617e-15,3.251105e-16,-6.257412e-16,-5.789931e-16,9.628434e-16,0.000158,0.004239,...,-0.028524,-0.286402,0.002562,0.500944,0.123374,0.468505,0.126112,0.22105,0.185074,-0.35448,-0.047849,-0.004924,-0.070011,-0.09234,0.568099,0.000356,-0.825742,0.008801,0.015262,-0.181795,0.33276,0.002741,2.9e-05,-1.303094e-06,2.4e-05,0.004753,0.000955,-0.015994,-0.443037,0.036631,0.018061,-0.0304,0.001878,-0.202185,-0.345947,0.045245,0.012511,-0.001947,-0.003833,-0.052088
/c/en/man,3.273319e-16,-1.6e-05,4e-06,-8.463431e-07,-8.760094e-16,1.120815e-15,-7.305838e-16,-1.70874e-15,5.022793e-16,-7.084706e-16,1.520039e-15,4.69197e-16,-1.022493e-15,-1.69747e-15,-1.314639e-16,5.744029e-16,4.268142e-17,3e-06,-1.076487e-13,-1.528285e-13,7.064738e-16,-4.197134e-16,1.512642e-15,5.542464e-15,1.446664e-15,9.524747e-16,-7.140052e-16,-2.909484e-16,-6.448216e-16,-1.274253e-15,2.495376e-16,-1.151763e-15,-1.749162e-16,-1.184899e-15,2.05084e-16,8.106973e-16,1.033045e-15,5.861342e-18,6.6e-05,0.000642,...,-0.08789,-1.169623,-0.223616,1.733351,0.840813,0.657052,0.164061,0.804747,0.482572,-1.194194,-0.207881,-0.020511,0.02701,-0.180812,1.282072,0.000204,-1.782536,0.022816,0.012983,-0.140894,0.790048,0.010274,0.001142,0.0001116948,-0.000178,0.037078,-0.000967,0.006626,-0.902754,-0.31321,0.155744,-0.142276,0.001892,-0.75451,-0.660803,0.098269,0.024405,-0.007568,0.03928,-0.059049
/c/en/sign_contract,-8.984204000000001e-17,-9e-06,-3e-06,5.824575e-06,4.030767e-15,-1.090103e-15,-1.204847e-16,3.375345e-16,-3.556234e-16,1.765867e-15,2.291911e-15,-3.378808e-16,-9.213008e-17,6.516395e-16,4.373117e-16,-1.143419e-15,-1.284033e-15,1.4e-05,-4.885095e-13,-6.432432e-13,6.475981e-16,1.931887e-16,3.096021e-16,7.561739e-16,1.647932e-16,6.073674000000001e-17,1.248562e-16,-2.244604e-16,4.071271e-16,4.59092e-16,2.1905440000000003e-17,5.391275e-16,4.461709e-16,7.747315e-16,2.068224e-16,5.935970000000001e-17,5.277909e-17,5.180302e-16,-4.7e-05,-0.000428,...,-0.006695,-0.06678,0.000272,0.115607,0.027978,0.10605,0.028479,0.049844,0.040521,-0.075,-0.010153,-0.001048,-0.015964,-0.019645,0.119336,7.5e-05,-0.170995,0.001797,0.003085,-0.036742,0.065712,0.00053,3e-06,-4.508414e-07,5e-06,0.000783,0.000154,-0.002568,-0.058443,0.004904,0.002268,-0.003947,0.000239,-0.025528,-0.041064,0.005288,0.001411,-0.000217,-0.000418,-0.005244
/c/en/dress_herself,-8.856595e-17,-9e-06,-3e-06,5.824575e-06,4.199817e-15,-1.090356e-15,1.005326e-16,1.788506e-16,4.7883530000000006e-17,1.832971e-15,2.359555e-15,-7.076026e-16,-3.601943e-17,8.3692e-16,3.974509e-16,-1.328338e-15,-1.425896e-15,1.4e-05,-4.882903e-13,-6.431086e-13,5.646216e-16,2.435448e-16,2.320973e-16,8.18408e-16,2.326295e-16,-1.060871e-16,1.77471e-16,-4.0348830000000006e-17,4.200285e-16,6.352547e-16,-1.573358e-17,6.188916e-16,3.832302e-16,6.544357e-16,2.99777e-16,7.610556e-17,-1.174043e-16,3.359658e-16,-4.7e-05,-0.000428,...,-0.006695,-0.06678,0.000272,0.115607,0.027978,0.10605,0.028479,0.049844,0.040521,-0.075,-0.010153,-0.001048,-0.015964,-0.019645,0.119336,7.5e-05,-0.170995,0.001797,0.003085,-0.036742,0.065712,0.00053,3e-06,-4.508414e-07,5e-06,0.000783,0.000154,-0.002568,-0.058443,0.004904,0.002268,-0.003947,0.000239,-0.025528,-0.041064,0.005288,0.001411,-0.000217,-0.000418,-0.005244


In [104]:
### Reading from csv
female_df = pd.read_csv("data/gendered/gender_f_cleaned.csv", header=None)
male_df = pd.read_csv("data/gendered/gender_m_cleaned.csv", header=None)
neutral_df = pd.read_csv("data/gender_neutral/gender_n_cleaned.csv", header=None)

female_words_target = female_df[0].dropna().apply(to_conceptnet_uri).tolist()
male_words_target = male_df[0].dropna().apply(to_conceptnet_uri).tolist()
neutral_words_target = neutral_df[0].dropna().apply(to_conceptnet_uri).tolist()

In [105]:
### Calculation of heuristics (Benchmark set as this has no edits done)
gender_pairs = [("/c/en/man", "/c/en/woman")]
gender_pairs2 = [("/c/en/men", "/c/en/women")]
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# --- Run calculations ---
gender_dir = compute_gender_direction(gender_pairs, gender_pairs2, ppmi_df)
# SAME bias calculations
bias_same_neutral = delta_same(neutral_words_target, attribute_set_a, attribute_set_b, ppmi_df)
bias_same_male = delta_same(male_words_target, attribute_set_a, attribute_set_b, ppmi_df)
bias_same_female = delta_same(female_words_target, attribute_set_a, attribute_set_b, ppmi_df)
# Direct bias calculations
bias_direct = direct_bias("/c/en/doctor", gender_dir, ppmi_df)
# Saving direct bias to csv 
output_dir = "data/heuristic/directBias/benchmark"
os.makedirs(output_dir, exist_ok=True)
female_biases = direct_bias_wordlist(female_words_target, gender_dir, ppmi_df, "female", "data/heuristic/directBias/benchmark/female_bias.csv")
male_biases = direct_bias_wordlist(male_words_target, gender_dir, ppmi_df, "male", "data/heuristic/directBias/benchmark/male_bias.csv")
neutral_biases = direct_bias_wordlist(neutral_words_target, gender_dir, ppmi_df, "neutral", "data/heuristic/directBias/benchmark/neutral_bias.csv")

print("Gender direction (preview):", gender_dir[:5])
print("SAME bias score neutral:", bias_same_neutral)
print("SAME bias score male:", bias_same_male)
print("SAME bias score female:", bias_same_female)
print("Direct bias for '/c/en/doctor':", bias_direct)

Gender direction (preview): [-5.22156241e-16 -9.12611510e-05  2.26672453e-05 -5.91807779e-06
 -9.24422121e-16]
SAME bias score neutral: 0.020677905963184078
SAME bias score male: 0.26932468067546717
SAME bias score female: -0.5780757625079354
Direct bias for '/c/en/doctor': 0.6830964319194655


In [106]:
### Hypothesis 1: Does incorporating graph structure into word embeddings reduce bias?
sharded_retrofit(
    dense_hdf_filename="data/conceptnet_api/hdf/test.hdf",
    conceptnet_filename="data/conceptnet_api/csv/edge_extract.csv",
    output_filename="data/conceptnet_api/retrofit/test_retrofitted"
)

join_shards(output_filename="data/conceptnet_api/retrofit/test_retrofitted", nshards=8, sort=False)

retrofitted_ppmi = pd.read_hdf("data/conceptnet_api/retrofit/test_retrofitted")

gender_pairs = [("/c/en/man", "/c/en/woman")]
gender_pairs2 = [("/c/en/men", "/c/en/women")]
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# --- Run calculations ---
gender_dir = compute_gender_direction(gender_pairs, gender_pairs2, retrofitted_ppmi)
# SAME bias calculations
bias_same_neutral = delta_same(neutral_words_target, attribute_set_a, attribute_set_b, retrofitted_ppmi)
bias_same_male = delta_same(male_words_target, attribute_set_a, attribute_set_b, retrofitted_ppmi)
bias_same_female = delta_same(female_words_target, attribute_set_a, attribute_set_b, retrofitted_ppmi)
# Direct bias calculations
bias_direct = direct_bias("/c/en/doctor", gender_dir, retrofitted_ppmi)
# Saving direct bias to csv 
output_dir = "data/heuristic/directBias/hypothesis1"
os.makedirs(output_dir, exist_ok=True)
female_biases = direct_bias_wordlist(female_words_target, gender_dir, retrofitted_ppmi, "female", "data/heuristic/directBias/hypothesis1/female_bias.csv")
male_biases = direct_bias_wordlist(male_words_target, gender_dir, retrofitted_ppmi, "male", "data/heuristic/directBias/hypothesis1/male_bias.csv")
neutral_biases = direct_bias_wordlist(neutral_words_target, gender_dir, retrofitted_ppmi, "neutral", "data/heuristic/directBias/hypothesis1/neutral_bias.csv")

print("Gender direction (preview):", gender_dir[:5])
print("SAME bias score neutral:", bias_same_neutral)
print("SAME bias score male:", bias_same_male)
print("SAME bias score female:", bias_same_female)
print("Direct bias for '/c/en/doctor':", bias_direct)

Gender direction (preview): [-7.2579083e-17 -1.9953368e-05 -1.9663707e-06 -1.4625811e-05
 -4.0829847e-17]
SAME bias score neutral: nan
SAME bias score male: nan
SAME bias score female: -0.42026755
Direct bias for '/c/en/doctor': 0.5245204


  return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))


In [107]:
#### Code above was functionality testing, code below demonstrates actual work tried
# Variation 1: Small dataset + Filter (Antonyms)
# First have to edit scraper for filtering.
df = pd.read_csv("data/conceptnet_api/csv/edge_extractVar1.csv")
# print(df.shape)
# print(df['weight'].describe())
# df.head(3)
ppmi_df_var1 = build_ppmi(conceptnet_filename="data/conceptnet_api/csv/edge_extractVar1.csv", ndim=300)
save_hdf(ppmi_df_var1, filename='data/conceptnet_api/hdf/testVar1.hdf')

gender_pairs = [("/c/en/man", "/c/en/woman")]
gender_pairs2 = [("/c/en/men", "/c/en/women")]
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# --- Run calculations ---
gender_dir = compute_gender_direction(gender_pairs, gender_pairs2, ppmi_df_var1)
# SAME bias calculations
bias_same_neutral = delta_same(neutral_words_target, attribute_set_a, attribute_set_b, ppmi_df_var1)
bias_same_male = delta_same(male_words_target, attribute_set_a, attribute_set_b, ppmi_df_var1)
bias_same_female = delta_same(female_words_target, attribute_set_a, attribute_set_b, ppmi_df_var1)
# Direct bias calculations
bias_direct = direct_bias("/c/en/doctor", gender_dir, ppmi_df_var1)
# Saving direct bias to csv 
output_dir = "data/heuristic/directBias/variation1"
os.makedirs(output_dir, exist_ok=True)
female_biases = direct_bias_wordlist(female_words_target, gender_dir, ppmi_df_var1, "female", "data/heuristic/directBias/variation1/female_bias.csv")
male_biases = direct_bias_wordlist(male_words_target, gender_dir, ppmi_df_var1, "male", "data/heuristic/directBias/variation1/male_bias.csv")
neutral_biases = direct_bias_wordlist(neutral_words_target, gender_dir, ppmi_df_var1, "neutral", "data/heuristic/directBias/variation1/neutral_bias.csv")

print("Gender direction (preview):", gender_dir[:5])
print("SAME bias score neutral:", bias_same_neutral)
print("SAME bias score male:", bias_same_male)
print("SAME bias score female:", bias_same_female)
print("Direct bias for '/c/en/doctor':", bias_direct)

Gender direction (preview): [-1.34992619e-15 -3.14803562e-15 -2.66589376e-05 -2.18844882e-05
  3.24484993e-06]
SAME bias score neutral: 0.020635123338666708
SAME bias score male: 0.2689158195800159
SAME bias score female: -0.5813703747456964
Direct bias for '/c/en/doctor': 0.6820667020592693


In [108]:
# Variation 2: Small dataset + Filter ("/r/Antonym", "/r/NotDesires", "/r/Desires", "/r/ObstructedBy", "/r/MannerOf")
# First have to edit scraper for filtering.
df = pd.read_csv("data/conceptnet_api/csv/edge_extractVar2.csv")
# print(df.shape)
# print(df['weight'].describe())
# df.head(3)
ppmi_df_var2 = build_ppmi(conceptnet_filename="data/conceptnet_api/csv/edge_extractVar2.csv", ndim=300)
save_hdf(ppmi_df_var2, filename='data/conceptnet_api/hdf/testVar2.hdf')

gender_pairs = [("/c/en/man", "/c/en/woman")]
gender_pairs2 = [("/c/en/men", "/c/en/women")]
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# --- Run calculations ---
gender_dir = compute_gender_direction(gender_pairs, gender_pairs2, ppmi_df_var2)
# SAME bias calculations
bias_same_neutral = delta_same(neutral_words_target, attribute_set_a, attribute_set_b, ppmi_df_var2)
bias_same_male = delta_same(male_words_target, attribute_set_a, attribute_set_b, ppmi_df_var2)
bias_same_female = delta_same(female_words_target, attribute_set_a, attribute_set_b, ppmi_df_var2)
# Direct bias calculations
bias_direct = direct_bias("/c/en/doctor", gender_dir, ppmi_df_var2)
# Saving direct bias to csv 
output_dir = "data/heuristic/directBias/variation2"
os.makedirs(output_dir, exist_ok=True)
female_biases = direct_bias_wordlist(female_words_target, gender_dir, ppmi_df_var2, "female", "data/heuristic/directBias/variation2/female_bias.csv")
male_biases = direct_bias_wordlist(male_words_target, gender_dir, ppmi_df_var2, "male", "data/heuristic/directBias/variation2/male_bias.csv")
neutral_biases = direct_bias_wordlist(neutral_words_target, gender_dir, ppmi_df_var2, "neutral", "data/heuristic/directBias/variation2/neutral_bias.csv")

print("Gender direction (preview):", gender_dir[:5])
print("SAME bias score neutral:", bias_same_neutral)
print("SAME bias score male:", bias_same_male)
print("SAME bias score female:", bias_same_female)
print("Direct bias for '/c/en/doctor':", bias_direct)

Gender direction (preview): [ 3.66054305e-15 -1.09158502e-15  6.77944930e-16 -1.81073787e-15
  1.39121626e-16]
SAME bias score neutral: 0.019407439451583946
SAME bias score male: 0.2681759497124818
SAME bias score female: -0.5816567944429101
Direct bias for '/c/en/doctor': 0.6483519823562606


In [109]:
def check_vocab_coverage(word_list, ppmi_df):
    present = [w for w in word_list if w in ppmi_df.index]
    missing = [w for w in word_list if w not in ppmi_df.index]
    print(f"✔ Found {len(present)} words, ❌ Missing {len(missing)} words")
    return present, missing

print("Neutral coverage:")
_, _ = check_vocab_coverage(neutral_words_target, ppmi_df_var2)

print("Male coverage:")
_, _ = check_vocab_coverage(male_words_target, ppmi_df_var2)

print("Female coverage:")
_, _ = check_vocab_coverage(female_words_target, ppmi_df_var2)

print(ppmi_df_var2.index)

## From here, realised that we are not getting alot of coverage.

Neutral coverage:
✔ Found 115 words, ❌ Missing 5238 words
Male coverage:
✔ Found 29 words, ❌ Missing 494 words
Female coverage:
✔ Found 15 words, ❌ Missing 352 words
Index(['/c/en/help_child', '/c/en/adult', '/c/en/man', '/c/en/sign_contract',
       '/c/en/dress_herself', '/c/en/sheep', '/c/en/adult/n/wn/person',
       '/c/en/fascist/n/wn/person', '/c/en/man/n/wn/person',
       '/c/en/stay_at_home/n/wn/person',
       ...
       '/c/en/quarryman/n/wn/person', '/c/en/slave/n/wn/person',
       '/c/en/tier/n/wn/person', '/c/en/political_officer/n',
       '/c/en/employable/n/wn/person', '/c/en/throwster/n/wn/person',
       '/c/en/freelance/n/wn/person', '/c/en/skidder/n/wn/person',
       '/c/en/solderer/n/wn/person', '/c/en/bleacher/n/wn/person'],
      dtype='object', length=3958)


In [110]:
# Variation 3: Small dataset + Filter (Unidirectional edges)
# First have to edit scraper for filtering.
df = pd.read_csv("data/conceptnet_api/csv/edge_extractVar3.csv")
# print(df.shape)
# print(df['weight'].describe())
# df.head(3)
# Edit the Var numbers below. E.g. edge_extractVar<NUMBER> and testVar<NUMBER>
ppmi_df_var3 = build_ppmi(conceptnet_filename="data/conceptnet_api/csv/edge_extractVar3.csv", ndim=300)
save_hdf(ppmi_df_var3, filename='data/conceptnet_api/hdf/testVar3.hdf')

gender_pairs = [("/c/en/man", "/c/en/woman")]
gender_pairs2 = [("/c/en/men", "/c/en/women")]
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# --- Run calculations ---
gender_dir = compute_gender_direction(gender_pairs, gender_pairs2, ppmi_df_var3)
# SAME bias calculations
bias_same_neutral = delta_same(neutral_words_target, attribute_set_a, attribute_set_b, ppmi_df_var3)
bias_same_male = delta_same(male_words_target, attribute_set_a, attribute_set_b, ppmi_df_var3)
bias_same_female = delta_same(female_words_target, attribute_set_a, attribute_set_b, ppmi_df_var3)
# Direct bias calculations
bias_direct = direct_bias("/c/en/doctor", gender_dir, ppmi_df_var3)
# Saving direct bias to csv 
output_dir = "data/heuristic/directBias/variation3"
os.makedirs(output_dir, exist_ok=True)
female_biases = direct_bias_wordlist(female_words_target, gender_dir, ppmi_df_var3, "female", "data/heuristic/directBias/variation3/female_bias.csv")
male_biases = direct_bias_wordlist(male_words_target, gender_dir, ppmi_df_var3, "male", "data/heuristic/directBias/variation3/male_bias.csv")
neutral_biases = direct_bias_wordlist(neutral_words_target, gender_dir, ppmi_df_var3, "neutral", "data/heuristic/directBias/variation3/neutral_bias.csv")

print("Gender direction (preview):", gender_dir[:5])
print("SAME bias score neutral:", bias_same_neutral)
print("SAME bias score male:", bias_same_male)
print("SAME bias score female:", bias_same_female)
print("Direct bias for '/c/en/doctor':", bias_direct)

Gender direction (preview): [-1.07127059e-17  1.20317390e-15 -7.71549783e-16 -3.44391595e-16
  2.96811941e-05]
SAME bias score neutral: 0.02209098032974399
SAME bias score male: 0.22033426697084874
SAME bias score female: -0.5469579272834754
Direct bias for '/c/en/doctor': 0.6910107221797888


In [111]:
### Hypothesis 2:
## Scraping from ConceptNet
import filter_ablation
import importlib
import conceptnet_api_scraper
importlib.reload(conceptnet_api_scraper)
importlib.reload(filter_ablation)
from conceptnet_api_scraper import parse_response

JSON_PATH = os.path.join(os.getcwd(), "data", "conceptnet_api", "json")
CSV_PATH = os.path.join(os.getcwd(), "data", "conceptnet_api", "csv")

for filter_name, filter in filter_ablation.get_all_filter_chains().items(): 
  keywords_df = parse_response(input_folder=JSON_PATH, output_folder=CSV_PATH, edge_filter=filter)
  keywords_df.to_csv(f'{CSV_PATH}/edge_extract_{filter_name}.csv', index=False)

100%|██████████| 67/67 [00:00<00:00, 917.13it/s]
100%|██████████| 67/67 [00:00<00:00, 963.78it/s]
100%|██████████| 67/67 [00:00<00:00, 938.89it/s]
100%|██████████| 67/67 [00:00<00:00, 955.83it/s]
100%|██████████| 67/67 [00:00<00:00, 959.48it/s]
100%|██████████| 67/67 [00:00<00:00, 1004.28it/s]
100%|██████████| 67/67 [00:00<00:00, 997.11it/s]
100%|██████████| 67/67 [00:00<00:00, 983.96it/s]
100%|██████████| 67/67 [00:00<00:00, 932.69it/s]
100%|██████████| 67/67 [00:00<00:00, 960.36it/s]
100%|██████████| 67/67 [00:00<00:00, 1036.73it/s]
100%|██████████| 67/67 [00:00<00:00, 944.16it/s]
100%|██████████| 67/67 [00:00<00:00, 927.65it/s]
100%|██████████| 67/67 [00:00<00:00, 875.62it/s]
100%|██████████| 67/67 [00:00<00:00, 884.83it/s]
100%|██████████| 67/67 [00:00<00:00, 816.71it/s]
100%|██████████| 67/67 [00:00<00:00, 886.78it/s]
100%|██████████| 67/67 [00:00<00:00, 883.78it/s]
100%|██████████| 67/67 [00:00<00:00, 470.36it/s]
100%|██████████| 67/67 [00:00<00:00, 964.81it/s]
100%|██████████| 6

In [112]:
### Hypothesis 2: 
## Generating the heuristic scores
import os
import pandas as pd
from utils.retrofit import sharded_retrofit, join_shards

# === Paths ===
CSV_PATH = "data/conceptnet_api/csv"
HDF_PATH = "data/conceptnet_api/hdf"
RETROFIT_PATH = "data/conceptnet_api/retrofit"
BIAS_OUTPUT_ROOT = "data/heuristic/directBias"
EVAL_OUTPUT = "data/conceptnet_api/eval"
os.makedirs(HDF_PATH, exist_ok=True)
os.makedirs(RETROFIT_PATH, exist_ok=True)
os.makedirs(BIAS_OUTPUT_ROOT, exist_ok=True)
os.makedirs(EVAL_OUTPUT, exist_ok=True)

# === Bias Config ===
gender_pairs = [("/c/en/man", "/c/en/woman")]
gender_pairs2 = [("/c/en/men", "/c/en/women")]
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# === Target Word Lists ===

csv_files = [f for f in os.listdir(CSV_PATH) if f.startswith("edge_extract_") and f.endswith(".csv")]

for file in csv_files:
    filter_name = file.replace("edge_extract_", "").replace(".csv", "")
    print(f"\n🚀 Processing filter variation: {filter_name}")

    # Step 1: Build PPMI
    input_csv = os.path.join(CSV_PATH, file)
    dense_hdf_path = os.path.join(HDF_PATH, f"test_{filter_name}.hdf")
    ppmi_df = build_ppmi(conceptnet_filename=input_csv, ndim=128)
    save_hdf(ppmi_df, filename=dense_hdf_path)

    # Step 2: Retrofitting
    retrofit_prefix = os.path.join(RETROFIT_PATH, f"test_retrofitted_{filter_name}")
    sharded_retrofit(
        dense_hdf_filename=dense_hdf_path,
        conceptnet_filename=input_csv,
        output_filename=retrofit_prefix
    )

    # Step 3: Join shards
    join_shards(output_filename=retrofit_prefix, nshards=8, sort=False)

    # Step 4: Load retrofitted PPMI
    retrofitted_hdf = retrofit_prefix  # joined result has no extension
    if not os.path.exists(retrofitted_hdf):
        print(f"⚠️ Skipping {filter_name}, retrofitted file not found")
        continue

    ppmi_df = pd.read_hdf(retrofitted_hdf)

    # Step 5: Compute bias heuristics
    gender_dir = compute_gender_direction(gender_pairs, gender_pairs2, ppmi_df)

    bias_same_neutral = delta_same(neutral_words_target, attribute_set_a, attribute_set_b, ppmi_df)
    bias_same_male = delta_same(male_words_target, attribute_set_a, attribute_set_b, ppmi_df)
    bias_same_female = delta_same(female_words_target, attribute_set_a, attribute_set_b, ppmi_df)
    bias_direct_doctor = direct_bias("/c/en/doctor", gender_dir, ppmi_df)

    # Step 6: Save direct bias scores
    output_dir = os.path.join(BIAS_OUTPUT_ROOT, f"variation_{filter_name}")
    os.makedirs(output_dir, exist_ok=True)

    direct_bias_wordlist(female_words_target, gender_dir, ppmi_df, "female", os.path.join(output_dir, "female_bias.csv"))
    direct_bias_wordlist(male_words_target, gender_dir, ppmi_df, "male", os.path.join(output_dir, "male_bias.csv"))
    direct_bias_wordlist(neutral_words_target, gender_dir, ppmi_df, "neutral", os.path.join(output_dir, "neutral_bias.csv"))

    # Step 7: Summary print
    print("📊 SAME Bias (Neutral):", bias_same_neutral)
    print("📊 SAME Bias (Male):", bias_same_male)
    print("📊 SAME Bias (Female):", bias_same_female)
    print("📊 Direct Bias ('/c/en/doctor'):", bias_direct_doctor)
    print(f"✅ Completed variation: {filter_name}")

    # Step 8: Save results summary to a .txt file
    summary_path = os.path.join(EVAL_OUTPUT, "eval_summary.txt")
    with open(summary_path, "a") as f:
        f.write(f"===== Bias Results for Filter: {filter_name} =====\n")
        f.write(f"SAME Bias (Neutral): {bias_same_neutral:.4f}\n")
        f.write(f"SAME Bias (Male):    {bias_same_male:.4f}\n")
        f.write(f"SAME Bias (Female):  {bias_same_female:.4f}\n")
        f.write(f"Direct Bias ('/c/en/doctor'): {bias_direct_doctor:.4f}\n")
        f.write("--------------------------------------------------\n\n")



🚀 Processing filter variation: baseline
📊 SAME Bias (Neutral): 0.031613678
📊 SAME Bias (Male): 0.25584662
📊 SAME Bias (Female): -0.5171609
📊 Direct Bias ('/c/en/doctor'): 0.45210114
✅ Completed variation: baseline

🚀 Processing filter variation: baseline_lenient
📊 SAME Bias (Neutral): 0.03150226
📊 SAME Bias (Male): 0.22880755
📊 SAME Bias (Female): -0.35601062
📊 Direct Bias ('/c/en/doctor'): 0.47868145
✅ Completed variation: baseline_lenient

🚀 Processing filter variation: baseline_strict
📊 SAME Bias (Neutral): 0.03109293
📊 SAME Bias (Male): 0.37316307
📊 SAME Bias (Female): -0.4226548
📊 Direct Bias ('/c/en/doctor'): 0.48331773
✅ Completed variation: baseline_strict

🚀 Processing filter variation: comprehensive
📊 SAME Bias (Neutral): 0.05553581
📊 SAME Bias (Male): 0.21294
📊 SAME Bias (Female): -0.42436472
📊 Direct Bias ('/c/en/doctor'): 0.50855494
✅ Completed variation: comprehensive

🚀 Processing filter variation: comprehensive_high_quality
📊 SAME Bias (Neutral): 0.05746886
📊 SAME Bias