In [82]:
# Import statements
import os
import numpy as np
import pandas as pd

from utils.ppmi import build_ppmi
from utils.formats import load_hdf, save_hdf

In [83]:
### Test heuristics (Refactored for use with PPMI)

import numpy as np
import pandas as pd

def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors.

    Args:
        vec1: 1-D array-like of numbers.
        vec2: 1-D array-like of numbers with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either vector has
        zero norm the similarity is undefined; 0.0 is returned instead of
        dividing by zero, so callers that average similarities do not get
        ``nan`` (and a RuntimeWarning) from a single all-zero row.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A NaN product compares unequal to 0, so NaN inputs still propagate as NaN.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

def same_score(word, attribute_set, ppmi_df):
    """Average cosine similarity between ``word`` and the words of an attribute set.

    Args:
        word: Row label to look up in ``ppmi_df``.
        attribute_set: Iterable of row labels; labels absent from
            ``ppmi_df.index`` are ignored.
        ppmi_df: DataFrame whose index holds the vocabulary and whose rows
            are the vectors.

    Returns:
        The mean similarity, or 0 when ``word`` is not in the index or no
        attribute word is.
    """
    vocab = ppmi_df.index
    if word not in vocab:
        return 0

    target_vec = ppmi_df.loc[word].values
    sims = []
    for attr in attribute_set:
        if attr not in vocab:
            continue
        sims.append(cosine_similarity(target_vec, ppmi_df.loc[attr].values))

    if not sims:
        return 0
    return np.mean(sims)

def delta_same(target_set, attribute_set_a, attribute_set_b, ppmi_df):
    """Difference of mean SAME scores of a target set towards two attribute sets.

    Target words missing from ``ppmi_df.index`` are left out of both means.

    Returns:
        ``mean(scores towards A) - mean(scores towards B)``, or 0 when
        either score list is empty.
    """
    vocab = ppmi_df.index

    def _scores_towards(attribute_set):
        # One SAME score per target word that exists in the vocabulary.
        return [
            same_score(word, attribute_set, ppmi_df)
            for word in target_set
            if word in vocab
        ]

    scores_a = _scores_towards(attribute_set_a)
    scores_b = _scores_towards(attribute_set_b)
    if not (scores_a and scores_b):
        return 0
    return np.mean(scores_a) - np.mean(scores_b)

def compute_gender_direction(gender_pairs, ppmi_df):
    """Average the (male - female) row differences over the given word pairs.

    Args:
        gender_pairs: Iterable of ``(male, female)`` row labels.
        ppmi_df: DataFrame indexed by word, one vector per row.

    Returns:
        The element-wise mean of the difference vectors of all usable pairs.

    Raises:
        ValueError: If no pair has both words present in ``ppmi_df.index``.
    """
    vocab = ppmi_df.index
    offsets = []
    for male, female in gender_pairs:
        if not (male in vocab and female in vocab):
            # Report the unusable pair and move on to the next one.
            print(f"Skipping pair ({male}, {female}) — one or both not in PPMI.")
            continue
        offsets.append(ppmi_df.loc[male].values - ppmi_df.loc[female].values)

    if len(offsets) == 0:
        raise ValueError("No valid gender pairs found in PPMI.")

    return np.mean(offsets, axis=0)

def direct_bias(word, gender_direction, ppmi_df):
    """Absolute cosine similarity between a word's vector and the gender direction.

    Returns:
        ``abs(cosine_similarity(row of word, gender_direction))``, or ``None``
        when ``word`` is not in ``ppmi_df.index``.
    """
    if word in ppmi_df.index:
        word_vec = ppmi_df.loc[word].values
        return abs(cosine_similarity(word_vec, gender_direction))
    return None

def direct_bias_wordlist(word_list, gender_dir, ppmi_df, label, output_path):
    """Compute direct bias for every word in a list and save the table as CSV.

    Args:
        word_list: Iterable of row labels to score.
        gender_dir: Gender direction vector passed on to ``direct_bias``.
        ppmi_df: DataFrame indexed by word, one vector per row.
        label: Value stored in the ``group`` column for words that are found.
        output_path: Destination of the CSV file (written without the index).

    Returns:
        DataFrame with one record per input word and the keys ``word``,
        ``bias`` and ``group``. Words absent from ``ppmi_df.index`` get
        ``None`` for both ``bias`` and ``group``.
    """
    def _record(word):
        # Words outside the vocabulary keep a placeholder row.
        if word not in ppmi_df.index:
            return {"word": word, "bias": None, "group": None}
        return {"word": word, "bias": direct_bias(word, gender_dir, ppmi_df), "group": label}

    df = pd.DataFrame([_record(word) for word in word_list])
    df.to_csv(output_path, index=False)
    return df

def to_conceptnet_uri(word):
    """Turn a plain word or phrase into an English ConceptNet-style URI.

    Surrounding whitespace is stripped, the text is lower-cased, every space
    becomes an underscore and the result is prefixed with ``/c/en/``.
    """
    term = "_".join(word.strip().lower().split(" "))
    return f"/c/en/{term}"



In [84]:
### Variable declarations
# Load the embedding table that the benchmark cell below passes around as `ppmi_df`.
# NOTE(review): `load_hdf` is imported from utils.formats but `pd.read_hdf` is
# used here — confirm the two read the file the same way.
ppmi_df = pd.read_hdf('data/conceptnet_api/hdf/test.hdf')
# Optional sanity checks:
# ppmi_df.head()
# print(ppmi_df.shape)

In [85]:
### Reading from csv
# Each file is read without a header row; only its first column is used.
female_df = pd.read_csv("data/gendered/gender_f_cleaned.csv", header=None)
male_df = pd.read_csv("data/gendered/gender_m_cleaned.csv", header=None)
neutral_df = pd.read_csv("data/gender_neutral/gender_neutral.csv", header=None)

# Drop empty entries, then map every word onto its ConceptNet URI.
female_words_target = [to_conceptnet_uri(word) for word in female_df[0].dropna()]
male_words_target = [to_conceptnet_uri(word) for word in male_df[0].dropna()]
neutral_words_target = [to_conceptnet_uri(word) for word in neutral_df[0].dropna()]

In [86]:
### Calculation of heuristics (Benchmark set as this has no edits done)
# Every calculation in this cell uses the benchmark table `ppmi_df`. It must not
# reference `ppmi_df_var1`: that name is only created in a later cell, so using
# it here fails on a fresh kernel and, with leftover state, writes Variation 1
# numbers into the benchmark CSV files.
gender_pairs = [("/c/en/man", "/c/en/woman")]
target_set = ["/c/en/doctor", "/c/en/nurse", "/c/en/engineer", "/c/en/teacher"]
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# --- Run calculations ---
gender_dir = compute_gender_direction(gender_pairs, ppmi_df)
# SAME bias calculations
bias_same_neutral = delta_same(neutral_words_target, attribute_set_a, attribute_set_b, ppmi_df)
bias_same_male = delta_same(male_words_target, attribute_set_a, attribute_set_b, ppmi_df)
bias_same_female = delta_same(female_words_target, attribute_set_a, attribute_set_b, ppmi_df)
# Direct bias calculations
bias_direct = direct_bias("/c/en/doctor", gender_dir, ppmi_df)
# Saving direct bias to csv (paths are derived from output_dir so they cannot drift apart)
output_dir = "data/heuristic/directBias/benchmark"
os.makedirs(output_dir, exist_ok=True)
female_biases = direct_bias_wordlist(female_words_target, gender_dir, ppmi_df, "female", f"{output_dir}/female_bias.csv")
male_biases = direct_bias_wordlist(male_words_target, gender_dir, ppmi_df, "male", f"{output_dir}/male_bias.csv")
neutral_biases = direct_bias_wordlist(neutral_words_target, gender_dir, ppmi_df, "neutral", f"{output_dir}/neutral_bias.csv")

print("Gender direction (preview):", gender_dir[:5])
print("SAME bias score neutral:", bias_same_neutral)
print("SAME bias score male:", bias_same_male)
print("SAME bias score female:", bias_same_female)
print("Direct bias for '/c/en/doctor':", bias_direct)

Gender direction (preview): [-5.22156241e-16 -9.12611510e-05  2.26672453e-05 -5.91807779e-06
 -9.24422121e-16]
SAME bias score neutral: -0.0005088940741486726
SAME bias score male: 0.26932468067546717
SAME bias score female: -0.5780757625079354
Direct bias for '/c/en/doctor': 0.6830964319194655


In [87]:
#### Everything above was functionality testing; the cells from here on are the actual experiments.
# Variation 1: small dataset + filter (Antonyms).
# The scraper has to be edited first so that it applies this filter.
df = pd.read_csv("data/conceptnet_api/csv/edge_extractVar1.csv")
# Optional sanity checks on the raw edge table:
# print(df.shape)
# print(df['weight'].describe())
# df.head(3)
ppmi_df_var1 = build_ppmi(
    conceptnet_filename="data/conceptnet_api/csv/edge_extractVar1.csv",
    ndim=300,
)
save_hdf(ppmi_df_var1, filename="data/conceptnet_api/hdf/testVar1.hdf")

gender_pairs = [("/c/en/man", "/c/en/woman")]
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# --- Run calculations ---
gender_dir = compute_gender_direction(gender_pairs, ppmi_df_var1)

# SAME bias of each target list towards the male (a) vs. female (b) attribute words
bias_same_neutral = delta_same(
    neutral_words_target, attribute_set_a, attribute_set_b, ppmi_df_var1
)
bias_same_male = delta_same(
    male_words_target, attribute_set_a, attribute_set_b, ppmi_df_var1
)
bias_same_female = delta_same(
    female_words_target, attribute_set_a, attribute_set_b, ppmi_df_var1
)

# Direct bias of a single probe word
bias_direct = direct_bias("/c/en/doctor", gender_dir, ppmi_df_var1)

# Per-word direct bias, one CSV per group inside output_dir
output_dir = "data/heuristic/directBias/variation1"
os.makedirs(output_dir, exist_ok=True)
female_biases = direct_bias_wordlist(
    female_words_target, gender_dir, ppmi_df_var1, "female", f"{output_dir}/female_bias.csv"
)
male_biases = direct_bias_wordlist(
    male_words_target, gender_dir, ppmi_df_var1, "male", f"{output_dir}/male_bias.csv"
)
neutral_biases = direct_bias_wordlist(
    neutral_words_target, gender_dir, ppmi_df_var1, "neutral", f"{output_dir}/neutral_bias.csv"
)

print("Gender direction (preview):", gender_dir[:5])
print("SAME bias score neutral:", bias_same_neutral)
print("SAME bias score male:", bias_same_male)
print("SAME bias score female:", bias_same_female)
print("Direct bias for '/c/en/doctor':", bias_direct)

Gender direction (preview): [-2.58369379e-15 -3.42773790e-15 -2.66589376e-05 -2.18844882e-05
 -3.24484992e-06]
SAME bias score neutral: -0.0004291131460867348
SAME bias score male: 0.2689158195800157
SAME bias score female: -0.5813703747456963
Direct bias for '/c/en/doctor': 0.6820667020592693


In [88]:
# Variation 2: small dataset + filter ("/r/Antonym", "/r/NotDesires", "/r/Desires", "/r/ObstructedBy", "/r/MannerOf").
# The scraper has to be edited first so that it applies this filter.
df = pd.read_csv("data/conceptnet_api/csv/edge_extractVar2.csv")
# Optional sanity checks on the raw edge table:
# print(df.shape)
# print(df['weight'].describe())
# df.head(3)
ppmi_df_var2 = build_ppmi(
    conceptnet_filename="data/conceptnet_api/csv/edge_extractVar2.csv",
    ndim=300,
)
save_hdf(ppmi_df_var2, filename="data/conceptnet_api/hdf/testVar2.hdf")

gender_pairs = [("/c/en/man", "/c/en/woman")]
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# --- Run calculations ---
gender_dir = compute_gender_direction(gender_pairs, ppmi_df_var2)

# SAME bias of each target list towards the male (a) vs. female (b) attribute words
bias_same_neutral = delta_same(
    neutral_words_target, attribute_set_a, attribute_set_b, ppmi_df_var2
)
bias_same_male = delta_same(
    male_words_target, attribute_set_a, attribute_set_b, ppmi_df_var2
)
bias_same_female = delta_same(
    female_words_target, attribute_set_a, attribute_set_b, ppmi_df_var2
)

# Direct bias of a single probe word
bias_direct = direct_bias("/c/en/doctor", gender_dir, ppmi_df_var2)

# Per-word direct bias, one CSV per group inside output_dir
output_dir = "data/heuristic/directBias/variation2"
os.makedirs(output_dir, exist_ok=True)
female_biases = direct_bias_wordlist(
    female_words_target, gender_dir, ppmi_df_var2, "female", f"{output_dir}/female_bias.csv"
)
male_biases = direct_bias_wordlist(
    male_words_target, gender_dir, ppmi_df_var2, "male", f"{output_dir}/male_bias.csv"
)
neutral_biases = direct_bias_wordlist(
    neutral_words_target, gender_dir, ppmi_df_var2, "neutral", f"{output_dir}/neutral_bias.csv"
)

print("Gender direction (preview):", gender_dir[:5])
print("SAME bias score neutral:", bias_same_neutral)
print("SAME bias score male:", bias_same_male)
print("SAME bias score female:", bias_same_female)
print("Direct bias for '/c/en/doctor':", bias_direct)

Gender direction (preview): [-3.58110229e-16 -4.51613485e-15  1.84086028e-15  4.83755087e-15
  3.60902266e-15]
SAME bias score neutral: -0.0005454765421955589
SAME bias score male: 0.26817594971248154
SAME bias score female: -0.5816567944429107
Direct bias for '/c/en/doctor': 0.6483519823562611


In [89]:
def check_vocab_coverage(word_list, ppmi_df):
    """Split a word list into words present in / missing from the vocabulary.

    The words are consumed in a single pass, so one-shot iterables
    (generators, iterators) are classified correctly instead of leaving
    ``missing`` empty after the first traversal exhausts them.

    Args:
        word_list: Iterable of row labels to look up.
        ppmi_df: DataFrame whose index is the vocabulary.

    Returns:
        ``(present, missing)`` — two lists that keep the input order.
        A one-line summary with both counts is printed as a side effect.
    """
    vocab = ppmi_df.index
    present, missing = [], []
    for w in word_list:
        (present if w in vocab else missing).append(w)
    print(f"✔ Found {len(present)} words, ❌ Missing {len(missing)} words")
    return present, missing

# Report, for each target word list, how many of its words exist in the
# Variation 2 vocabulary (the index of ppmi_df_var2). Only the printed summary
# is wanted here, so both returned lists are discarded.
print("Neutral coverage:")
_, _ = check_vocab_coverage(neutral_words_target, ppmi_df_var2)

print("Male coverage:")
_, _ = check_vocab_coverage(male_words_target, ppmi_df_var2)

print("Female coverage:")
_, _ = check_vocab_coverage(female_words_target, ppmi_df_var2)

# Show the vocabulary labels themselves for manual inspection.
print(ppmi_df_var2.index)

## Finding: coverage is low — most of the male and female target words are
## missing from the vocabulary.
## NOTE(review): many index labels carry a sense suffix such as "/n/wn/person",
## which plain "/c/en/<word>" lookups would not match — TODO confirm whether
## that explains part of the gap.

Neutral coverage:
✔ Found 57 words, ❌ Missing 17 words
Male coverage:
✔ Found 29 words, ❌ Missing 494 words
Female coverage:
✔ Found 15 words, ❌ Missing 352 words
Index(['/c/en/help_child', '/c/en/adult', '/c/en/man', '/c/en/sign_contract',
       '/c/en/dress_herself', '/c/en/sheep', '/c/en/adult/n/wn/person',
       '/c/en/fascist/n/wn/person', '/c/en/man/n/wn/person',
       '/c/en/stay_at_home/n/wn/person',
       ...
       '/c/en/quarryman/n/wn/person', '/c/en/slave/n/wn/person',
       '/c/en/tier/n/wn/person', '/c/en/political_officer/n',
       '/c/en/employable/n/wn/person', '/c/en/throwster/n/wn/person',
       '/c/en/freelance/n/wn/person', '/c/en/skidder/n/wn/person',
       '/c/en/solderer/n/wn/person', '/c/en/bleacher/n/wn/person'],
      dtype='object', length=3958)


In [90]:
# Variation 3: small dataset + filter (unidirectional edges).
# The scraper has to be edited first so that it applies this filter.
df = pd.read_csv("data/conceptnet_api/csv/edge_extractVar3.csv")
# Optional sanity checks on the raw edge table:
# print(df.shape)
# print(df['weight'].describe())
# df.head(3)
# When copying this cell, change the Var numbers below
# (edge_extractVar<NUMBER> and testVar<NUMBER>).
ppmi_df_var3 = build_ppmi(
    conceptnet_filename="data/conceptnet_api/csv/edge_extractVar3.csv",
    ndim=300,
)
save_hdf(ppmi_df_var3, filename="data/conceptnet_api/hdf/testVar3.hdf")

gender_pairs = [("/c/en/man", "/c/en/woman")]
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# --- Run calculations ---
gender_dir = compute_gender_direction(gender_pairs, ppmi_df_var3)

# SAME bias of each target list towards the male (a) vs. female (b) attribute words
bias_same_neutral = delta_same(
    neutral_words_target, attribute_set_a, attribute_set_b, ppmi_df_var3
)
bias_same_male = delta_same(
    male_words_target, attribute_set_a, attribute_set_b, ppmi_df_var3
)
bias_same_female = delta_same(
    female_words_target, attribute_set_a, attribute_set_b, ppmi_df_var3
)

# Direct bias of a single probe word
bias_direct = direct_bias("/c/en/doctor", gender_dir, ppmi_df_var3)

# Per-word direct bias, one CSV per group inside output_dir
output_dir = "data/heuristic/directBias/variation3"
os.makedirs(output_dir, exist_ok=True)
female_biases = direct_bias_wordlist(
    female_words_target, gender_dir, ppmi_df_var3, "female", f"{output_dir}/female_bias.csv"
)
male_biases = direct_bias_wordlist(
    male_words_target, gender_dir, ppmi_df_var3, "male", f"{output_dir}/male_bias.csv"
)
neutral_biases = direct_bias_wordlist(
    neutral_words_target, gender_dir, ppmi_df_var3, "neutral", f"{output_dir}/neutral_bias.csv"
)

print("Gender direction (preview):", gender_dir[:5])
print("SAME bias score neutral:", bias_same_neutral)
print("SAME bias score male:", bias_same_male)
print("SAME bias score female:", bias_same_female)
print("Direct bias for '/c/en/doctor':", bias_direct)

Gender direction (preview): [-1.48005486e-15  1.40020227e-14  8.57734504e-16  4.60818214e-15
  7.30527111e-15]
SAME bias score neutral: 0.00016561382623995688
SAME bias score male: 0.2811464336126861
SAME bias score female: -0.5822122091100196
Direct bias for '/c/en/doctor': 0.6898718438347983
