In [34]:
# Import statements
import numpy as np
import pandas as pd

from utils.ppmi import build_ppmi
from utils.formats import load_hdf, save_hdf

In [None]:
### Test heuristics (Refactored for use with PPMI)

import numpy as np
import pandas as pd

def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors.

    Args:
        vec1: 1-D array-like of numbers.
        vec2: 1-D array-like of numbers with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either vector has
        zero norm the cosine is undefined; 0.0 is returned in that case
        instead of NaN so that averages computed by callers are not poisoned.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero denominator means at least one all-zero vector: dividing would
    # produce NaN (0/0) plus a RuntimeWarning, so report "no similarity".
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

def same_score(word, attribute_set, ppmi_df):
    """Average cosine similarity of ``word`` against the known attribute words.

    Attribute words that are absent from ``ppmi_df.index`` are ignored.
    Returns 0 when ``word`` itself is absent, or when none of the attribute
    words are present.
    """
    vocab = ppmi_df.index
    if word not in vocab:
        return 0

    target_vec = ppmi_df.loc[word].values
    sims = []
    for attr in attribute_set:
        if attr in vocab:
            sims.append(cosine_similarity(target_vec, ppmi_df.loc[attr].values))

    if not sims:
        return 0
    return np.mean(sims)

def delta_same(target_set, attribute_set_a, attribute_set_b, ppmi_df):
    """SAME-style bias: mean score against set A minus mean score against set B.

    Only target words present in ``ppmi_df.index`` contribute. Returns 0 when
    either side ends up with no scores.
    """
    side_means = []
    for attributes in (attribute_set_a, attribute_set_b):
        # Each side gets its own pass over target_set, like the two original
        # comprehensions.
        side_scores = [
            same_score(target, attributes, ppmi_df)
            for target in target_set
            if target in ppmi_df.index
        ]
        if not side_scores:
            return 0
        side_means.append(np.mean(side_scores))

    mean_a, mean_b = side_means
    return mean_a - mean_b

def compute_gender_direction(gender_pairs, ppmi_df):
    """Average the (male - female) row differences over the given word pairs.

    Pairs with either word missing from ``ppmi_df.index`` are reported on
    stdout and skipped. Raises ``ValueError`` when no pair is usable.
    """
    vocab = ppmi_df.index
    offsets = []
    for male, female in gender_pairs:
        if not (male in vocab and female in vocab):
            print(f"Skipping pair ({male}, {female}) — one or both not in PPMI.")
            continue
        male_vec = ppmi_df.loc[male].values
        female_vec = ppmi_df.loc[female].values
        offsets.append(male_vec - female_vec)

    if len(offsets) == 0:
        raise ValueError("No valid gender pairs found in PPMI.")

    # Element-wise mean across all usable pairs.
    return np.mean(offsets, axis=0)

def direct_bias(word, gender_direction, ppmi_df):
    """Absolute cosine similarity between a word's row and the gender direction.

    Returns ``None`` when ``word`` is not in ``ppmi_df.index``.
    """
    if word in ppmi_df.index:
        word_vec = ppmi_df.loc[word].values
        return abs(cosine_similarity(word_vec, gender_direction))
    return None

def direct_bias_wordlist(word_list, gender_dir, ppmi_df, label, output_path):
    """Compute direct bias for every word in a list and save the table as CSV.

    Args:
        word_list: Iterable of words (same key format as ``ppmi_df.index``).
        gender_dir: Gender direction vector passed through to ``direct_bias``.
        ppmi_df: DataFrame whose index is the vocabulary.
        label: Group name recorded for every word found in the vocabulary.
        output_path: Destination for the CSV. When it is a filesystem path,
            missing parent directories are created first.

    Returns:
        DataFrame with columns ``word``, ``bias``, ``group`` and one row per
        input word. Words absent from the vocabulary keep their row but have
        both ``bias`` and ``group`` set to None.
    """
    import os
    from pathlib import Path

    results = []
    for word in word_list:
        if word in ppmi_df.index:
            bias = direct_bias(word, gender_dir, ppmi_df)
            results.append({"word": word, "bias": bias, "group": label})
        else:
            results.append({"word": word, "bias": None, "group": None})  # Word not in PPMI

    # Pin the schema so an empty word_list still yields the three columns
    # (and a CSV header) instead of a column-less frame.
    df = pd.DataFrame(results, columns=["word", "bias", "group"])

    # to_csv does not create directories; do it here so a fresh checkout
    # without the nested output folders does not fail with OSError.
    # File-like targets are passed through untouched.
    if isinstance(output_path, (str, os.PathLike)):
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    df.to_csv(output_path, index=False)
    return df

def to_conceptnet_uri(word):
    """Turn a raw word into an English ConceptNet-style key.

    Surrounding whitespace is stripped, the text is lower-cased, each space
    becomes an underscore, and the result is prefixed with ``/c/en/``.
    """
    cleaned = word.strip().lower()
    # Splitting on a single space and re-joining keeps one "_" per space.
    slug = "_".join(cleaned.split(" "))
    return "/c/en/" + slug



In [None]:
### Variable declarations
# Load the embedding table that the "functionality testing" cells below run on.
# NOTE(review): the notebook imports `load_hdf` from utils.formats but reads
# the file with pandas directly here — confirm which loader is intended.
# Presumably rows are keyed by ConceptNet URI (the helper functions look words
# up via `ppmi_df.index`); verify against the file.
ppmi_df = pd.read_hdf('data/conceptnet_api/hdf/test.hdf')
# Quick inspection helpers (left disabled):
# ppmi_df.head()
# print(ppmi_df.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
/c/en/help_child,5.379521e-16,-1.3e-05,-2e-06,5.409838e-06,4.0223e-15,-1.130818e-15,5.853341e-17,9.179294000000001e-17,-4.842451e-16,1.788475e-15,...,0.002325,-0.004004,0.000241,-0.025836,-0.04182,0.005383,0.001438,-0.000221,-0.000418,-0.005307
/c/en/adult,-5.947637e-16,-3e-05,2e-06,4.268783e-06,4.570441e-15,-2.390922e-15,9.802262e-16,-1.523202e-15,-9.785777e-16,2.679285e-15,...,0.018061,-0.0304,0.001878,-0.202185,-0.345947,0.045245,0.012511,-0.001947,-0.003833,-0.052088
/c/en/man,3.273319e-16,-1.6e-05,4e-06,-8.463431e-07,-8.760094e-16,1.120815e-15,-7.305838e-16,-1.70874e-15,5.022793e-16,-7.084706e-16,...,0.155744,-0.142276,0.001892,-0.75451,-0.660803,0.098269,0.024405,-0.007568,0.03928,-0.059049
/c/en/sign_contract,-8.984204000000001e-17,-9e-06,-3e-06,5.824575e-06,4.030767e-15,-1.090103e-15,-1.204847e-16,3.375345e-16,-3.556234e-16,1.765867e-15,...,0.002268,-0.003947,0.000239,-0.025528,-0.041064,0.005288,0.001411,-0.000217,-0.000418,-0.005244
/c/en/dress_herself,-8.856595e-17,-9e-06,-3e-06,5.824575e-06,4.199817e-15,-1.090356e-15,1.005326e-16,1.788506e-16,4.7883530000000006e-17,1.832971e-15,...,0.002268,-0.003947,0.000239,-0.025528,-0.041064,0.005288,0.001411,-0.000217,-0.000418,-0.005244


In [37]:
### Calculation of heuristics
# Smoke test of the helper functions on the pre-built `ppmi_df`.
# Word pair(s) whose (male - female) difference defines the gender direction.
gender_pairs = [("/c/en/man", "/c/en/woman")]
# Target words scored against the two attribute sets below.
target_set = ["/c/en/doctor", "/c/en/nurse", "/c/en/engineer", "/c/en/teacher"]
# Attribute sets: A = male pronouns, B = female pronouns. Entries missing from
# the vocabulary are silently skipped by same_score.
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# --- Run calculations ---
# Mean (male - female) difference vector over the usable pairs.
gender_dir = compute_gender_direction(gender_pairs, ppmi_df)
# Mean similarity to set A minus mean similarity to set B over the targets.
bias_same = delta_same(target_set, attribute_set_a, attribute_set_b, ppmi_df)
# |cosine| between the word's row and the gender direction (None if the word
# is not in the vocabulary).
bias_direct = direct_bias("/c/en/doctor", gender_dir, ppmi_df)

print("Gender direction (preview):", gender_dir[:5])
print("SAME bias score:", bias_same)
print("Direct bias for '/c/en/doctor':", bias_direct)

Gender direction (preview): [-5.22156241e-16 -9.12611510e-05  2.26672453e-05 -5.91807779e-06
 -9.24422121e-16]
SAME bias score: 0.9663649438345743
Direct bias for '/c/en/doctor': 0.6830964319194655


In [38]:
### Reading from csv
# The word-list files are read with header=None, so the first column is
# labelled 0 and every row (including the first) is treated as data.
female_df = pd.read_csv("data/gendered/gender_f_cleaned.csv", header=None)
male_df = pd.read_csv("data/gendered/gender_m_cleaned.csv", header=None)
neutral_df = pd.read_csv("data/gender_neutral/gender_neutral.csv", header=None)

# Take column 0, drop missing entries, and convert each word to the
# "/c/en/<word>" key format so it can be looked up in the PPMI index.
female_words_target = female_df[0].dropna().apply(to_conceptnet_uri).tolist()
male_words_target = male_df[0].dropna().apply(to_conceptnet_uri).tolist()
neutral_words_target = neutral_df[0].dropna().apply(to_conceptnet_uri).tolist()

In [65]:
#### The cells above were functionality tests; the cells below are the actual experiments.
# Variation 1: Small dataset + Filter (Antonyms)
# Prerequisite: the scraper must first be edited to apply this filter.
# NOTE(review): this cell and the Variation 2 cell are near-identical copies
# differing only in the "Var1"/"Var2" file names and "variation1"/"variation2"
# output folders — a shared helper taking the variation name would remove the
# duplication.
# `df` is only used by the (disabled) inspection lines below; build_ppmi is
# given the same file name rather than this frame.
df = pd.read_csv("data/conceptnet_api/csv/edge_extractVar1.csv")
# print(df.shape)
# print(df['weight'].describe())
# df.head(3)
# Build the embedding table for this variation (project helper; presumably a
# 300-dimensional PPMI embedding given ndim=300 — confirm in utils.ppmi) and
# persist it.
ppmi_df_var1 = build_ppmi(conceptnet_filename="data/conceptnet_api/csv/edge_extractVar1.csv", ndim=300)
save_hdf(ppmi_df_var1, filename='data/conceptnet_api/hdf/testVar1.hdf')

# Same pair / attribute definitions as the smoke-test cell; re-assigned here,
# which overwrites the notebook-level names.
gender_pairs = [("/c/en/man", "/c/en/woman")]
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# --- Run calculations ---
gender_dir = compute_gender_direction(gender_pairs, ppmi_df_var1)
# SAME bias calculations
# One score per target word list (neutral / male / female).
bias_same_neutral = delta_same(neutral_words_target, attribute_set_a, attribute_set_b, ppmi_df_var1)
bias_same_male = delta_same(male_words_target, attribute_set_a, attribute_set_b, ppmi_df_var1)
bias_same_female = delta_same(female_words_target, attribute_set_a, attribute_set_b, ppmi_df_var1)
# Direct bias calculations
bias_direct = direct_bias("/c/en/doctor", gender_dir, ppmi_df_var1)
# Save per-word direct bias tables to CSV (one file per word list).
female_biases = direct_bias_wordlist(female_words_target, gender_dir, ppmi_df_var1, "female", "data/heuristic/directBias/variation1/female_bias.csv")
male_biases = direct_bias_wordlist(male_words_target, gender_dir, ppmi_df_var1, "male", "data/heuristic/directBias/variation1/male_bias.csv")
neutral_biases = direct_bias_wordlist(neutral_words_target, gender_dir, ppmi_df_var1, "neutral", "data/heuristic/directBias/variation1/neutral_bias.csv")

print("Gender direction (preview):", gender_dir[:5])
print("SAME bias score neutral:", bias_same_neutral)
print("SAME bias score male:", bias_same_male)
print("SAME bias score female:", bias_same_female)
print("Direct bias for '/c/en/doctor':", bias_direct)

[np.float64(0.981021723924711), np.float64(-0.010291063155463053), np.float64(0.06681697976172381), np.float64(0.01093289687859995), np.float64(0.007449293756679736), np.float64(6.000808562054298e-05), np.float64(6.79357521864203e-17), np.float64(-0.0019131348698343388), np.float64(-0.00023730728910182668), np.float64(0.00021899232992708044), np.float64(-0.0010174541751117926), np.float64(-5.3197238876669116e-05), np.float64(-0.005114159098053371), np.float64(-8.453444456491388e-05), np.float64(-9.620302834068294e-05), np.float64(-8.453444456491388e-05), np.float64(2.838380977137806e-16), np.float64(0.00021899232992708044), np.float64(3.506811433848991e-17), np.float64(-0.000437767518037571), np.float64(9.202439226816068e-17), np.float64(-1.9721576597114535e-17), np.float64(-7.193180479337539e-17), np.float64(-0.00010576713408408455), np.float64(-0.002707895550718713), np.float64(0.0008691376972381542), np.float64(-0.0009905705214438532), np.float64(-4.0921938286445e-06), np.float64(-3

In [66]:
# Variation 2: Small dataset + Filter ("/r/Antonym", "/r/NotDesires", "/r/Desires", "/r/ObstructedBy", "/r/MannerOf")
# Prerequisite: the scraper must first be edited to apply this filter.
# NOTE(review): this cell is a near-identical copy of the Variation 1 cell,
# differing only in the "Var1"/"Var2" file names and output folders — a
# shared helper taking the variation name would remove the duplication.
# `df` is only used by the (disabled) inspection lines below; build_ppmi is
# given the same file name rather than this frame.
df = pd.read_csv("data/conceptnet_api/csv/edge_extractVar2.csv")
# print(df.shape)
# print(df['weight'].describe())
# df.head(3)
# Build the embedding table for this variation (project helper; presumably a
# 300-dimensional PPMI embedding given ndim=300 — confirm in utils.ppmi) and
# persist it.
ppmi_df_var2 = build_ppmi(conceptnet_filename="data/conceptnet_api/csv/edge_extractVar2.csv", ndim=300)
save_hdf(ppmi_df_var2, filename='data/conceptnet_api/hdf/testVar2.hdf')

# Pair / attribute definitions are re-assigned here, overwriting the
# notebook-level names set by earlier cells.
gender_pairs = [("/c/en/man", "/c/en/woman")]
attribute_set_a = ["/c/en/he", "/c/en/him", "/c/en/his"]
attribute_set_b = ["/c/en/she", "/c/en/her", "/c/en/hers"]

# --- Run calculations ---
gender_dir = compute_gender_direction(gender_pairs, ppmi_df_var2)
# SAME bias calculations
# One score per target word list (neutral / male / female).
bias_same_neutral = delta_same(neutral_words_target, attribute_set_a, attribute_set_b, ppmi_df_var2)
bias_same_male = delta_same(male_words_target, attribute_set_a, attribute_set_b, ppmi_df_var2)
bias_same_female = delta_same(female_words_target, attribute_set_a, attribute_set_b, ppmi_df_var2)
# Direct bias calculations
bias_direct = direct_bias("/c/en/doctor", gender_dir, ppmi_df_var2)
# Save per-word direct bias tables to CSV (one file per word list).
female_biases = direct_bias_wordlist(female_words_target, gender_dir, ppmi_df_var2, "female", "data/heuristic/directBias/variation2/female_bias.csv")
male_biases = direct_bias_wordlist(male_words_target, gender_dir, ppmi_df_var2, "male", "data/heuristic/directBias/variation2/male_bias.csv")
neutral_biases = direct_bias_wordlist(neutral_words_target, gender_dir, ppmi_df_var2, "neutral", "data/heuristic/directBias/variation2/neutral_bias.csv")

print("Gender direction (preview):", gender_dir[:5])
print("SAME bias score neutral:", bias_same_neutral)
print("SAME bias score male:", bias_same_male)
print("SAME bias score female:", bias_same_female)
print("Direct bias for '/c/en/doctor':", bias_direct)

[np.float64(0.9606691714049619), np.float64(-0.013208339763527028), np.float64(0.05485554977030825), np.float64(0.004811274377143019), np.float64(0.005241815779221594), np.float64(-0.0028474591079417464), np.float64(-0.0027249844564615267), np.float64(0.000700778849169505), np.float64(-2.921350011573963e-16), np.float64(-0.0007234359787054614), np.float64(-3.0270502610879597e-16), np.float64(-0.002148047945848492), np.float64(-0.0014801259021234277), np.float64(5.4780592781057506e-17), np.float64(-0.0014801259021234277), np.float64(3.815885030663132e-17), np.float64(-2.921350011573963e-16), np.float64(5.937152248018997e-17), np.float64(-0.0011890843841112979), np.float64(-6.042787909957983e-17), np.float64(7.0432783395185e-17), np.float64(0.0007130113864362554), np.float64(1.747258001787543e-16), np.float64(0.00030317110216660707), np.float64(-0.0005644662438168622), np.float64(-4.1197083668451945e-17), np.float64(-0.0017299870814776299), np.float64(-1.2877009601695023e-16), np.float64

In [None]:
def check_vocab_coverage(word_list, ppmi_df):
    """Split ``word_list`` by membership in ``ppmi_df.index`` and print counts.

    Returns a ``(present, missing)`` tuple of lists, each preserving the
    input order.
    """
    vocab = ppmi_df.index
    present = list(filter(lambda word: word in vocab, word_list))

    missing = []
    for word in word_list:
        if word not in vocab:
            missing.append(word)

    n_found, n_missing = len(present), len(missing)
    print(f"✔ Found {n_found} words, ❌ Missing {n_missing} words")
    return present, missing

# Report how many target words of each list exist in the Variation 2
# vocabulary; only the printed counts are needed, so the returned lists are
# discarded.
print("Neutral coverage:")
_, _ = check_vocab_coverage(neutral_words_target, ppmi_df_var2)

print("Male coverage:")
_, _ = check_vocab_coverage(male_words_target, ppmi_df_var2)

print("Female coverage:")
_, _ = check_vocab_coverage(female_words_target, ppmi_df_var2)

# Show the vocabulary itself for manual inspection.
print(ppmi_df_var2.index)

## Takeaway: coverage is low — in the recorded run most of the male and female target words are missing from the vocabulary.

Neutral coverage:
✔ Found 49 words, ❌ Missing 25 words
Male coverage:
✔ Found 21 words, ❌ Missing 502 words
Female coverage:
✔ Found 15 words, ❌ Missing 352 words
Index(['/c/en/help_child', '/c/en/adult', '/c/en/man', '/c/en/sign_contract',
       '/c/en/dress_herself', '/c/en/sheep', '/c/en/drink_beer', '/c/en/work',
       '/c/en/men', '/c/en/woman',
       ...
       '/c/en/antiwomen', '/c/en/wife', '/c/en/womenkind', '/c/en/womenfolk',
       '/c/en/women's_liberation', '/c/en/worker', '/c/en/bank_paycheck',
       '/c/en/stock_shelves', '/c/en/robot', '/c/en/drive_to_work'],
      dtype='object', length=824)
