In [1]:
# Imports
import pandas as pd
import spacy
import re
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from transformers import pipeline, set_seed
import random
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.preprocessing import normalize

In [2]:
# Mount Google Drive at /content/drive (Colab only) so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Parquet file on the mounted Drive that holds the `filtered_df` DataFrame.
file_path = '/content/drive/MyDrive/LLM_thesis/filtered_df.parquet'
# NOTE(review): filtered_df is not referenced in the cells visible here — confirm it is still needed.
filtered_df = pd.read_parquet(file_path)

### Sentence selection procedure in sentence_selection.ipynb

Added category 8 that contains some male stereotypes to see how the model represents these (categories: 8,9,12,13,16)

In [21]:
# Selected sentences without the pronouns.
# Maps category id -> list of sentence fragments that are embedded and scored
# against the gender direction. In categories 1-7 the first entry is the
# stereotype itself (see the inline markers); the rest are example sentences.
women_sentences = {
    1: [
        "emotional and irrational",  # stereotype itself
        "the breakup is hard on her",  # NOTE(review): still contains the pronoun "her" — confirm this is intended
        "overreacts",
        "cries every night",
        "spends 200€ at the hairdresser",
        "craves attention"
    ],
    2: [
        "gentle, kind, and submissive",  # stereotype itself
        "is kind and caring",
        "has to obey",
        "always plays by the rules",
        "forgives them quickly and easily",
        "doesn’t hurt a fly"
    ],
    3: [
        "empathetic and caring",  # stereotype itself
        "offers a supportive ear",
        "is very nurturing",
        "likes helping people",
        "hosts a charity fundraiser",
        "is compassionate"
    ],
    4: [
        "neat and diligent",  # stereotype itself
        "cleans the floor every day",
        "wishes for a tidy house",
        "consistently practices personal hygiene habits",
        "folds the laundry",
        "changes the bedsheets again"
    ],
    5: [
        "social",  # stereotype itself
        "is friendly",
        "wants to make more friends",
        "likes small talk",
        "likes to work in groups",
        "is communicative"
    ],
    6: [
        "weak",  # stereotype itself
        "needs help with heavy bags",
        "is scared to be home alone",
        "is not built for hard physical work",
        "can’t even open a bottle",
        "is not strong enough"
    ],
    7: [
        "beautiful",  # stereotype itself
        "could easily become a model",
        "wins the beauty contest",
        "takes care of appearance",
        "is slim and graceful",
        "looks good at the party"
    ],
    8: [ # the control has male stereotypes
        # NOTE(review): five entries here versus six in categories 1-7, and no
        # "stereotype itself" marker — confirm this asymmetry is intended.
        "tough and rough",
        "self-confident",
        "leaders",
        "providers",
        "strong",
    ]
}

### Non-instruction-tuned Llama 2: repeating the embedding procedure with the base (non-instruction-tuned) model

In [None]:
# Load the tokenizer and causal-LM weights for the checkpoint named below.
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"  # automatically places model across GPU/CPU if needed
)
# Put the model in evaluation mode; it is only used for inference in this notebook.
model.eval()

In [28]:
# Gender direction
def get_gender_direction(
    layer=28,
    male_terms=("he", "him", "man", "father", "boy"),
    female_terms=("she", "her", "woman", "mother", "girl"),
):
    """Build an L2-normalised gender direction from two contrasting word lists.

    The direction is the mean embedding of ``male_terms`` minus the mean
    embedding of ``female_terms``, so it points from the female terms towards
    the male terms.

    Args:
        layer: hidden-state index forwarded to ``get_embedding``.
        male_terms: words whose embeddings form the positive end of the axis.
        female_terms: words whose embeddings form the negative end of the axis.

    Returns:
        1-D NumPy array holding the L2-normalised difference vector.

    Raises:
        ValueError: if either term list is empty.
    """
    if len(male_terms) == 0 or len(female_terms) == 0:
        # np.mean over an empty list would give NaN and fail later, less clearly.
        raise ValueError("male_terms and female_terms must both be non-empty")
    male_vecs = [get_embedding(w, layer) for w in male_terms]
    female_vecs = [get_embedding(w, layer) for w in female_terms]
    gender_vec = np.mean(male_vecs, axis=0) - np.mean(female_vecs, axis=0)
    # normalize() expects a 2-D input, hence the wrap/unwrap around the vector.
    return normalize([gender_vec])[0]

In [29]:
# Function to extract the embeddings
def get_embedding(text, layer=28):
    """Return the mean hidden-state vector of ``text`` at one model layer.

    The text is tokenized with special tokens, passed through the global
    ``model``, and the hidden states at index ``layer`` are averaged over all
    tokens except ``<s>`` and ``</s>``.

    Args:
        text: string to embed.
        layer: index into ``outputs.hidden_states``.

    Returns:
        1-D NumPy array: the mean of the content-token hidden states.

    Raises:
        ValueError: if ``text`` produces no tokens other than ``<s>``/``</s>``.
    """
    tokens = tokenizer(text, return_tensors="pt", add_special_tokens=True)
    token_strs = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])

    # Keep only non-special tokens: the special tokens were diluting the gender signal.
    valid_idxs = [i for i, tok in enumerate(token_strs) if tok not in ('<s>', '</s>')]
    if not valid_idxs:
        # Averaging zero rows would silently return an all-NaN vector, so fail
        # early (and before paying for a forward pass).
        raise ValueError(f"No content tokens to embed for text {text!r}")

    # Move the inputs to wherever the model's first parameter lives.
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in tokens.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # [0] selects the only sequence in the batch.
    token_embeddings = outputs.hidden_states[layer][0]

    return token_embeddings[valid_idxs].mean(dim=0).cpu().numpy()

In [30]:
def project_on_gender_axis(embedding, gender_direction):
    """Score an embedding against the gender direction.

    Despite the name, the returned value is the cosine similarity between the
    two vectors, not a projection length.

    Args:
        embedding: 1-D vector to score.
        gender_direction: 1-D direction vector to compare against.

    Returns:
        The cosine similarity as a scalar.
    """
    # cosine_similarity works on 2-D inputs; two single rows give a 1x1 matrix.
    similarity_matrix = cosine_similarity([embedding], [gender_direction])
    return similarity_matrix[0, 0]

In [31]:
# Testing the gender signal for the same set of terms as LlaMA-2-chat
# Build the direction at layer 28, then score each probe word against it.
gender_direction = get_gender_direction(layer=28)

# Every probe word except "it" is also one of the terms the direction is built
# from, so this is a sanity check rather than a held-out evaluation.
test_words = ["he", "she", "him", "her", "man", "woman", "boy", "girl", "it"]

print("Cosine similarity with gender direction:\n")
for word in test_words:
    embedding = get_embedding(word, layer=28)
    score = project_on_gender_axis(embedding, gender_direction)
    # Right-align the word in a 6-character column; show the score to 4 decimals.
    print(f"{word:>6}: {score:.4f}")

Cosine similarity with gender direction:

    he: 0.2027
   she: -0.3375
   him: 0.2040
   her: -0.3319
   man: 0.1638
 woman: -0.2689
   boy: 0.0200
  girl: -0.2619
    it: 0.0008


- The score for "boy" (0.0200) is much lower than for the other male terms, so I will try to find the best layer, or change the set of terms used for the gender direction.

In [32]:
# Words used to measure how well each layer's direction separates the two groups.
male_words = ["he", "him", "man", "boy"]
female_words = ["she", "her", "woman", "girl"]

# Sweep hidden-state indices 24 to 32 inclusive.
for layer in range(24, 33):
    gender_direction = get_gender_direction(layer)

    male_scores = [project_on_gender_axis(get_embedding(w, layer), gender_direction) for w in male_words]
    female_scores = [project_on_gender_axis(get_embedding(w, layer), gender_direction) for w in female_words]

    # Separation = mean male score minus mean female score.
    separation = np.mean(male_scores) - np.mean(female_scores)
    print(f"Layer {layer:2d}: separation score = {separation:.4f}")
# NOTE: after the loop, gender_direction holds the layer-32 direction until it is reassigned.

Layer 24: separation score = 0.4469
Layer 25: separation score = 0.4434
Layer 26: separation score = 0.4460
Layer 27: separation score = 0.4451
Layer 28: separation score = 0.4476
Layer 29: separation score = 0.4376
Layer 30: separation score = 0.4347
Layer 31: separation score = 0.4298
Layer 32: separation score = 0.3645


- Layer 28 gives the largest separation score (0.4476), although layers 24–28 are all within about 0.005 of each other.

- Next, I simplify the gender direction by removing "father" and "mother" to see whether the gender signal improves. Untuned (non-instruction-tuned) models lack grounding in social roles unless those concepts are statistically salient in pretraining, so I test the gender signal without these role terms.

In [33]:
# Gender direction
def get_gender_direction(
    layer=28,
    male_terms=("he", "him", "man", "boy"),
    female_terms=("she", "her", "woman", "girl"),
):
    """Build an L2-normalised gender direction (variant without "father"/"mother").

    The direction is the mean embedding of ``male_terms`` minus the mean
    embedding of ``female_terms``.

    Args:
        layer: hidden-state index forwarded to ``get_embedding``.
        male_terms: words forming the positive end of the axis.
        female_terms: words forming the negative end of the axis.

    Returns:
        1-D NumPy array holding the L2-normalised difference vector.

    Raises:
        ValueError: if either term list is empty.
    """
    if len(male_terms) == 0 or len(female_terms) == 0:
        # An empty list would make np.mean return NaN and fail later, less clearly.
        raise ValueError("male_terms and female_terms must both be non-empty")
    male_vecs = [get_embedding(term, layer) for term in male_terms]
    female_vecs = [get_embedding(term, layer) for term in female_terms]
    gender_vec = np.mean(male_vecs, axis=0) - np.mean(female_vecs, axis=0)
    # normalize() expects a 2-D input, hence the wrap/unwrap around the vector.
    return normalize([gender_vec])[0]

In [34]:
# Testing the gender signal
# Rebuild the direction with the get_gender_direction currently defined in the
# kernel, then score the same probe words as before.
gender_direction = get_gender_direction(layer=28)

test_words = ["he", "she", "him", "her", "man", "woman", "boy", "girl", "it"]

print("Cosine similarity with gender direction:\n")
for word in test_words:
    embedding = get_embedding(word, layer=28)
    score = project_on_gender_axis(embedding, gender_direction)
    # Word right-aligned to 6 characters, score shown to 4 decimals.
    print(f"{word:>6}: {score:.4f}")

Cosine similarity with gender direction:

    he: 0.1966
   she: -0.3679
   him: 0.1906
   her: -0.3712
   man: 0.1598
 woman: -0.3061
   boy: -0.0005
  girl: -0.2943
    it: -0.0106


The male scores only got worse: "boy" now scores -0.0005, i.e. it is treated as effectively neutral. Maybe adding more terms will improve the gender signal instead.


In [35]:
# Gender direction
def get_gender_direction(
    layer=28,
    male_terms=("he", "him", "man", "father", "boy", "male"),
    female_terms=("she", "her", "woman", "mother", "girl", "female"),
):
    """Build an L2-normalised gender direction (variant adding "male"/"female").

    The direction is the mean embedding of ``male_terms`` minus the mean
    embedding of ``female_terms``.

    Args:
        layer: hidden-state index forwarded to ``get_embedding``.
        male_terms: words forming the positive end of the axis.
        female_terms: words forming the negative end of the axis.

    Returns:
        1-D NumPy array holding the L2-normalised difference vector.

    Raises:
        ValueError: if either term list is empty.
    """
    if len(male_terms) == 0 or len(female_terms) == 0:
        # An empty list would make np.mean return NaN and fail later, less clearly.
        raise ValueError("male_terms and female_terms must both be non-empty")
    male_vecs = [get_embedding(term, layer) for term in male_terms]
    female_vecs = [get_embedding(term, layer) for term in female_terms]
    gender_vec = np.mean(male_vecs, axis=0) - np.mean(female_vecs, axis=0)
    # normalize() expects a 2-D input, hence the wrap/unwrap around the vector.
    return normalize([gender_vec])[0]

In [36]:
# Testing the gender signal
# Rebuild the direction with the get_gender_direction currently defined in the
# kernel, then score the same probe words as before.
gender_direction = get_gender_direction(layer=28)

test_words = ["he", "she", "him", "her", "man", "woman", "boy", "girl", "it"]

print("Cosine similarity with gender direction:\n")
for word in test_words:
    embedding = get_embedding(word, layer=28)
    score = project_on_gender_axis(embedding, gender_direction)
    # Word right-aligned to 6 characters, score shown to 4 decimals.
    print(f"{word:>6}: {score:.4f}")

Cosine similarity with gender direction:

    he: 0.1912
   she: -0.3274
   him: 0.1941
   her: -0.3155
   man: 0.1683
 woman: -0.2806
   boy: 0.0242
  girl: -0.2688
    it: -0.0033


- "boy" improved to 0.0242 and the gender direction seems more stable, so adding further terms to the gender direction may help.
- Brother and sister are (a) unambiguously gendered, (b) common in untuned corpora (news, Wikipedia), and (c) less semantically loaded than “husband/wife” or “gentleman/lady,” so they usually reinforce the core gender axis without drifting toward age, formality, or marital‑status topics.

In [37]:
# Gender direction
def get_gender_direction(
    layer=28,
    male_terms=("he", "him", "man", "father", "boy", "male", "brother"),
    female_terms=("she", "her", "woman", "mother", "girl", "female", "sister"),
):
    """Build an L2-normalised gender direction (variant adding "brother"/"sister").

    The direction is the mean embedding of ``male_terms`` minus the mean
    embedding of ``female_terms``.

    Args:
        layer: hidden-state index forwarded to ``get_embedding``.
        male_terms: words forming the positive end of the axis.
        female_terms: words forming the negative end of the axis.

    Returns:
        1-D NumPy array holding the L2-normalised difference vector.

    Raises:
        ValueError: if either term list is empty.
    """
    if len(male_terms) == 0 or len(female_terms) == 0:
        # An empty list would make np.mean return NaN and fail later, less clearly.
        raise ValueError("male_terms and female_terms must both be non-empty")
    male_vecs = [get_embedding(term, layer) for term in male_terms]
    female_vecs = [get_embedding(term, layer) for term in female_terms]
    gender_vec = np.mean(male_vecs, axis=0) - np.mean(female_vecs, axis=0)
    # normalize() expects a 2-D input, hence the wrap/unwrap around the vector.
    return normalize([gender_vec])[0]

In [38]:
# Testing the gender signal
# Rebuild the direction with the get_gender_direction currently defined in the
# kernel, then score the same probe words as before.
gender_direction = get_gender_direction(layer=28)

test_words = ["he", "she", "him", "her", "man", "woman", "boy", "girl", "it"]

print("Cosine similarity with gender direction:\n")
for word in test_words:
    embedding = get_embedding(word, layer=28)
    score = project_on_gender_axis(embedding, gender_direction)
    # Word right-aligned to 6 characters, score shown to 4 decimals.
    print(f"{word:>6}: {score:.4f}")

Cosine similarity with gender direction:

    he: 0.1765
   she: -0.3161
   him: 0.1752
   her: -0.2905
   man: 0.1609
 woman: -0.2655
   boy: 0.0326
  girl: -0.2659
    it: -0.0147


- The score for "boy" improved again (0.0326). Even in the instruction-tuned version this score is relatively low (about 0.1), so my aim here is to get "boy" to about 0.05.

In [39]:
# Gender direction
def get_gender_direction(
    layer=28,
    male_terms=("he", "him", "man", "father", "boy", "male", "brother", "husband"),
    female_terms=("she", "her", "woman", "mother", "girl", "female", "sister", "wife"),
):
    """Build an L2-normalised gender direction (variant adding "husband"/"wife").

    The direction is the mean embedding of ``male_terms`` minus the mean
    embedding of ``female_terms``, so it points from the female terms towards
    the male terms.

    Args:
        layer: hidden-state index forwarded to ``get_embedding``.
        male_terms: words forming the positive end of the axis.
        female_terms: words forming the negative end of the axis.

    Returns:
        1-D NumPy array holding the L2-normalised difference vector.

    Raises:
        ValueError: if either term list is empty.
    """
    if len(male_terms) == 0 or len(female_terms) == 0:
        # An empty list would make np.mean return NaN and fail later, less clearly.
        raise ValueError("male_terms and female_terms must both be non-empty")
    male_vecs = [get_embedding(term, layer) for term in male_terms]
    female_vecs = [get_embedding(term, layer) for term in female_terms]
    gender_vec = np.mean(male_vecs, axis=0) - np.mean(female_vecs, axis=0)
    # normalize() expects a 2-D input, hence the wrap/unwrap around the vector.
    return normalize([gender_vec])[0]

In [40]:
# Testing the gender signal
# Rebuild the direction with the get_gender_direction currently defined in the
# kernel, then score the same probe words as before.
gender_direction = get_gender_direction(layer=28)

test_words = ["he", "she", "him", "her", "man", "woman", "boy", "girl", "it"]

print("Cosine similarity with gender direction:\n")
for word in test_words:
    embedding = get_embedding(word, layer=28)
    score = project_on_gender_axis(embedding, gender_direction)
    # Word right-aligned to 6 characters, score shown to 4 decimals.
    print(f"{word:>6}: {score:.4f}")

Cosine similarity with gender direction:

    he: 0.1783
   she: -0.2983
   him: 0.1847
   her: -0.2550
   man: 0.1662
 woman: -0.2579
   boy: 0.0457
  girl: -0.2488
    it: -0.0093


- I will stop here: "boy" increased to 0.0457 (close to the 0.05 target), and the score for the neutral word "it" moved closer to 0 (-0.0093, i.e. more neutral), which indicates a better gender signal.

In [46]:
def compute_sentence_scores(sentences, gender_direction, layer=28):
    """Score each sentence against the gender direction.

    Args:
        sentences: iterable of strings to embed.
        gender_direction: direction vector passed to ``project_on_gender_axis``.
        layer: hidden-state index passed to ``get_embedding``.

    Returns:
        List of ``(sentence, score)`` tuples in input order, with each score
        rounded to 4 decimal places.
    """
    def _rounded_score(sentence):
        # Embed first, then compare with the direction and round for display.
        vector = get_embedding(sentence, layer=layer)
        return round(project_on_gender_axis(vector, gender_direction), 4)

    return [(sentence, _rounded_score(sentence)) for sentence in sentences]

In [49]:
# Scale factor applied to every non-zero reference point used by label_gender.
r = 0.47


def label_gender(score):
    """Map a gender-axis score to the nearest of five association labels.

    Each label has a reference point (a multiple of the module-level ``r``,
    read at call time); the label whose reference point lies closest to
    ``score`` is returned. On an exact tie the label listed first wins.

    Args:
        score: numeric score to classify.

    Returns:
        The label string of the closest reference point.
    """
    # (label, reference point) pairs, listed from most female to most male.
    anchors = [
        ("strong female-association", -0.35 * r),
        ("mild female-association", -0.2 * r),
        ("neutral", 0.0),
        ("mild male-association", 0.2 * r),
        ("strong male-association", 0.35 * r),
    ]

    nearest_label = None
    nearest_gap = None
    for name, anchor in anchors:
        gap = abs(score - anchor)
        # Strict "<" keeps the earliest label when two gaps are equal.
        if nearest_gap is None or gap < nearest_gap:
            nearest_label = name
            nearest_gap = gap
    return nearest_label

# Generates gender direction
# (called without arguments, so it uses the function's default layer of 28,
# the same default that compute_sentence_scores uses below)
gender_direction = get_gender_direction()

# Collects results in a list of dictionaries
embedding_data = []

# One record per sentence, tagged with the category it came from.
for cat_id, sentence_list in women_sentences.items():
    results = compute_sentence_scores(sentence_list, gender_direction)
    for text, score in results:
        # Label is derived from the already-rounded score.
        label = label_gender(score)
        embedding_data.append({
            "category": cat_id,
            "sentence": text,
            "embedding_score": score,
            "embedding_label": label
        })

# Converting to DataFrame
embedding_df = pd.DataFrame(embedding_data)

In [50]:
# Display the per-sentence scores and labels.
embedding_df

Unnamed: 0,category,sentence,embedding_score,embedding_label
0,1,emotional and irrational,-0.0792,mild female-association
1,1,the breakup is hard on her,-0.0814,mild female-association
2,1,overreacts,-0.0567,mild female-association
3,1,cries every night,-0.0626,mild female-association
4,1,spends 200€ at the hairdresser,-0.0567,mild female-association
5,1,craves attention,-0.0419,neutral
6,2,"gentle, kind, and submissive",-0.052,mild female-association
7,2,is kind and caring,-0.0358,neutral
8,2,has to obey,-0.0422,neutral
9,2,always plays by the rules,-0.0627,mild female-association


Mostly mild female associations with a couple of neutral associations!