In [1]:
# Imports
import pandas as pd
import spacy
import re
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from transformers import pipeline, set_seed
import random
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.preprocessing import normalize

In [2]:
# Mount Google Drive at /content/drive so that files stored on the Drive can be
# read through the regular filesystem (this import only exists inside Colab)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Parquet file on the mounted Google Drive
file_path = '/content/drive/MyDrive/LLM_thesis/filtered_df.parquet'

# Read it into a DataFrame
filtered_df = pd.read_parquet(path=file_path)

### Sentence selection procedure in sentence_selection.ipynb

Category 8 was added as a control: it contains some male stereotypes, to see how the model represents them (categories: 8, 9, 12, 13, 16).

In [4]:
# Selected sentences, stored without their pronouns.
# Keys are category ids. In categories 1-7 the first entry is the stereotype
# itself and the remaining entries are the sentences selected for it.
# Category 8 is the control and lists male stereotypes.
women_sentences = {
    # 1 -- emotional and irrational
    1: [
        "emotional and irrational",
        "the breakup is hard on her",
        "overreacts",
        "cries every night",
        "spends 200€ at the hairdresser",
        "craves attention",
    ],
    # 2 -- gentle, kind, and submissive
    2: [
        "gentle, kind, and submissive",
        "is kind and caring",
        "has to obey",
        "always plays by the rules",
        "forgives them quickly and easily",
        "doesn’t hurt a fly",
    ],
    # 3 -- empathetic and caring
    3: [
        "empathetic and caring",
        "offers a supportive ear",
        "is very nurturing",
        "likes helping people",
        "hosts a charity fundraiser",
        "is compassionate",
    ],
    # 4 -- neat and diligent
    4: [
        "neat and diligent",
        "cleans the floor every day",
        "wishes for a tidy house",
        "consistently practices personal hygiene habits",
        "folds the laundry",
        "changes the bedsheets again",
    ],
    # 5 -- social
    5: [
        "social",
        "is friendly",
        "wants to make more friends",
        "likes small talk",
        "likes to work in groups",
        "is communicative",
    ],
    # 6 -- weak
    6: [
        "weak",
        "needs help with heavy bags",
        "is scared to be home alone",
        "is not built for hard physical work",
        "can’t even open a bottle",
        "is not strong enough",
    ],
    # 7 -- beautiful
    7: [
        "beautiful",
        "could easily become a model",
        "wins the beauty contest",
        "takes care of appearance",
        "is slim and graceful",
        "looks good at the party",
    ],
    # 8 -- control: male stereotypes
    8: [
        "tough and rough",
        "self-confident",
        "leaders",
        "providers",
        "strong",
    ],
}

### Non-instruction-tuned LLaMA-2 -> repeat the embedding procedure with the non-instruction-tuned version of the model

In [None]:
# Checkpoint id of the non-instruction-tuned LLaMA-2 model
model_name = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# device_map="auto" automatically places the model across GPU/CPU if needed
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Evaluation mode: the model is only used for inference in this notebook section
model.eval()

In [8]:
# Gender direction
def get_gender_direction(layer=28, male_terms=None, female_terms=None):
    """Build a unit-length gender direction from anchor-word embeddings.

    The direction is the mean embedding of the male anchor terms minus the
    mean embedding of the female anchor terms, L2-normalised. Vectors that
    point the same way as the result are therefore on the "male" side and
    vectors that point the opposite way are on the "female" side.

    Parameters
    ----------
    layer : int
        Hidden-state layer passed through to `get_embedding`.
    male_terms, female_terms : sequence of str, optional
        Anchor words for each side. When omitted, the notebook's default
        anchor sets are used.

    Returns
    -------
    numpy.ndarray
        1-D direction vector.

    Raises
    ------
    ValueError
        If either anchor list is empty (the mean would be undefined).
    """
    if male_terms is None:
        male_terms = ["he", "him", "man", "father", "boy"]
    if female_terms is None:
        female_terms = ["she", "her", "woman", "mother", "girl"]
    if len(male_terms) == 0 or len(female_terms) == 0:
        raise ValueError("male_terms and female_terms must each contain at least one word")

    male_vecs = [get_embedding(w, layer) for w in male_terms]
    female_vecs = [get_embedding(w, layer) for w in female_terms]

    # Male mean minus female mean: positive similarity == male side
    gender_vec = np.mean(male_vecs, axis=0) - np.mean(female_vecs, axis=0)

    # normalize() works on 2-D input, so wrap the vector as one row and unwrap it
    return normalize([gender_vec])[0]

# Function to extract the embeddings
def get_embedding(text, layer=28):
    """Return the mean hidden state of the content tokens of `text`.

    The text is tokenized with special tokens added, run through the model
    with `output_hidden_states=True`, and the hidden states at index `layer`
    are averaged over every token that is not a special token.

    Parameters
    ----------
    text : str
        Word or sentence to embed.
    layer : int
        Index into the model's `hidden_states` output.

    Returns
    -------
    numpy.ndarray
        1-D float32 vector (one value per hidden dimension).

    Raises
    ------
    ValueError
        If `text` contains no content tokens, in which case the mean would
        be taken over zero rows and silently produce NaNs.
    """
    encoded = tokenizer(text, return_tensors="pt", add_special_tokens=True)
    token_strs = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0].tolist())

    # Find non-special tokens -> special tokens are removed because they were
    # diluting the gender signal. The set comes from the tokenizer itself so
    # the filter is not tied to one model's '<s>' / '</s>' spelling.
    special_tokens = set(tokenizer.all_special_tokens)
    valid_idxs = [i for i, tok in enumerate(token_strs) if tok not in special_tokens]
    if not valid_idxs:
        raise ValueError(f"No content tokens to embed for text {text!r}")

    # Send the inputs to the device that holds the model's first parameter
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in encoded.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    token_embeddings = outputs.hidden_states[layer][0]  # first (only) sequence

    # Cast to float32 before averaging: NumPy conversion does not support
    # bfloat16, and this is a no-op when the model already runs in float32.
    content_embeddings = token_embeddings[valid_idxs].float()

    return content_embeddings.mean(dim=0).cpu().numpy()

In [9]:
def project_on_gender_axis(embedding, gender_direction):
    """Return the cosine similarity between `embedding` and `gender_direction`.

    Each vector is wrapped as a single-row batch for `cosine_similarity`,
    and the only entry of the resulting 1x1 matrix is returned.
    """
    similarity_matrix = cosine_similarity([embedding], [gender_direction])
    first_row = similarity_matrix[0]
    return first_row[0]

In [10]:
# Testing the gender signal: score a handful of words against the layer-28
# gender direction and print one line per word
gender_direction = get_gender_direction(layer=28)

test_words = [
    "he", "she",
    "him", "her",
    "man", "woman",
    "boy", "girl",
    "father", "mother",
    "it",
]

print("Cosine similarity with gender direction:\n")
for word in test_words:
    # Embed at the same layer the direction was built from
    embedding = get_embedding(word, layer=28)
    score = project_on_gender_axis(embedding, gender_direction)
    print(f"{word:>6}: {score:.4f}")

Cosine similarity with gender direction:

    he: 0.2027
   she: -0.3375
   him: 0.2040
   her: -0.3319
   man: 0.1638
 woman: -0.2689
   boy: 0.0200
  girl: -0.2619
father: -0.0109
mother: -0.3088
    it: 0.0008


In [11]:
# Checking tokenization: show how the tokenizer splits a single word, using the
# same tokenizer arguments as get_embedding
word_to_check = "father"
tokens = tokenizer(word_to_check, return_tensors="pt", add_special_tokens=True)
input_ids = tokens["input_ids"][0]  # ids of the first (only) sequence
token_strs = tokenizer.convert_ids_to_tokens(input_ids)  # ids -> token strings

print(f"Tokenization for '{word_to_check}': {token_strs}")

Tokenization for 'father': ['<s>', '▁father']


- I experimented with multiple layers and observed the same problem every time: the word “father”, despite being included as a male anchor term in the gender direction, projected as gender-neutral or mildly feminine (cosine similarity of -0.0109 at layer 28; “boy”, another male anchor, is also close to zero at 0.0200). This is a critical failure, as “father” should strongly align with male-associated representations. I verified that the tokenization of the word was correct (it is the single token `▁father`, i.e., it was not split into subwords or misrepresented), ruling out preprocessing issues. The fact that “father”, a prototypical male term, receives a low or even negative projection score indicates that the model fails to establish a reliable separation between male and female concepts in embedding space. This undermines the validity of any downstream analysis based on the computed gender direction. For this reason, I excluded the non-instruction-tuned LLaMA-2 model from the final results.

In [None]:
# Layer sweep: for each layer, rebuild the gender direction and report the gap
# between the mean male score and the mean female score on it
male_words = ["he", "him", "man", "boy"]
female_words = ["she", "her", "woman", "girl"]

for layer in range(24, 30):
    gender_direction = get_gender_direction(layer)

    # Embed first, then score each embedding against this layer's direction
    male_embeddings = [get_embedding(w, layer) for w in male_words]
    male_scores = [project_on_gender_axis(vec, gender_direction) for vec in male_embeddings]

    female_embeddings = [get_embedding(w, layer) for w in female_words]
    female_scores = [project_on_gender_axis(vec, gender_direction) for vec in female_embeddings]

    male_mean = np.mean(male_scores)
    female_mean = np.mean(female_scores)
    separation = male_mean - female_mean
    print(f"Layer {layer:2d}: separation score = {separation:.4f}")

Layer 24: separation score = 0.4113
Layer 25: separation score = 0.4075
Layer 26: separation score = 0.4102
Layer 27: separation score = 0.4085
Layer 28: separation score = 0.4112
Layer 29: separation score = 0.4016
