In [2]:
# Imports
import pandas as pd
import spacy
import re
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from transformers import pipeline, set_seed
import random
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.preprocessing import normalize

In [3]:
# Colab-only: mount Google Drive at /content/drive so the files read below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Read the parquet file stored on the mounted Drive into a DataFrame.
# NOTE(review): `filtered_df` is not referenced by any later cell visible here — confirm it is still needed.
file_path = '/content/drive/MyDrive/LLM_thesis/filtered_df.parquet'
filtered_df = pd.read_parquet(file_path)

### Sentence selection procedure in sentence_selection.ipynb

### Testing if my methodology for BERT can work for LLaMA2 as well!

### Adapting embedding analysis for Llama2
- Directional Embedding Probing can work for Llama2 but needs to be adapted.
- Since LLaMA 2 is a causal decoder-only model, and doesn't use [CLS] or [SEP] tokens, some aspects need adjustment.


### First the instruction tuned version (then will try the non-instruction tuned one)

In [None]:
# Load the tokenizer and causal-LM weights for the chat checkpoint named below.
# The globals `tokenizer` and `model` are what the embedding helpers in later cells read.
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"  # automatically places model across GPU/CPU if needed
)
# Put the model in evaluation mode; as the cell's last expression this also displays the model.
model.eval()

In [29]:
# Gender direction
def get_gender_direction(layer=32):
    """Build a unit-length "male minus female" direction in embedding space.

    Each seed word is embedded with ``get_embedding`` at ``layer``; the
    female centroid is subtracted from the male centroid and the difference
    is L2-normalized.

    Args:
        layer: Hidden-state index forwarded to ``get_embedding``.

    Returns:
        1-D array: the normalized difference of the two centroids.
    """
    seed_words = {
        "male": ["he", "him", "man", "father", "boy"],
        "female": ["she", "her", "woman", "mother", "girl"],
    }
    # One centroid (element-wise mean of the word vectors) per gender group.
    centroids = {
        group: np.mean([get_embedding(word, layer) for word in words], axis=0)
        for group, words in seed_words.items()
    }
    difference = centroids["male"] - centroids["female"]
    return normalize([difference])[0]

In [None]:
# Function to extract the embeddings
def get_embedding(text, layer=32):
    """Return the mean hidden-state vector of ``text`` at one layer.

    The text is tokenized (with special tokens), run through the global
    ``model`` with ``output_hidden_states=True``, and the hidden states of
    every token except ``<s>`` / ``</s>`` are averaged.

    Args:
        text: String to embed.
        layer: Index into ``outputs.hidden_states``.

    Returns:
        1-D float32 NumPy array, the mean over content tokens. It is not
        normalized; the cosine similarity used downstream handles scale.

    Raises:
        ValueError: If ``text`` yields no content tokens (e.g. an empty
            string). Previously this produced an all-NaN vector.
    """
    tokens = tokenizer(text, return_tensors="pt", add_special_tokens=True)
    token_strs = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])

    # Special tokens are excluded because they were diluting the gender signal.
    valid_idxs = [i for i, tok in enumerate(token_strs) if tok not in ('<s>', '</s>')]
    if not valid_idxs:
        # Fail before the forward pass: averaging zero rows would give NaNs.
        raise ValueError(f"No content tokens found in text {text!r}; cannot compute an embedding.")

    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in tokens.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # [0] selects the single sequence in the batch.
    token_embeddings = outputs.hidden_states[layer][0]
    content_embeddings = token_embeddings[valid_idxs]

    # .float() is a no-op for float32 weights and keeps the NumPy conversion
    # working if the model is loaded in half/bfloat16 precision.
    return content_embeddings.float().mean(dim=0).cpu().numpy()

In [None]:
def project_on_gender_axis(embedding, gender_direction):
    """Return the cosine similarity between ``embedding`` and ``gender_direction``.

    Both arguments are single vectors; each is wrapped in a list so that
    ``cosine_similarity`` receives two one-row matrices.
    """
    similarity_matrix = cosine_similarity([embedding], [gender_direction])
    # The result is 1x1; return its only entry.
    return similarity_matrix[0][0]

Added category 8 that contains some male stereotypes to see how the model represents these (categories: 8,9,12,13,16)

In [5]:
# Selected sentences without the pronouns
# Maps a stereotype category id to its list of probe phrases.
# For categories 1-7 the first entry is the stereotype phrase itself, followed by
# five sentences written without a pronoun.
women_sentences = {
    1: [
        "emotional and irrational",  # stereotype itself
        "the breakup is hard on her",
        "overreacts",
        "cries every night",
        "spends 200€ at the hairdresser",
        "craves attention"
    ],
    2: [
        "gentle, kind, and submissive",  # stereotype itself
        "is kind and caring",
        "has to obey",
        "always plays by the rules",
        "forgives them quickly and easily",
        "doesn’t hurt a fly"
    ],
    3: [
        "empathetic and caring",  # stereotype itself
        "offers a supportive ear",
        "is very nurturing",
        "likes helping people",
        "hosts a charity fundraiser",
        "is compassionate"
    ],
    4: [
        "neat and diligent",  # stereotype itself
        "cleans the floor every day",
        "wishes for a tidy house",
        "consistently practices personal hygiene habits",
        "folds the laundry",
        "changes the bedsheets again"
    ],
    5: [
        "social",  # stereotype itself
        "is friendly",
        "wants to make more friends",
        "likes small talk",
        "likes to work in groups",
        "is communicative"
    ],
    6: [
        "weak",  # stereotype itself
        "needs help with heavy bags",
        "is scared to be home alone",
        "is not built for hard physical work",
        "can’t even open a bottle",
        "is not strong enough"
    ],
    7: [
        "beautiful",  # stereotype itself
        "could easily become a model",
        "wins the beauty contest",
        "takes care of appearance",
        "is slim and graceful",
        "looks good at the party"
    ],
    # NOTE(review): unlike categories 1-7, this list has five entries and none is marked
    # "stereotype itself" — any code that drops the first entry per category removes a
    # real item here. Confirm that is intended.
    8: [ # the control has male stereotypes
        "tough and rough",
        "self-confident",
        "leaders",
        "providers",
        "strong"
    ]
}

In [None]:
# Testing the gender signal
# Sanity check: gendered words should score on opposite sides of the axis.
gender_direction = get_gender_direction(layer=32)

test_words = ["he", "she", "him", "her", "man", "woman", "boy", "girl", "they"]

print("Cosine similarity with gender direction:\n")
for probe in test_words:
    similarity = project_on_gender_axis(get_embedding(probe, layer=32), gender_direction)
    print(f"{probe:>6}: {similarity:.4f}")

Cosine similarity with gender direction:

    he: 0.2539
   she: -0.0991
   him: 0.3246
   her: -0.1405
   man: 0.3439
 woman: -0.2615
   boy: 0.1096
  girl: -0.2423
  they: 0.0008


In [None]:
# For each candidate layer, compare the mean projection of male vs. female words
# onto that layer's gender direction; a larger gap means a cleaner separation.
male_words = ["he", "him", "man", "boy"]
female_words = ["she", "her", "woman", "girl"]

for layer in range(24, 33):
    gender_direction = get_gender_direction(layer)

    # Mean projection per word group, male first and female second.
    group_means = [
        np.mean([project_on_gender_axis(get_embedding(w, layer), gender_direction) for w in group])
        for group in (male_words, female_words)
    ]

    separation = group_means[0] - group_means[1]
    print(f"Layer {layer:2d}: separation score = {separation:.4f}")

Layer 24: separation score = 0.4386
Layer 25: separation score = 0.4357
Layer 26: separation score = 0.4385
Layer 27: separation score = 0.4368
Layer 28: separation score = 0.4401
Layer 29: separation score = 0.4304
Layer 30: separation score = 0.4316
Layer 31: separation score = 0.4288
Layer 32: separation score = 0.4439


- Layer 32 gave the best separation (0.4439).

In [None]:
def compute_sentence_scores(sentences, gender_direction, layer=32):
    """Score each sentence by its cosine similarity with the gender direction.

    Args:
        sentences: Iterable of strings to embed.
        gender_direction: Direction vector passed to ``project_on_gender_axis``.
        layer: Hidden-state index forwarded to ``get_embedding``.

    Returns:
        List of ``(sentence, score)`` tuples, scores rounded to 4 decimals,
        in input order.
    """
    def _rounded_score(sentence):
        vector = get_embedding(sentence, layer=layer)
        return round(project_on_gender_axis(vector, gender_direction), 4)

    return [(sentence, _rounded_score(sentence)) for sentence in sentences]

### Same reference points method as BERT!

- The range is the layer-32 separation score (0.4439), which is the value the code below uses.

In [None]:
# Scale for the label thresholds: the layer-32 separation score printed above.
# Deliberately NOT named `range`: rebinding the built-in makes every later
# `range(...)` call in the notebook fail with "'float' object is not callable".
SCORE_RANGE = 0.4439


def label_gender(score, score_range=None):
    """Map a gender-axis score to the nearest association label.

    Five reference points sit at -0.35, -0.2, 0, +0.2 and +0.35 times the
    score range; the label whose reference point is closest to ``score``
    is returned.

    Args:
        score: Cosine similarity with the gender direction.
        score_range: Optional override for the scale; defaults to the
            module-level ``SCORE_RANGE``.

    Returns:
        One of "strong female-association", "mild female-association",
        "neutral", "mild male-association", "strong male-association".
    """
    if score_range is None:
        score_range = SCORE_RANGE

    # Defines reference points
    reference_points = {
        "strong female-association": -0.35 * score_range,
        "mild female-association": -0.2 * score_range,
        "neutral": 0.0,
        "mild male-association": 0.2 * score_range,
        "strong male-association": 0.35 * score_range,
    }
    # Finds the label whose reference point is closest to the score
    return min(reference_points, key=lambda label: abs(score - reference_points[label]))

# Generates gender direction
# Gender direction at the helper's default layer.
gender_direction = get_gender_direction()

# One record per (category, sentence): the rounded score and its nearest label.
embedding_data = [
    {
        "category": cat_id,
        "sentence": text,
        "embedding_score": score,
        "embedding_label": label_gender(score),
    }
    for cat_id, sentence_list in women_sentences.items()
    for text, score in compute_sentence_scores(sentence_list, gender_direction)
]

# Build the DataFrame once from the collected records.
embedding_df = pd.DataFrame(embedding_data)

In [None]:
# Display the per-sentence scores and labels built in the previous cell.
embedding_df

Unnamed: 0,category,sentence,embedding_score,embedding_label
0,1,emotional and irrational,-0.1437,strong female-association
1,1,the breakup is hard on her,-0.1359,strong female-association
2,1,overreacts,0.0719,mild male-association
3,1,cries every night,-0.0602,mild female-association
4,1,spends 200€ at the hairdresser,-0.1293,strong female-association
5,1,craves attention,-0.0351,neutral
6,2,"gentle, kind, and submissive",-0.2857,strong female-association
7,2,is kind and caring,-0.2011,strong female-association
8,2,has to obey,0.0822,mild male-association
9,2,always plays by the rules,-0.1083,mild female-association


In [None]:
# Filters out the first sentence of each category
filtered_embedding_df = embedding_df.groupby('category').apply(lambda x: x.iloc[1:]).reset_index(drop=True)

# Counts occurrences of each label per category
label_counts = filtered_embedding_df.groupby(['category', 'embedding_label']).size().unstack(fill_value=0)
label_counts

  filtered_embedding_df = embedding_df.groupby('category').apply(lambda x: x.iloc[1:]).reset_index(drop=True)


embedding_label,mild female-association,mild male-association,neutral,strong female-association,strong male-association
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,1,1,2,0
2,1,1,0,3,0
3,1,0,2,2,0
4,1,0,1,3,0
5,2,1,0,1,1
6,4,0,0,1,0
7,5,0,0,0,0
8,1,1,2,0,0


### Non-instruction-tuned LLaMA 2 -> repeat the embedding procedure with the base (non-instruction-tuned) model

In [6]:
# Load the tokenizer and causal-LM weights for the base checkpoint named below.
# This rebinds the globals `tokenizer` and `model`, so every helper called after this
# cell runs against this checkpoint instead of the one loaded earlier.
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"  # automatically places model across GPU/CPU if needed
)
# Put the model in evaluation mode; as the cell's last expression this also displays the model.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_e

In [13]:
# Gender direction
def get_gender_direction(layer=28):
    """Build a unit-length "male minus female" direction in embedding space.

    Each seed word is embedded with ``get_embedding`` at ``layer``; the
    female centroid is subtracted from the male centroid and the difference
    is L2-normalized.

    Args:
        layer: Hidden-state index forwarded to ``get_embedding``.

    Returns:
        1-D array: the normalized difference of the two centroids.
    """
    seed_words = {
        "male": ["he", "him", "man", "father", "boy"],
        "female": ["she", "her", "woman", "mother", "girl"],
    }
    # One centroid (element-wise mean of the word vectors) per gender group.
    centroids = {
        group: np.mean([get_embedding(word, layer) for word in words], axis=0)
        for group, words in seed_words.items()
    }
    difference = centroids["male"] - centroids["female"]
    return normalize([difference])[0]

# Function to extract the embeddings
def get_embedding(text, layer=28):
    """Return the mean hidden-state vector of ``text`` at one layer.

    The text is tokenized (with special tokens), run through the global
    ``model`` with ``output_hidden_states=True``, and the hidden states of
    every token except ``<s>`` / ``</s>`` are averaged.

    Args:
        text: String to embed.
        layer: Index into ``outputs.hidden_states``.

    Returns:
        1-D float32 NumPy array, the mean over content tokens. It is not
        normalized.

    Raises:
        ValueError: If ``text`` yields no content tokens (e.g. an empty
            string). Previously this produced an all-NaN vector.
    """
    tokens = tokenizer(text, return_tensors="pt", add_special_tokens=True)
    token_strs = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])

    # Special tokens are excluded because they were diluting the gender signal.
    valid_idxs = [i for i, tok in enumerate(token_strs) if tok not in ('<s>', '</s>')]
    if not valid_idxs:
        # Fail before the forward pass: averaging zero rows would give NaNs.
        raise ValueError(f"No content tokens found in text {text!r}; cannot compute an embedding.")

    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in tokens.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # [0] selects the single sequence in the batch.
    token_embeddings = outputs.hidden_states[layer][0]
    content_embeddings = token_embeddings[valid_idxs]

    # .float() is a no-op for float32 weights and keeps the NumPy conversion
    # working if the model is loaded in half/bfloat16 precision.
    return content_embeddings.float().mean(dim=0).cpu().numpy()

In [14]:
def project_on_gender_axis(embedding, gender_direction):
    """Return the cosine similarity between ``embedding`` and ``gender_direction``.

    Both arguments are single vectors; each is wrapped in a list so that
    ``cosine_similarity`` receives two one-row matrices.
    """
    similarity_matrix = cosine_similarity([embedding], [gender_direction])
    # The result is 1x1; return its only entry.
    return similarity_matrix[0][0]

In [17]:
# Testing the gender signal
# Sanity check: gendered words should score on opposite sides of the axis.
gender_direction = get_gender_direction(layer=28)

test_words = ["he", "she", "him", "her", "man", "woman", "boy", "girl", "it"]

print("Cosine similarity with gender direction:\n")
for probe in test_words:
    similarity = project_on_gender_axis(get_embedding(probe, layer=28), gender_direction)
    print(f"{probe:>6}: {similarity:.4f}")

Cosine similarity with gender direction:

    he: 0.2027
   she: -0.3375
   him: 0.2040
   her: -0.3319
   man: 0.1638
 woman: -0.2689
   boy: 0.0200
  girl: -0.2619
    it: 0.0008


In [18]:
# For each candidate layer, compare the mean projection of male vs. female words
# onto that layer's gender direction; a larger gap means a cleaner separation.
male_words = ["he", "him", "man", "boy"]
female_words = ["she", "her", "woman", "girl"]

for layer in range(26, 30):
    gender_direction = get_gender_direction(layer)

    # Mean projection per word group, male first and female second.
    group_means = [
        np.mean([project_on_gender_axis(get_embedding(w, layer), gender_direction) for w in group])
        for group in (male_words, female_words)
    ]

    separation = group_means[0] - group_means[1]
    print(f"Layer {layer:2d}: separation score = {separation:.4f}")

Layer 26: separation score = 0.4460
Layer 27: separation score = 0.4451
Layer 28: separation score = 0.4476
Layer 29: separation score = 0.4376


- Layer 28 offers the best gender separation in embedding space

In [19]:
def compute_sentence_scores(sentences, gender_direction, layer=28):
    """Score each sentence by its cosine similarity with the gender direction.

    Args:
        sentences: Iterable of strings to embed.
        gender_direction: Direction vector passed to ``project_on_gender_axis``.
        layer: Hidden-state index forwarded to ``get_embedding``.

    Returns:
        List of ``(sentence, score)`` tuples, scores rounded to 4 decimals,
        in input order.
    """
    def _rounded_score(sentence):
        vector = get_embedding(sentence, layer=layer)
        return round(project_on_gender_axis(vector, gender_direction), 4)

    return [(sentence, _rounded_score(sentence)) for sentence in sentences]

- The range is the separation score!

In [26]:
# Scale for the label thresholds.
# Deliberately NOT named `range`: rebinding the built-in makes any later
# `range(...)` call fail with "'float' object is not callable".
# NOTE(review): the markdown above says the range is the separation score, but the
# layer-28 separation printed earlier is 0.4476, not 0.55 — confirm which is intended.
SCORE_RANGE = 0.55


def label_gender(score, score_range=None):
    """Map a gender-axis score to the nearest association label.

    Five reference points sit at -0.35, -0.2, 0, +0.2 and +0.35 times the
    score range; the label whose reference point is closest to ``score``
    is returned.

    Args:
        score: Cosine similarity with the gender direction.
        score_range: Optional override for the scale; defaults to the
            module-level ``SCORE_RANGE``.

    Returns:
        One of "strong female-association", "mild female-association",
        "neutral", "mild male-association", "strong male-association".
    """
    if score_range is None:
        score_range = SCORE_RANGE

    # Defines reference points
    reference_points = {
        "strong female-association": -0.35 * score_range,
        "mild female-association": -0.2 * score_range,
        "neutral": 0.0,
        "mild male-association": 0.2 * score_range,
        "strong male-association": 0.35 * score_range,
    }
    # Finds the label whose reference point is closest to the score
    return min(reference_points, key=lambda label: abs(score - reference_points[label]))

# Generates gender direction
# Gender direction at the helper's default layer.
gender_direction = get_gender_direction()

# One record per (category, sentence): the rounded score and its nearest label.
embedding_data = [
    {
        "category": cat_id,
        "sentence": text,
        "embedding_score": score,
        "embedding_label": label_gender(score),
    }
    for cat_id, sentence_list in women_sentences.items()
    for text, score in compute_sentence_scores(sentence_list, gender_direction)
]

# Build the DataFrame once from the collected records.
non_tuned_embedding_df = pd.DataFrame(embedding_data)

In [28]:
# Display the per-sentence scores and labels built in the previous cell.
non_tuned_embedding_df

Unnamed: 0,category,sentence,embedding_score,embedding_label
0,1,emotional and irrational,-0.0976,mild female-association
1,1,the breakup is hard on her,-0.105,mild female-association
2,1,overreacts,-0.0668,mild female-association
3,1,cries every night,-0.0771,mild female-association
4,1,spends 200€ at the hairdresser,-0.0784,mild female-association
5,1,craves attention,-0.0545,neutral
6,2,"gentle, kind, and submissive",-0.0775,mild female-association
7,2,is kind and caring,-0.0625,mild female-association
8,2,has to obey,-0.0701,mild female-association
9,2,always plays by the rules,-0.0811,mild female-association


In [None]:
# Filters out the first sentence of each category
filtered_embedding_df = non_tuned_embedding_df.groupby('category').apply(lambda x: x.iloc[1:]).reset_index(drop=True)

# Counts occurrences of each label per category
label_counts = filtered_embedding_df.groupby(['category', 'embedding_label']).size().unstack(fill_value=0)
label_counts

### Adapting Prompt Analysis
- Supported by: Brown et al. [2020](https://arxiv.org/abs/2005.14165)
  Discusses restructuring prompts for CLMs by removing masked tokens and using prefix-style prompts instead.
- Supported by: Zhao et al. [2021](https://arxiv.org/abs/2102.09690)
  Explores how to better adapt prompts for causal models.


To adapt my prompt analysis method from BERT to LLaMA 2, I modified the pipeline to suit LLaMA 2’s causal language modeling setup, which does not support masked token prediction. Instead of using the [MASK] token, I restructured the prompts as fill-in-the-blank instructions (e.g., "Fill in the blank with a pronoun: ___ is a nurse.") and retrieved the top-3 predictions for the next token using the model’s logits. I filtered out blank or non-word tokens to extract meaningful predictions, ensuring consistency with the original BERT-based method. This will allow me to compare LLaMA 2's output biases using the same evaluation framework.

### Causal Probing set up
- Inspired by: Goldfarb-Tarrant et al., 2021 (Intrinsic Probing) & Brown et al., 2020 (GPT-3 prompt sensitivity)
- Since LLaMA2 does not support masked token prediction like BERT, I adapted the method by designing fill-in-the-blank prompts (e.g., “Fill in the blank with a pronoun: ___ likes helping people.”).
- I implemented additional logic to filter out blank or punctuation-based completions and ensure that only words are returned, maintaining the top-3 most confident predictions.
- This setup allows for direct comparison with the prompt completion approach used in BERT.










In [None]:
# Fill-in-the-blank probe. The instruction must name the word class ("a pronoun"),
# as in the recorded output below and the method description above; the previous
# string read "Fill in the blank with: ..." and left the instruction incomplete.
prompt = "Fill in the blank with a pronoun: ___likes helping people."

In [None]:
# Tokenize the prompt and move the tensors to the model's device, as get_embedding
# does; feeding CPU tensors to a GPU-resident model can raise a device-mismatch error.
inputs = tokenizer(prompt, return_tensors="pt")
device = next(model.parameters()).device
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
input_ids = inputs['input_ids']

# Gets the logits for the next token only
with torch.no_grad():
    outputs = model(**inputs)
    next_token_logits = outputs.logits[0, -1]  # last token in the sequence

# Converts logits to probabilities
probs = F.softmax(next_token_logits, dim=-1)

In [None]:
# Gets top-3 predicted tokens that are words (no empty spaces or punctuation) and their scores
# Look at the 10 most likely next tokens and report the first 3 that are plain
# alphabetic words (whitespace-only and punctuation tokens are skipped).
top_k = 10
top_probs, top_indices = torch.topk(probs, top_k)

print(f"Prompt: {prompt}")
print("Top predictions:")

count = 0
for token_id, confidence in zip(top_indices.tolist(), top_probs.tolist()):
    if count >= 3:
        break

    token = tokenizer.decode(token_id).strip()
    # fullmatch with `+` already rejects the empty string, so no separate emptiness check.
    if re.fullmatch(r"[A-Za-z]+", token) is None:
        continue

    count += 1
    print(f"{count}. {token} (confidence: {confidence:.4f})")

Prompt: Fill in the blank with a pronoun: ___was compassionate.
Top predictions:
1. He (confidence: 0.0534)
2. I (confidence: 0.0325)
3. She (confidence: 0.0209)


I can wrap this into a loop to evaluate many prompts (e.g., my full stereotype list), and store the top-3 results in a DataFrame — similar to how I did it with BERT.