In [None]:
# Path to the dataset CSV file (a relative path, so it depends on the working directory).
# The CSV must contain a "text" column; rows with a missing "text" are dropped after loading.
DATASET_PATH = "../markedpersonas/data/gpt4_main_generations.csv"

# Pretrained model for sentiment analysis. The same checkpoint name is passed to both the
# tokenizer loader and the sequence-classification model loader.
SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"

# Reproducibility: seeds torch, random and numpy, and is reused as `random_state`
# for row sampling and for the t-SNE / PCA / UMAP reducers.
SEED = 42

In [None]:
# Imports and configuration
import os
import math
import json
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from tqdm.auto import tqdm
from pathlib import Path

# Import torch with a progress message (the import can be slow) and report the runtime.
print("Loading PyTorch library...")
import torch

cuda_available = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available:  {cuda_available}\n")

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if cuda_available else "cpu")

# Import transformers components
print("Loading transformers library...")
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Seed every random number generator the notebook relies on with the same SEED
# (torch, Python's `random`, and numpy's legacy global generator), in that order.
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    _seed_fn(SEED)

In [None]:
# This will take a few minutes to run
from cs7313.embeddings import EmbeddingExtractor

# This cell sits above the data-loading cell, so on a fresh kernel (Restart & Run All)
# `df` would be undefined here and the cell would raise NameError. Load the dataset here
# with exactly the same steps as the data-loading cell so `df` is identical either way.
df = pd.read_csv(DATASET_PATH)
df = df.dropna(subset=["text"]).reset_index(drop=True)

extractor = EmbeddingExtractor(SENTIMENT_MODEL)
# NOTE: `embeddings` is reassigned by the scoring cell further down (stacked CLS embeddings
# of the sampled rows), so this full-dataset result is only available until that cell runs.
embeddings = extractor(df["text"].to_numpy())
print(f"Extracted embeddings shape: {embeddings.shape}")

In [None]:
# Load the tokenizer and the classifier for the configured checkpoint.
print(f"Loading {SENTIMENT_MODEL} model...")
tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(
    SENTIMENT_MODEL,
    output_attentions=True,
).to(device)
model.eval()  # switch the module to evaluation mode before running inference

print("Model loaded successfully!")
# Class-index -> label-name mapping from the model config; displayed as the cell output.
id2label = model.config.id2label
id2label

In [None]:
# Read the generations CSV, drop rows whose "text" is missing, and renumber the index from 0.
df = (
    pd.read_csv(DATASET_PATH)
    .dropna(subset=["text"])
    .reset_index(drop=True)
)

In [None]:
# Preview the first five rows of the cleaned dataset.
df.head(n=5)

In [None]:
# Helper functions: sentiment + attention-derived importance
# Ids of the tokenizer's special tokens; these are excluded from the importance list.
SPECIAL_IDS = set(tokenizer.all_special_ids)

def run_model(text: str, max_length: int = 256):
    """Classify one text and derive per-token importance from last-layer attention.

    The text is tokenized (truncated to ``max_length`` tokens, no padding) and run
    through the classifier without gradient tracking.

    Args:
        text: The input string to score.
        max_length: Truncation limit passed to the tokenizer.

    Returns:
        A dict with:
          - "label": predicted class name looked up in ``id2label``.
          - "score": softmax probability of the predicted class (float).
          - "tokens": all tokens of the encoded input, special tokens included.
          - "importance": list of ``(token, score)`` pairs for non-special tokens only.
          - "cls_embedding": numpy vector taken from the last hidden state at position 0.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length, padding=False)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        outputs = model(**encoded, output_attentions=True, output_hidden_states=True)

    # Class probabilities for the single input in the batch.
    probs = torch.softmax(outputs.logits[0], dim=-1)
    pred_idx = int(torch.argmax(probs).item())

    # Average the heads of the last attention layer (seq x seq), then take row 0:
    # the attention paid by the first (CLS) position to every position.
    head_avg_attn = outputs.attentions[-1].mean(dim=1)[0]
    cls_row = head_avg_attn[0]
    # The row is re-normalised to sum to 1 *before* special tokens are dropped below,
    # so the scores of the remaining tokens need not sum to 1.
    cls_weights = (cls_row / cls_row.sum()).tolist()

    input_ids = encoded["input_ids"][0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    importance = [
        (tok, weight)
        for tok, weight, tok_id in zip(tokens, cls_weights, input_ids)
        if tok_id not in SPECIAL_IDS
    ]

    return {
        "label": id2label[pred_idx],
        "score": float(probs[pred_idx].item()),
        "tokens": tokens,
        "importance": importance,
        # CLS embedding for clustering
        "cls_embedding": outputs.hidden_states[-1][0, 0, :].detach().cpu().numpy(),
    }

In [None]:
# Run sentiment + attention over a subset for speed
N_EVAL = min(300, len(df))
sample_df = df.sample(n=N_EVAL, random_state=SEED).reset_index(drop=True)

# Score each sampled text once, then split the result dicts into parallel lists.
model_outputs = [
    run_model(text)
    for text in tqdm(sample_df["text"], desc="Scoring", total=len(sample_df))
]
sentiments = [out["label"] for out in model_outputs]
scores = [out["score"] for out in model_outputs]
importances = [out["importance"] for out in model_outputs]
emb_list = [out["cls_embedding"] for out in model_outputs]

# Attach the per-row results to the sampled frame.
sample_df["sentiment"] = sentiments
sample_df["sentiment_score"] = scores
sample_df["importance"] = importances
sample_df["cls_embedding"] = emb_list

# One row per sampled text; consumed by the dimensionality-reduction cell.
embeddings = np.vstack(emb_list)

# Show only the human-readable columns of the first rows.
preview_cols = [c for c in ("text", "sentiment", "sentiment_score") if c in sample_df.columns]
sample_df.head()[preview_cols]

In [None]:
# Rows the classifier labelled NEGATIVE.
# Tip: pd.set_option('display.max_colwidth', None) shows the full, untruncated text.
is_negative = sample_df["sentiment"] == "NEGATIVE"
sample_df.loc[is_negative]

In [None]:
# Aggregate token importance across the sample
from collections import Counter

# Sum each token's importance score over every sampled text.
token_scores = Counter()
for row_importance in sample_df["importance"]:
    for token, weight in row_importance:
        token_scores[token] = token_scores[token] + weight

# Keep the 25 tokens with the largest summed importance.
top_tokens = token_scores.most_common(25)
importance_df = pd.DataFrame(top_tokens, columns=["token", "importance"])

fig = px.bar(
    importance_df,
    x="importance",
    y="token",
    orientation="h",
    title="Top tokens by attention importance",
)
fig.update_layout(height=600)
fig.show()
importance_df.head()

In [None]:
# Attention heatmap for a single example
example_idx = 0
example_text = sample_df.loc[example_idx, "text"]
example_out = run_model(example_text)
# Example-specific names: the scoring cell already owns the global `scores` list
# (per-row sentiment scores), so it must not be overwritten here.
example_tokens = [tok for tok, _ in example_out["importance"]]
example_scores = [score for _, score in example_out["importance"]]

# Widen the figure with the token count so the rotated tick labels stay legible.
heatmap_fig, ax = plt.subplots(figsize=(max(12, len(example_tokens) * 0.4), 2.5))
sns.heatmap(
    np.array([example_scores]),
    cmap="viridis",
    cbar=True,
    xticklabels=example_tokens,
    # Fixed mojibake: the label was a mis-decoded UTF-8 arrow ("â†’").
    yticklabels=["CLS → token"],
    vmin=0.0,
    # Fall back to 1.0 when the example produced no non-special tokens.
    vmax=max(example_scores) if example_scores else 1.0,
    ax=ax,
)
ax.tick_params(axis="x", labelrotation=90)
ax.set_title("Last-layer attention from CLS to tokens")
plt.show()

In [None]:
# Clustering the CLS embeddings to inspect geometry vs sentiment
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import umap

# Guard against stale kernel state: `embeddings` is also assigned by the EmbeddingExtractor
# cell (one row per dataset row). The projections below are written back into `sample_df`,
# so the matrix must have exactly one row per sampled text. Failing here is cheaper than
# failing after t-SNE has already run on the wrong matrix.
assert len(embeddings) == len(sample_df), (
    f"embeddings has {len(embeddings)} rows but sample_df has {len(sample_df)}; "
    "re-run the scoring cell before this one"
)

print("Applying dimensionality reduction on CLS embeddings...")
# t-SNE requires perplexity < n_samples. Keep the original value of 30 whenever the sample
# is large enough, and shrink it for small samples so the cell still runs.
tsne_perplexity = min(30, max(1, len(embeddings) - 1))
tsne = TSNE(n_components=2, random_state=SEED, perplexity=tsne_perplexity, max_iter=800)
emb_tsne = tsne.fit_transform(embeddings)

pca = PCA(n_components=2, random_state=SEED)
emb_pca = pca.fit_transform(embeddings)

reducer = umap.UMAP(n_components=2, random_state=SEED)
emb_umap = reducer.fit_transform(embeddings)

# Store the 2-D coordinates of each projection next to the rows they describe.
sample_df["tsne_x"], sample_df["tsne_y"] = emb_tsne[:, 0], emb_tsne[:, 1]
sample_df["pca_x"], sample_df["pca_y"] = emb_pca[:, 0], emb_pca[:, 1]
sample_df["umap_x"], sample_df["umap_y"] = emb_umap[:, 0], emb_umap[:, 1]

fig = px.scatter(
    sample_df,
    x="umap_x",
    y="umap_y",
    color="sentiment",
    hover_data=["sentiment_score", "text"],
    title="UMAP of CLS embeddings colored by sentiment",
    width=900,
    height=650,
)
fig.update_traces(marker=dict(size=7, opacity=0.75))
fig.show()

In [None]:
# Simple identity marker probe: do attention scores spike when markers appear?
import re

identity_markers = [
    "woman", "man", "female", "male", "girl", "boy",
    "black", "white", "asian", "latino", "arab", "indian",
    "christian", "muslim", "jewish", "atheist", "gay", "queer", "trans"
]


def _merge_wordpieces(tok_scores):
    """Merge "##"-prefixed continuation pieces into whole words, summing their scores.

    Args:
        tok_scores: list of ``(token, score)`` pairs in input order.

    Returns:
        list of ``(word, score)`` pairs where each continuation piece has been glued
        onto the preceding token and its score added to that token's score.
        Assumes the tokenizer marks continuations with a "##" prefix (WordPiece);
        tokens without that prefix pass through unchanged.
    """
    words = []
    for tok, score in tok_scores:
        if tok.startswith("##") and words:
            prev_word, prev_score = words[-1]
            words[-1] = (prev_word + tok[2:], prev_score + score)
        else:
            words.append((tok, score))
    return words


# Marker-independent work is done once per row instead of once per (marker, row) pair.
lower_texts = [str(text).lower() for text in sample_df["text"]]
word_scores_per_row = [_merge_wordpieces(tok_scores) for tok_scores in sample_df["importance"]]

marker_rows = []
for marker in identity_markers:
    # Whole-word matching. Plain substring tests counted "man" inside "woman"/"human",
    # "male" inside "female", "asian" inside "caucasian", "trans" inside "translate", etc.
    marker_pattern = re.compile(rf"\b{re.escape(marker)}\b")
    total_score = 0.0
    mention_rows = 0
    for lower_text, word_scores in zip(lower_texts, word_scores_per_row):
        if marker_pattern.search(lower_text) is None:
            continue
        mention_rows += 1
        # Only words that are exactly the marker contribute attention mass.
        total_score += sum(score for word, score in word_scores if word.lower() == marker)
    marker_rows.append({"marker": marker, "mentions": mention_rows, "token_importance": total_score})
marker_df = pd.DataFrame(marker_rows)
marker_df = marker_df.sort_values(by="token_importance", ascending=False)

# NOTE: `token_importance` is a sum over rows, so it grows with how often a marker is
# mentioned; compare it against the `mentions` column shown on hover.
fig = px.bar(marker_df, x="token_importance", y="marker", orientation="h", title="Attention mass on identity markers (higher = model focuses more)", hover_data=["mentions"])
fig.update_layout(height=650)
fig.show()
marker_df.head(10)

# Notes and next steps
- Raise `N_EVAL` (the cap on how many rows are sampled for scoring, currently 300; this notebook defines no separate `MAX_ROWS`) if you have GPU time to cover more generations.
- Swap `SENTIMENT_MODEL` for domain-specific checkpoints if needed; ensure they expose attentions.
- Add more identity markers or regex patterns to `identity_markers` for targeted probes.
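- Before executing cells, install the dependencies in your environment: `pip install torch transformers datasets pandas numpy matplotlib seaborn plotly umap-learn scikit-learn tqdm`. The embedding-extraction cell additionally needs the local `cs7313` package. For reproducibility, pin the package versions you ran with.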