# Week 3 Experiment: Sentence-Transformer Embeddings and Cosine Similarity

This notebook:
1. Loads the 300-statement CSV (`week-3-experiment.csv`), whose `label` column takes the values `physics`, `anthropology`, and `politics`.
2. Embeds **text only** using a BERT-based SentenceTransformer model.
3. Computes pairwise cosine similarities with explicit double `for` loops.
4. Plots within-class and cross-class similarity histograms.
5. Visualizes embeddings with PCA (2D) colored by class.

In [2]:
 %pip install -q sentence-transformers scikit-learn seaborn

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Dependencies (sentence-transformers, scikit-learn, seaborn) are installed by the
# `%pip install` cell above; restart the kernel after installing if these imports fail.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# Apply seaborn's 'whitegrid' plot style to the figures drawn in this notebook.
sns.set_style('whitegrid')
# Seed handed to PCA(random_state=...) in the projection cell.
RANDOM_STATE = 42

In [1]:
# Read the labelled statements and fail fast if an expected column is absent.
csv_path = 'week-3-experiment.csv'
df = pd.read_csv(csv_path)

required_cols = {'label', 'text'}
missing = required_cols.difference(df.columns)
if len(missing) > 0:
    raise ValueError(f'Missing required columns: {missing}')

# Quick sanity report: overall size, per-label counts, then a preview of the rows.
label_counts = df['label'].value_counts()
print('Dataset shape:', df.shape)
print('Label counts:')
print(label_counts)
df.head()

NameError: name 'pd' is not defined

In [None]:
# Embed each statement with a BERT-based sentence-transformer checkpoint.
model_name = 'sentence-transformers/bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

texts = df['text'].to_list()

# Ask the encoder for NumPy output, normalized embeddings, and a progress bar.
encode_options = {
    'convert_to_numpy': True,
    'normalize_embeddings': True,
    'show_progress_bar': True,
}
embeddings = model.encode(texts, **encode_options)

# Keep one embedding per row so each record reads <label, text, vector>.
df['vector'] = [row for row in embeddings]
print('Embeddings shape:', embeddings.shape)
df.loc[:, ['label', 'text', 'vector']].head(3)

In [None]:
def pairwise_cosine_within_class(vectors: np.ndarray) -> np.ndarray:
    """Compute cosine similarity for every unordered pair of rows in ``vectors``.

    Pairs are enumerated with explicit double for-loops over ``i < j``, so ``n``
    vectors yield ``n * (n - 1) / 2`` values ordered (0, 1), (0, 2), ..., (n-2, n-1).

    Each dot product is divided by the product of the two vector norms, so the
    result is a true cosine similarity even when the inputs are not unit-length.

    Args:
        vectors: Sequence of equal-length 1-D vectors (e.g. a 2-D array with one
            vector per row).

    Returns:
        1-D float array of pairwise similarities; empty when fewer than two
        vectors are given. A pair that involves a zero-norm vector yields
        ``nan`` because its cosine similarity is undefined.
    """
    n = len(vectors)
    # Norms do not depend on the pairing, so compute each one exactly once.
    norms = [float(np.linalg.norm(vectors[i])) for i in range(n)]
    sims = []
    for i in range(n):
        for j in range(i + 1, n):
            denom = norms[i] * norms[j]
            if denom == 0.0:
                # Avoid a division-by-zero warning; the value is undefined.
                sims.append(float('nan'))
                continue
            sims.append(float(np.dot(vectors[i], vectors[j])) / denom)
    return np.array(sims, dtype=float)

def pairwise_cosine_across_classes(v1: np.ndarray, v2: np.ndarray) -> np.ndarray:
    """Compute cosine similarity between every vector in ``v1`` and every vector in ``v2``.

    Pairs are enumerated with explicit double for-loops, ``v1`` index outermost,
    so the output has ``len(v1) * len(v2)`` values ordered (0, 0), (0, 1), ...

    Each dot product is divided by the product of the two vector norms, so the
    result is a true cosine similarity even when the inputs are not unit-length.

    Args:
        v1: Sequence of 1-D vectors for the first class.
        v2: Sequence of 1-D vectors for the second class, same dimensionality
            as the vectors in ``v1``.

    Returns:
        1-D float array of cross-class similarities; empty when either input
        is empty. A pair that involves a zero-norm vector yields ``nan``
        because its cosine similarity is undefined.
    """
    n1, n2 = len(v1), len(v2)
    # Norms do not depend on the pairing, so compute each one exactly once.
    norms1 = [float(np.linalg.norm(v1[i])) for i in range(n1)]
    norms2 = [float(np.linalg.norm(v2[j])) for j in range(n2)]
    sims = []
    for i in range(n1):
        for j in range(n2):
            denom = norms1[i] * norms2[j]
            if denom == 0.0:
                # Avoid a division-by-zero warning; the value is undefined.
                sims.append(float('nan'))
                continue
            sims.append(float(np.dot(v1[i], v2[j])) / denom)
    return np.array(sims, dtype=float)

In [None]:
# Collect the stored embeddings into one stacked array per class label.
labels = sorted(df['label'].unique())
vectors_by_label = {}
for label in labels:
    class_rows = df.loc[df['label'] == label, 'vector']
    vectors_by_label[label] = np.stack(class_rows.values)

# Pairwise similarities among the members of each class.
within_class_sims = {
    label: pairwise_cosine_within_class(vectors_by_label[label])
    for label in labels
}

# One histogram panel per class, sharing the y-axis for easy comparison.
n_panels = len(labels)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 4), sharey=True)
axes = [axes] if n_panels == 1 else axes

for ax, label in zip(axes, labels):
    sims = within_class_sims[label]
    ax.hist(sims, bins=40, alpha=0.8, color='steelblue', edgecolor='black')
    ax.set(
        title=f'Within-class: {label}',
        xlabel='Cosine similarity',
        ylabel='Frequency',
    )

plt.tight_layout()
plt.show()

# Summary table: number of pairs plus mean / std of similarity for each class.
class_sims = [within_class_sims[name] for name in labels]
pd.DataFrame({
    'label': labels,
    'pairs': [len(values) for values in class_sims],
    'mean': [values.mean() for values in class_sims],
    'std': [values.std() for values in class_sims],
})

In [None]:
# Similarities between every unordered pair of distinct classes, keyed 'a vs b'.
cross_class_sims = {}
for i in range(len(labels)):
    for j in range(i + 1, len(labels)):
        l1, l2 = labels[i], labels[j]
        key = f'{l1} vs {l2}'
        cross_class_sims[key] = pairwise_cosine_across_classes(vectors_by_label[l1], vectors_by_label[l2])

if cross_class_sims:
    # squeeze=False always returns a 2-D array of Axes, so one pair and many
    # pairs are handled by the same code path.
    fig, axes = plt.subplots(
        1,
        len(cross_class_sims),
        figsize=(6 * len(cross_class_sims), 4),
        sharey=True,
        squeeze=False,
    )

    for ax, (pair_name, sims) in zip(axes.ravel(), cross_class_sims.items()):
        ax.hist(sims, bins=40, alpha=0.8, color='darkorange', edgecolor='black')
        ax.set_title(f'Cross-class: {pair_name}')
        ax.set_xlabel('Cosine similarity')
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()
else:
    # With fewer than two labels there are no class pairs, and asking
    # matplotlib for a zero-column grid would raise ValueError.
    print('Fewer than two labels found; no cross-class pairs to plot.')

# Summary table: number of pairs plus mean / std of similarity for each class pair.
pd.DataFrame({
    'pair': list(cross_class_sims.keys()),
    'pairs': [len(v) for v in cross_class_sims.values()],
    'mean': [v.mean() for v in cross_class_sims.values()],
    'std': [v.std() for v in cross_class_sims.values()],
})

In [None]:
# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2, random_state=RANDOM_STATE)
coords = pca.fit_transform(embeddings)

# Pair each 2-D coordinate with its class label for plotting.
viz_df = pd.DataFrame({
    'pc1': coords[:, 0],
    'pc2': coords[:, 1],
    'label': df['label'],
})

# Scatter plot of the projection, one colour per class.
fig, ax = plt.subplots(figsize=(9, 7))
sns.scatterplot(data=viz_df, x='pc1', y='pc2', hue='label', alpha=0.85, s=45, ax=ax)
ax.set_title('PCA projection of text embeddings')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.legend(title='Label')
fig.tight_layout()
plt.show()

print('Explained variance ratio:', pca.explained_variance_ratio_)