 # Notebook 7: Semantic Embeddings

 ## Purpose
 - Generate BERT embeddings for articles
 - Calculate semantic similarity scores
 - Demographic group distance in embedding space
 - Clustering analysis
 - Dimensionality reduction (PCA/UMAP)

 ## Outputs
 - `data/processed/embeddings.h5`
 - `data/processed/semantic_features.parquet`

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import h5py

print("Libraries imported")

  from .autonotebook import tqdm as notebook_tqdm


Libraries imported


In [2]:
BASE_DIR = Path('..')
PROCESSED_DIR = BASE_DIR / 'data' / 'processed'
INPUT_FILE = PROCESSED_DIR / 'articles_with_events.csv'
EMBEDDING_FILE = PROCESSED_DIR / 'embeddings_sample.h5'
OUTPUT_FILE = PROCESSED_DIR / 'semantic_features.parquet'

In [3]:
print("Loading data...")
df = pd.read_csv(INPUT_FILE)
print(f"Loaded {len(df):,} articles")

Loading data...
Loaded 49,926 articles


  df = pd.read_csv(INPUT_FILE)


In [4]:
print("Loading BERT model...")
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = torch.device('mps')
model.to(device)
print(f"Model loaded on {device}")

Loading BERT model...
Model loaded on mps


In [5]:
def get_bert_embedding(text, max_length=512):
    '''Generate BERT embedding for text'''
    if not text:
        return np.zeros(768)
    
    inputs = tokenizer(text[:1000], return_tensors='pt', 
                      max_length=max_length, truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
    return embedding

In [6]:
print("Generating embeddings (this will take time)...")
embeddings_list = []
batch_size = 100

for i in range(0, len(df), batch_size):
    batch = df['article_text'].iloc[i:i+batch_size]
    batch_embeddings = [get_bert_embedding(text) for text in batch]
    embeddings_list.extend(batch_embeddings)
    
    if (i+batch_size) % 1000 == 0:
        print(f"  Processed {i+batch_size:,}/{len(df):,} articles")

embeddings = np.array(embeddings_list)
print(f"Generated {len(embeddings)} embeddings of shape {embeddings.shape}")

Generating embeddings (this will take time)...
  Processed 1,000/49,926 articles
  Processed 2,000/49,926 articles
  Processed 3,000/49,926 articles
  Processed 4,000/49,926 articles
  Processed 5,000/49,926 articles
  Processed 6,000/49,926 articles
  Processed 7,000/49,926 articles
  Processed 8,000/49,926 articles
  Processed 9,000/49,926 articles
  Processed 10,000/49,926 articles
  Processed 11,000/49,926 articles
  Processed 12,000/49,926 articles
  Processed 13,000/49,926 articles
  Processed 14,000/49,926 articles
  Processed 15,000/49,926 articles
  Processed 16,000/49,926 articles
  Processed 17,000/49,926 articles
  Processed 18,000/49,926 articles
  Processed 19,000/49,926 articles
  Processed 20,000/49,926 articles
  Processed 21,000/49,926 articles
  Processed 22,000/49,926 articles
  Processed 23,000/49,926 articles
  Processed 24,000/49,926 articles
  Processed 25,000/49,926 articles
  Processed 26,000/49,926 articles
  Processed 27,000/49,926 articles
  Processed 28,00

In [7]:
print("Saving embeddings to HDF5...")
with h5py.File(EMBEDDING_FILE, 'w') as f:
    f.create_dataset('embeddings', data=embeddings)
    f.create_dataset('urls', data=df['url'].astype(str).values)
print(f"Saved to {EMBEDDING_FILE}")

Saving embeddings to HDF5...
Saved to ../data/processed/embeddings_sample.h5


In [8]:
print("Performing PCA dimensionality reduction...")
pca = PCA(n_components=50)
embeddings_pca = pca.fit_transform(embeddings)
print(f"PCA complete: {embeddings_pca.shape}")
print(f"  Explained variance: {pca.explained_variance_ratio_.sum():.2%}")

Performing PCA dimensionality reduction...
PCA complete: (49926, 50)
  Explained variance: 69.06%


In [9]:
print("Performing clustering...")
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(embeddings_pca)
df['semantic_cluster'] = clusters
print(f"Clustering complete: {n_clusters} clusters")

Performing clustering...
Clustering complete: 10 clusters


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
print("Calculating semantic features...")
cluster_centers = kmeans.cluster_centers_
df['distance_to_cluster_center'] = [
    np.linalg.norm(embeddings_pca[i] - cluster_centers[clusters[i]])
    for i in range(len(embeddings_pca))
]

# Save semantic features
semantic_features = df[['url', 'semantic_cluster', 'distance_to_cluster_center']].copy()
semantic_features.to_parquet(OUTPUT_FILE, index=False)
print(f"Saved semantic features to {OUTPUT_FILE}")

Calculating semantic features...
Saved semantic features to ../data/processed/semantic_features.parquet
