# RoBERTa Embedding Generation

Generate semantic embeddings for all unique survey questions using locally cached RoBERTa-large model.

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaModel
from tqdm.auto import tqdm
import pickle
from pathlib import Path
import time

# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

Using device: cpu


## Load Data

In [2]:
print("Loading data...")
df = pd.read_csv('../data/processed/survey_questions_cleaned.csv')

print(f"Total question-survey pairs: {len(df):,}")
print(f"Unique questions: {df['question_id'].nunique():,}")
print(f"Unique surveys: {df['survey'].nunique()}")

Loading data...
Total question-survey pairs: 6,732
Unique questions: 6,296
Unique surveys: 45


In [3]:
# Get unique questions to embed
# Filter out any rows with NaN question_text
print("\nPreparing unique questions...")
unique_questions = df[['question_id', 'question_text']].dropna(subset=['question_text']).drop_duplicates('question_id').sort_values('question_id')

# Ensure question_text is string type
unique_questions['question_text'] = unique_questions['question_text'].astype(str)

print(f"Questions to embed: {len(unique_questions):,}")
print(f"\nSample questions:")
print(unique_questions.head(10))


Preparing unique questions...
Questions to embed: 6,295

Sample questions:
   question_id                                      question_text
0            1  At any time during this school year did you at...
1            2  At any time during this school year did you re...
2            3  At any time during this school year did you re...
3            4  At any time during this school year did you at...
4            5  Did you receive [online schooling or virtual l...
5            6  For this next question, I’m going to read a li...
6            7  How many different schools have you attended t...
7            8  During the time you were homeschooled this sch...
8            9                             What grade are you in?
9           10  In what month did your current school year begin?


## Load RoBERTa Model

In [4]:
# Load model and tokenizer from local cache
model_path = '../models/roberta-large'

print(f"Loading RoBERTa-large from {model_path}...")
start_time = time.time()

tokenizer = RobertaTokenizer.from_pretrained(model_path, local_files_only=True)
print("  ✓ Tokenizer loaded")

model = RobertaModel.from_pretrained(model_path, local_files_only=True)
print("  ✓ Model loaded")

# Move model to GPU if available
model.to(device)
model.eval()
print(f"  ✓ Model moved to {device}")

load_time = time.time() - start_time
print(f"\nModel ready! (loaded in {load_time:.1f}s)")
print(f"Embedding dimension: {model.config.hidden_size}")

Loading RoBERTa-large from ../models/roberta-large...
  ✓ Tokenizer loaded
  ✓ Model loaded
  ✓ Model moved to cpu

Model ready! (loaded in 0.2s)
Embedding dimension: 1024


## Generate Embeddings

In [5]:
def get_embeddings(texts, batch_size=32):
    """
    Generate RoBERTa embeddings for a list of texts.
    Uses mean pooling of last hidden states.
    """
    embeddings = []
    
    # Ensure all texts are strings
    texts = [str(text) if text is not None else '' for text in texts]
    
    num_batches = (len(texts) + batch_size - 1) // batch_size
    
    with torch.no_grad():
        pbar = tqdm(range(0, len(texts), batch_size), 
                    desc="Generating embeddings",
                    total=num_batches,
                    unit="batch")
        
        for i in pbar:
            batch_texts = texts[i:i+batch_size]
            
            # Update progress bar with current batch info
            current_q = min(i + batch_size, len(texts))
            pbar.set_postfix({"questions": f"{current_q}/{len(texts)}"})
            
            # Tokenize
            encoded = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            )
            
            # Move to device
            input_ids = encoded['input_ids'].to(device)
            attention_mask = encoded['attention_mask'].to(device)
            
            # Get model outputs
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            
            # Mean pooling
            last_hidden = outputs.last_hidden_state
            attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
            sum_embeddings = torch.sum(last_hidden * attention_mask_expanded, 1)
            sum_mask = torch.clamp(attention_mask_expanded.sum(1), min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            
            # Move to CPU and convert to numpy
            embeddings.append(mean_embeddings.cpu().numpy())
    
    return np.vstack(embeddings)

In [6]:
# Generate embeddings for all unique questions
print(f"\n{'='*70}")
print(f"EMBEDDING GENERATION")
print(f"{'='*70}")
print(f"Total questions: {len(unique_questions):,}")
print(f"Batch size: 32")
print(f"Device: {device}")
print(f"{'='*70}\n")

start_time = time.time()
question_texts = unique_questions['question_text'].tolist()
embeddings = get_embeddings(question_texts, batch_size=32)
embedding_time = time.time() - start_time

print(f"\n{'='*70}")
print(f"COMPLETED!")
print(f"{'='*70}")
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embedding dimension: {embeddings.shape[1]}")
print(f"Total time: {embedding_time/60:.1f} minutes")
print(f"Average: {embedding_time/len(embeddings):.3f} seconds per question")
print(f"{'='*70}")


EMBEDDING GENERATION
Total questions: 6,295
Batch size: 32
Device: cpu



Generating embeddings:   0%|          | 0/197 [00:00<?, ?batch/s]


COMPLETED!
Embeddings shape: (6295, 1024)
Embedding dimension: 1024
Total time: 6.4 minutes
Average: 0.061 seconds per question


## Verify Embeddings

In [7]:
# Basic sanity checks
print("\nEmbedding statistics:")
print(f"Mean: {embeddings.mean():.4f}")
print(f"Std: {embeddings.std():.4f}")
print(f"Min: {embeddings.min():.4f}")
print(f"Max: {embeddings.max():.4f}")
print(f"\nData quality checks:")
print(f"Any NaN values: {np.isnan(embeddings).any()}")
print(f"Any Inf values: {np.isinf(embeddings).any()}")


Embedding statistics:
Mean: -0.0316
Std: 0.9592
Min: -31.3662
Max: 2.3050

Data quality checks:
Any NaN values: False
Any Inf values: False


In [8]:
# Test similarity between a few questions
from sklearn.metrics.pairwise import cosine_similarity

print("\nTesting semantic similarity...")

# Pick a few sample questions
sample_indices = [0, 100, 200, 300, 400]
sample_texts = [question_texts[i][:80] for i in sample_indices]
sample_embeddings = embeddings[sample_indices]

# Compute pairwise similarities
similarities = cosine_similarity(sample_embeddings)

print("\nSample similarity matrix:")
print("Questions:")
for i, text in enumerate(sample_texts):
    print(f"{i}: {text}...")
print("\nCosine similarities:")
print(similarities.round(3))


Testing semantic similarity...

Sample similarity matrix:
Questions:
0: At any time during this school year did you attend a public or private school in...
1: 20. Has this happened during the past 12 months, that is from [AUTOFILL DATE 1st...
2: ACTION_SECURITY Have you taken self-defensive actions or other security measures...
3: How often does this child’s health insurance offer benefits or cover services th...
4: Has a doctor or other health care provider EVER told you that this child has Aut...

Cosine similarities:
[[1.    0.993 0.993 0.994 0.993]
 [0.993 1.    0.993 0.991 0.992]
 [0.993 0.993 1.    0.993 0.992]
 [0.994 0.991 0.993 1.    0.994]
 [0.993 0.992 0.992 0.994 1.   ]]


## Save Embeddings

In [9]:
print("\nSaving embeddings...")

# Create embeddings directory if it doesn't exist
output_dir = Path('../data/processed/embeddings')
output_dir.mkdir(parents=True, exist_ok=True)

# Save embeddings as numpy array
embeddings_path = output_dir / 'question_embeddings.npy'
np.save(embeddings_path, embeddings)
print(f"  ✓ Embeddings saved to: {embeddings_path}")

# Save question ID mapping
mapping_df = unique_questions[['question_id', 'question_text']].copy()
mapping_df['embedding_index'] = range(len(mapping_df))
mapping_path = output_dir / 'question_id_mapping.csv'
mapping_df.to_csv(mapping_path, index=False)
print(f"  ✓ Question mapping saved to: {mapping_path}")

# Also save as pickle for convenience
embedding_dict = {
    'embeddings': embeddings,
    'question_ids': unique_questions['question_id'].values,
    'question_texts': unique_questions['question_text'].values,
    'model': 'roberta-large',
    'embedding_dim': embeddings.shape[1],
    'num_questions': len(embeddings)
}
pickle_path = output_dir / 'embeddings_with_metadata.pkl'
with open(pickle_path, 'wb') as f:
    pickle.dump(embedding_dict, f)
print(f"  ✓ Pickle file saved to: {pickle_path}")


Saving embeddings...
  ✓ Embeddings saved to: ../data/processed/embeddings/question_embeddings.npy
  ✓ Question mapping saved to: ../data/processed/embeddings/question_id_mapping.csv
  ✓ Pickle file saved to: ../data/processed/embeddings/embeddings_with_metadata.pkl


## Summary

In [10]:
print(f"\n{'='*70}")
print("EMBEDDING GENERATION SUMMARY")
print(f"{'='*70}")
print(f"\nModel: RoBERTa-large")
print(f"Device: {device}")
print(f"\nQuestions embedded: {len(embeddings):,}")
print(f"Embedding dimension: {embeddings.shape[1]}")
print(f"Total size: {embeddings.nbytes / 1024 / 1024:.1f} MB")
print(f"\nProcessing time: {embedding_time/60:.1f} minutes")
print(f"Average: {embedding_time/len(embeddings):.3f}s per question")
print(f"\nOutput files:")
print(f"  - {embeddings_path}")
print(f"  - {mapping_path}")
print(f"  - {pickle_path}")
print(f"\nNext step: Clustering analysis (notebook 03)")
print(f"{'='*70}")


EMBEDDING GENERATION SUMMARY

Model: RoBERTa-large
Device: cpu

Questions embedded: 6,295
Embedding dimension: 1024
Total size: 24.6 MB

Processing time: 6.4 minutes
Average: 0.061s per question

Output files:
  - ../data/processed/embeddings/question_embeddings.npy
  - ../data/processed/embeddings/question_id_mapping.csv
  - ../data/processed/embeddings/embeddings_with_metadata.pkl

Next step: Clustering analysis (notebook 03)
