# Embedding Extraction for Phase 4 Training

Extracts a DeBERTa-v3-base embedding (the hidden state of the first, CLS, token) for the text of every segment.

**Prerequisites:**
- `/workspace/data/training_data_embedded.parquet` (if it is missing, the notebook falls back to `/workspace/data/base_manifest_db.parquet`)
- GPU instance (RTX 4090 recommended)

**Output:**
- `/workspace/data/training_data_with_embeddings.parquet`: the input data with a 768-dim `embedding` column added

**Expected time:** ~5-10 minutes on RTX 4090

## 1. Install Dependencies

In [None]:
# Install dependencies (safetensors avoids torch.load security issue).
# Notebook shell magic: runs pip quietly (-q) in the kernel's environment.
# torch and numpy are imported by later cells but are not installed here;
# NOTE(review): presumably they are preinstalled on the GPU image (numpy also
# arrives as a pandas dependency) - confirm on a fresh instance.
!pip install -q transformers pandas pyarrow tqdm sentencepiece tiktoken protobuf safetensors

## 2. Check GPU

In [None]:
import torch

# Report the PyTorch build and whether a CUDA device is visible to it.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")

if not cuda_ready:
    print("WARNING: No GPU detected. Embedding extraction will be slow.")
else:
    # Device 0 is the only device inspected; memory is shown in GB (bytes / 1e9).
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

## 3. Configuration

In [None]:
from pathlib import Path

# Model - must match Phase 2 training
MODEL_NAME = "microsoft/deberta-v3-base"

# Number of texts per forward pass - reduce if OOM
BATCH_SIZE = 32

# Max sequence length handed to the tokenizer
MAX_LENGTH = 512

# Input candidates in priority order: the large file first, then the base manifest.
_INPUT_CANDIDATES = (
    Path("/workspace/data/training_data_embedded.parquet"),
    Path("/workspace/data/base_manifest_db.parquet"),
)

# Only the first candidate is probed; when it is missing the second one is
# used as-is, without checking that it exists.
INPUT_PATH = (
    _INPUT_CANDIDATES[0]
    if _INPUT_CANDIDATES[0].exists()
    else _INPUT_CANDIDATES[1]
)

OUTPUT_PATH = Path("/workspace/data/training_data_with_embeddings.parquet")

# Echo the effective settings so they are recorded in the notebook output.
print(f"Input: {INPUT_PATH}")
print(f"Output: {OUTPUT_PATH}")
print(f"Model: {MODEL_NAME}")
print(f"Batch size: {BATCH_SIZE}")

## 4. Load Data

In [None]:
import pandas as pd

# Read the whole input table into memory.
print(f"Loading {INPUT_PATH}...")
df = pd.read_parquet(INPUT_PATH)

column_index = df.columns
print(f"Loaded {len(df)} segments")
print(f"\nColumns: {len(column_index)}")

# A pre-existing 'embedding' column is only reported here; nothing is skipped.
if 'embedding' not in column_index:
    print("\nNo embeddings found - will extract.")
else:
    print("\nWARNING: 'embedding' column already exists!")
    print(f"Shape: {df['embedding'].iloc[0].shape}")
    print("You may want to skip re-extraction.")

# List every column whose name contains 'rainbow_color'.
rainbow_cols = [name for name in column_index if 'rainbow_color' in name]
print(f"\nRainbow columns: {rainbow_cols}")

## 5. Prepare Text Data

In [None]:
# Determine text column - prefer 'concept' over 'lyric_text'.
# 'concept' wins as soon as it holds at least one non-null value; rows where
# the chosen column is null are embedded as empty strings (see fillna below).
if 'concept' in df.columns and df['concept'].notna().any():
    text_col = 'concept'
elif 'lyric_text' in df.columns:
    text_col = 'lyric_text'
else:
    raise ValueError("No text column found!")

print(f"Using text column: '{text_col}'")

# Get texts, replacing NaN with empty string
texts = df[text_col].fillna("").astype(str).tolist()

# Stats: a text counts as empty when it is blank after stripping whitespace
non_empty = sum(1 for t in texts if t.strip())
print(f"Total texts: {len(texts)}")
print(f"Non-empty: {non_empty}")
print(f"Empty: {len(texts) - non_empty}")

# Preview the first, middle and last text, truncated to 100 characters.
# Guarded so that an empty dataset does not raise IndexError.
print("\nSample texts:")
if texts:
    for idx in (0, len(texts) // 2, -1):
        sample = texts[idx]
        preview = sample[:100] + "..." if len(sample) > 100 else sample
        print(f"  [{idx}]: {preview}")
else:
    print("  (no texts to preview)")

## 6. Load Model

In [None]:
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when CUDA is available, otherwise on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Loading model (using safetensors)...")
# Use safetensors format to avoid torch.load security issue
model = AutoModel.from_pretrained(MODEL_NAME, use_safetensors=True).to(device)
# Switch to evaluation mode; the model is only used for forward passes here.
model.eval()

total_params = sum(param.numel() for param in model.parameters())
print("\nModel loaded!")
print(f"Hidden size: {model.config.hidden_size}")
print(f"Parameters: {total_params:,}")

## 7. Extract Embeddings

In [None]:
from tqdm import tqdm
import numpy as np

# One first-token vector per input text, collected in input order.
embeddings = []
num_batches = (len(texts) + BATCH_SIZE - 1) // BATCH_SIZE

print(f"Extracting embeddings for {len(texts)} texts in {num_batches} batches...")

# Gradients are never needed for feature extraction.
with torch.no_grad():
    for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Extracting"):
        batch_texts = texts[i:i+BATCH_SIZE]

        # Tokenize. padding=True pads only to the longest sequence in this
        # batch (still capped at MAX_LENGTH by truncation) instead of always
        # padding to MAX_LENGTH, so short texts no longer cost a full-length
        # forward pass.
        # NOTE(review): pad positions are excluded by the attention mask, so
        # the vectors should equal the max_length-padded ones up to
        # floating-point noise - confirm if bit-exact parity with earlier
        # runs is required.
        encoded = tokenizer(
            batch_texts,
            max_length=MAX_LENGTH,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

        # Move to device
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # CLS token embedding (first token), copied to host memory
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)

# Drop the references to the last batch's tensors, then release cached GPU
# memory once. Doing this per batch forces the allocator to re-request memory
# from the driver on every iteration and only slows the loop down.
input_ids = attention_mask = outputs = None
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print(f"\nExtracted {len(embeddings)} embeddings")
# Guarded: with no input texts there is no first embedding to inspect.
if embeddings:
    print(f"Embedding shape: {embeddings[0].shape}")

## 8. Add Embeddings to DataFrame

In [None]:
print("Adding embeddings to dataframe...")

# Store one array per row in a single column (parquet-compatible)
df['embedding'] = embeddings

# Report the resulting column count and the dtype pandas chose for the column.
column_total = len(df.columns)
embedding_dtype = df['embedding'].dtype
print(f"DataFrame now has {column_total} columns")
print(f"Embedding column dtype: {embedding_dtype}")

## 9. Save to Parquet

In [None]:
print(f"Saving to {OUTPUT_PATH}...")

# Create the destination directory (and any missing parents) before writing.
output_dir = OUTPUT_PATH.parent
output_dir.mkdir(parents=True, exist_ok=True)

# Write the full dataframe, including the new embedding column.
df.to_parquet(OUTPUT_PATH)

# Report the on-disk size; size_gb is bytes / 1024**3 and is reused by the summary cell.
bytes_on_disk = OUTPUT_PATH.stat().st_size
size_gb = bytes_on_disk / (1024**3)
print(f"\nSaved! File size: {size_gb:.2f} GB")

## 10. Verify Output

In [None]:
print("Verifying saved file...")

# Read the file back from disk so the checks reflect what was actually written.
df_check = pd.read_parquet(OUTPUT_PATH)
saved_columns = df_check.columns

print(f"Rows: {len(df_check)}")
print(f"Columns: {len(saved_columns)}")
print(f"Has 'embedding': {'embedding' in saved_columns}")

if 'embedding' in saved_columns:
    # Only the first row's vector is inspected.
    emb = df_check['embedding'].iloc[0]
    emb_array = np.array(emb)
    print(f"Embedding shape: {emb_array.shape}")
    print(f"Embedding dtype: {emb_array.dtype}")
    print(f"Embedding sample: {emb_array[:5]}")

# Check that the rainbow columns survived the round trip
rainbow_cols = [name for name in saved_columns if 'rainbow_color' in name]
print(f"\nRainbow columns: {rainbow_cols}")

print("\nVerification complete!")

## 11. Rename Output (Optional)

If you want to replace the original file:

In [None]:
# Uncomment to rename
# When enabled, the lines below (1) move an existing FINAL_PATH aside to
# BACKUP_PATH and then (2) move OUTPUT_PATH to FINAL_PATH.
# NOTE(review): a file already present at BACKUP_PATH would presumably be
# replaced by shutil.move - confirm before running this cell twice.
# import shutil
# 
# FINAL_PATH = Path("/workspace/data/training_data_embedded.parquet")
# BACKUP_PATH = Path("/workspace/data/training_data_no_embeddings.parquet")
# 
# # Backup original
# if FINAL_PATH.exists():
#     shutil.move(FINAL_PATH, BACKUP_PATH)
#     print(f"Backed up original to {BACKUP_PATH}")
# 
# # Rename new file
# shutil.move(OUTPUT_PATH, FINAL_PATH)
# print(f"Renamed to {FINAL_PATH}")

## 12. Summary

In [None]:
# Final recap of the run. The embedding width is read from the loaded model's
# config instead of being hard-coded, so the summary stays correct if
# MODEL_NAME is changed to a model with a different hidden size.
banner = "=" * 60
print(banner)
print("EMBEDDING EXTRACTION COMPLETE")
print(banner)
print("")
print(f"Input:  {INPUT_PATH}")
print(f"Output: {OUTPUT_PATH}")
print("")
print(f"Segments: {len(df)}")
print(f"Embedding dim: {model.config.hidden_size}")
# size_gb comes from the save cell (bytes / 1024**3).
print(f"File size: {size_gb:.2f} GB")
print("")
print("Next step: Run Phase 4 regression training")
print(banner)