# Phase 3.0: DeBERTa Embedding Extraction

Extracts DeBERTa-v3-base embeddings for all training segments.

**What this does:**
- Embeds `concept` (song-level text, shared across all tracks in a song) -> 768-dim
- Embeds `lyric_text` (segment-level lyrics, per-segment) -> 768-dim
- Instrumental segments (no lyrics) get a zero vector for `lyric_embedding`

**Prerequisites:**
- `/workspace/data/training_segments_metadata.parquet` (0.6 MB metadata — NOT the 15 GB media file)
- GPU instance (RTX 4090 recommended)

**Output:**
- `/workspace/data/training_data_with_embeddings.parquet` with `concept_embedding` and `lyric_embedding` columns

**Expected:** 11,605 segments, ~5-10 minutes on RTX 4090

## 1. Install Dependencies

In [None]:
# Install dependencies (safetensors avoids torch.load security issue)
!pip install -q transformers pandas pyarrow tqdm sentencepiece tiktoken protobuf safetensors

## 2. Check GPU

In [None]:
import torch

print(f"PyTorch version: {torch.__version__}")

# Ask for CUDA availability once and reuse the answer for the report and the branch.
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")

if not has_cuda:
    print("WARNING: No GPU detected. Embedding extraction will be slow.")
else:
    # Report the name and total memory of device 0.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memory: {total_bytes / 1e9:.1f} GB")

## 3. Configuration

In [None]:
from pathlib import Path

# --- Model -----------------------------------------------------------------
# Text encoder checkpoint - must match Phase 2/4 training
MODEL_NAME = "microsoft/deberta-v3-base"

# Token limit handed to the tokenizer (longer texts are truncated)
MAX_LENGTH = 512

# Texts per forward pass - reduce if OOM
BATCH_SIZE = 32

# --- Files -----------------------------------------------------------------
# Read the metadata parquet (0.6 MB) — do NOT load the media parquet (15 GB binary audio)
INPUT_PATH = Path("/workspace/data/training_segments_metadata.parquet")

# Destination for the metadata plus the embedding columns
OUTPUT_PATH = Path("/workspace/data/training_data_with_embeddings.parquet")

# Echo the effective settings so they are recorded in the notebook output
print(f"Input: {INPUT_PATH}")
print(f"Output: {OUTPUT_PATH}")
print(f"Model: {MODEL_NAME}")
print(f"Batch size: {BATCH_SIZE}")

## 4. Load Data

In [None]:
import pandas as pd

print(f"Loading {INPUT_PATH}...")
df = pd.read_parquet(INPUT_PATH)

print(f"Loaded {len(df)} segments")
print(f"Columns: {len(df.columns)}")

# Count non-null entries per text column; a column that is absent counts as zero
if 'concept' in df.columns:
    has_concept = df['concept'].notna().sum()
else:
    has_concept = 0

if 'lyric_text' in df.columns:
    has_lyrics = df['lyric_text'].notna().sum()
else:
    has_lyrics = 0

print(f"\nConcept coverage: {has_concept}/{len(df)} ({100*has_concept/len(df):.1f}%)")
print(f"Lyric text coverage: {has_lyrics}/{len(df)} ({100*has_lyrics/len(df):.1f}%)")

# Warn when the input already carries embedding columns from an earlier run
for col in ('concept_embedding', 'lyric_embedding'):
    if col not in df.columns:
        continue
    print(f"\nWARNING: '{col}' already exists — you may want to skip re-extraction.")

# Show which rainbow colors are present, when that column is available
if 'rainbow_color' in df.columns:
    colors_present = sorted(df['rainbow_color'].unique())
    print(f"\nColors: {colors_present}")

## 5. Prepare Text Data

Two text columns to embed:
- **concept**: Song-level narrative (same for all segments of a song). 100% coverage.
- **lyric_text**: Segment-level lyrics. ~90% for vocal tracks, 0% for instrumentals (Green, some Yellow/Red).

In [None]:
# Song-level concept text: missing values become "", everything is coerced to str
concept_texts = (
    df['concept']
    .fillna("")
    .astype(str)
    .tolist()
)

# Segment-level lyric text: same normalisation; instrumentals end up as ""
lyric_texts = (
    df['lyric_text']
    .fillna("")
    .astype(str)
    .tolist()
)

# A text counts as non-empty when something remains after stripping whitespace
concept_non_empty = sum(bool(text.strip()) for text in concept_texts)
lyric_non_empty = sum(bool(text.strip()) for text in lyric_texts)

print(f"Concept texts: {concept_non_empty}/{len(concept_texts)} non-empty")
print(f"Lyric texts:   {lyric_non_empty}/{len(lyric_texts)} non-empty")
print(f"Lyric-empty segments will get zero vectors for lyric_embedding")

## 6. Load Model

In [None]:
from transformers import AutoTokenizer, AutoModel

# Prefer the GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Loading model (using safetensors)...")
# Load safetensors weights to avoid the torch.load security issue,
# then move the model to the chosen device and switch it to eval mode.
model = AutoModel.from_pretrained(MODEL_NAME, use_safetensors=True).to(device)
model.eval()

print("\nModel loaded!")
print(f"Hidden size: {model.config.hidden_size}")
param_count = sum(weights.numel() for weights in model.parameters())
print(f"Parameters: {param_count:,}")

## 7. Extract Embeddings

Runs two passes: one for `concept`, one for `lyric_text`.
Each embedding is the final-layer hidden state of the first (CLS) token.
Empty lyric texts produce zero vectors (instrumental segments).

In [None]:
from tqdm import tqdm
import numpy as np

# Nominal embedding width of DeBERTa-v3-base. extract_embeddings() reads the
# actual width from the loaded model, so this constant is informational.
HIDDEN_SIZE = 768


def extract_embeddings(texts, label=""):
    """Embed each text as the encoder's first-token (CLS) hidden state.

    Texts that are empty or whitespace-only are never sent through the model;
    they get an all-zero vector instead (used for instrumental segments).

    Relies on the notebook globals ``tokenizer``, ``model``, ``device``,
    ``BATCH_SIZE`` and ``MAX_LENGTH``.

    Args:
        texts: List of strings (no ``None`` entries), one per segment.
        label: Short name shown in the progress bar and the log lines.

    Returns:
        A list with one 1-D ``float32`` NumPy array per input text, in input
        order, each of length ``model.config.hidden_size``.
    """
    # Take the width from the loaded model so the zero rows always match the
    # real embeddings, even if MODEL_NAME points at a different-sized model.
    hidden_size = model.config.hidden_size

    embeddings = []
    num_batches = (len(texts) + BATCH_SIZE - 1) // BATCH_SIZE

    print(f"Extracting {label} embeddings for {len(texts)} texts in {num_batches} batches...")

    with torch.no_grad():
        for start in tqdm(range(0, len(texts), BATCH_SIZE), desc=label):
            batch_texts = texts[start:start + BATCH_SIZE]

            # Start from zeros; only rows with real text are overwritten below.
            batch_embeddings = np.zeros((len(batch_texts), hidden_size), dtype=np.float32)
            non_empty_indices = [j for j, text in enumerate(batch_texts) if text.strip()]

            if non_empty_indices:
                # padding=True pads to the longest text in this batch rather
                # than always to MAX_LENGTH. Padded positions are masked out,
                # so the CLS vector is the same up to float rounding, while
                # short texts no longer cost a full MAX_LENGTH-token pass.
                encoded = tokenizer(
                    [batch_texts[j] for j in non_empty_indices],
                    max_length=MAX_LENGTH,
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                )

                outputs = model(
                    input_ids=encoded["input_ids"].to(device),
                    attention_mask=encoded["attention_mask"].to(device),
                )

                # Position 0 of the last layer is the CLS token.
                cls_vectors = outputs.last_hidden_state[:, 0, :].cpu().numpy()

                # Scatter the real embeddings back to their original rows.
                batch_embeddings[non_empty_indices] = cls_vectors

            embeddings.extend(batch_embeddings)

    # Release cached GPU blocks once per pass. Doing this after every batch
    # only forces the allocator to re-acquire the same memory each iteration.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"  Done: {len(embeddings)} embeddings")
    return embeddings


# Pass 1: song-level concept text. Pass 2: segment-level lyric text.
concept_embeddings = extract_embeddings(concept_texts, label="concept")
lyric_embeddings = extract_embeddings(lyric_texts, label="lyric")

# Sanity check: how many lyric vectors are (numerically) all zeros
zero_lyrics = len([vec for vec in lyric_embeddings if np.allclose(vec, 0)])
print(f"\nZero-vector lyric embeddings (instrumental): {zero_lyrics}/{len(lyric_embeddings)}")

## 8. Add Embeddings to DataFrame

In [None]:
print("Adding embeddings to dataframe...")

# One vector per row, in the same order the texts were read from df
df['concept_embedding'] = concept_embeddings
df['lyric_embedding'] = lyric_embeddings

# True where the lyric embedding came from real text, False for zero vectors
lyric_present = [bool(text.strip()) for text in lyric_texts]
df['has_lyric_embedding'] = lyric_present

print(f"Columns: {len(df.columns)}")
print(f"has_lyric_embedding: {df['has_lyric_embedding'].sum()}/{len(df)}")

## 9. Save to Parquet

In [None]:
print(f"Saving to {OUTPUT_PATH}...")

# Create the destination folder if it is not there yet, then write the table
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(OUTPUT_PATH)

# Report the on-disk size (bytes divided by 1024**3)
bytes_on_disk = OUTPUT_PATH.stat().st_size
size_gb = bytes_on_disk / (1024**3)
print(f"\nSaved! File size: {size_gb:.2f} GB")

## 10. Verify Output

In [None]:
print("Verifying saved file...")

# Read the file back from disk rather than trusting the in-memory frame
df_check = pd.read_parquet(OUTPUT_PATH)

print(f"Rows: {len(df_check)}")
print(f"Columns: {len(df_check.columns)}")

# Inspect the first row of each embedding column that made it into the file
for col in ('concept_embedding', 'lyric_embedding'):
    if col not in df_check.columns:
        continue
    emb = np.array(df_check[col].iloc[0])
    print(f"\n{col}:")
    print(f"  Shape: {emb.shape}")
    print(f"  Dtype: {emb.dtype}")
    print(f"  Sample: {emb[:5]}")

if 'has_lyric_embedding' in df_check.columns:
    print(f"\nhas_lyric_embedding: {df_check['has_lyric_embedding'].sum()}/{len(df_check)}")

if 'rainbow_color' in df_check.columns:
    colors_saved = sorted(df_check['rainbow_color'].unique())
    print(f"Colors: {colors_saved}")

print("\nVerification complete!")

## 11. Rename Output (Optional)

If you want to replace the original file:

In [None]:
# Uncomment to rename
# import shutil
# 
# FINAL_PATH = INPUT_PATH  # the metadata parquet this notebook loaded — never the 15 GB media parquet
# BACKUP_PATH = Path("/workspace/data/training_data_no_embeddings.parquet")
# 
# # Backup original
# if FINAL_PATH.exists():
#     shutil.move(FINAL_PATH, BACKUP_PATH)
#     print(f"Backed up original to {BACKUP_PATH}")
# 
# # Rename new file
# shutil.move(OUTPUT_PATH, FINAL_PATH)
# print(f"Renamed to {FINAL_PATH}")

## 12. Summary

In [None]:
# Closing report: paths, row count, embedding counts and output size
banner = "=" * 60

print(banner)
print("PHASE 3.0: EMBEDDING EXTRACTION COMPLETE")
print(banner)
print("")
print(f"Input:  {INPUT_PATH}")
print(f"Output: {OUTPUT_PATH}")
print("")
print(f"Segments: {len(df)}")
print(f"Concept embeddings: {len(concept_embeddings)} x 768")

# Split the lyric rows into real embeddings vs zero-vector placeholders
real_lyric_rows = df['has_lyric_embedding'].sum()
zero_lyric_rows = (~df['has_lyric_embedding']).sum()
print(f"Lyric embeddings:   {len(lyric_embeddings)} x 768 ({real_lyric_rows} real, {zero_lyric_rows} zero)")

print(f"File size: {size_gb:.2f} GB")
print("")
print("Next: Phase 3.1/3.2 — multimodal fusion (CLAP audio + MIDI CNN + text)")
print(banner)