# Extract Embeddings for e5small and allmpnet

This notebook is designed to run on Kaggle to extract embeddings using GPU acceleration.

**Models to extract:**
- `e5small`: intfloat/multilingual-e5-small (384d)
- `allmpnet`: sentence-transformers/all-mpnet-base-v2 (768d)

**Instructions:**
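1. Upload your `total_sentence_train.csv` and `total_sentence_test.csv` to Kaggle as a dataset (each file must contain a `TOTAL_SENTENCE` column), then point the paths in the configuration cell at that dataset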
2. Enable GPU accelerator (Settings → Accelerator → GPU)
3. Run all cells
4. Download the generated `.npy` files (named `<model>_<split>.npy`, stored as float16) from the notebook's Output tab

In [None]:
# Install required packages
!pip install -q sentence-transformers

In [None]:
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from pathlib import Path
import gc

# Report the PyTorch build and, when a CUDA device is visible, its name and total memory.
print(f"PyTorch version: {torch.__version__}")
_cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ok}")
if _cuda_ok:
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {_gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# ============================================================================
# CONFIGURATION - Update these paths based on your Kaggle dataset
# ============================================================================

# Directory of the attached Kaggle dataset, e.g. "/kaggle/input/your-dataset-name".
# Both CSV paths below are derived from it, so this is the only line to change
# when the dataset lives somewhere else.
INPUT_DIR = "/kaggle/input/amazon-ml-challenge-data"
TRAIN_DATA_PATH = f"{INPUT_DIR}/total_sentence_train.csv"
TEST_DATA_PATH = f"{INPUT_DIR}/total_sentence_test.csv"

# Output directory (Kaggle working directory)
OUTPUT_DIR = Path("/kaggle/working")

# Models to extract (the two remaining ones).
# Maps the short model key to its Hugging Face model id.
MODELS_TO_EXTRACT = {
    "e5small": "intfloat/multilingual-e5-small",        # 384d
    "allmpnet": "sentence-transformers/all-mpnet-base-v2",  # 768d
}

# Batch size (increase if you have more GPU memory)
BATCH_SIZE = 256  # Kaggle T4 can handle larger batches

In [None]:
def extract_embeddings(texts, model_key, model_path, batch_size=256):
    """
    Encode texts with a SentenceTransformer model and return float16 embeddings.

    The model is loaded on CUDA when available (CPU otherwise), used once, and
    released again before returning -- including when encoding fails -- so that
    a subsequent call can load a different model without the previous one still
    occupying GPU memory.

    Args:
        texts: Iterable of text strings (materialized into a list internally)
        model_key: Short name for the model; a key containing "e5"
            (case-insensitive) triggers the "query: " prefix
        model_path: HuggingFace model path
        batch_size: Batch size for encoding

    Returns:
        Embeddings array (n_texts, dim) in float16, encoded with
        normalize_embeddings=True

    Raises:
        Whatever the model constructor or ``encode`` raises; cleanup still runs
        for failures that happen after the model has been loaded.
    """
    print(f"\n{'='*60}")
    print(f"Loading {model_key}: {model_path}")
    print(f"{'='*60}")

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    # Loaded outside the try block: if construction fails there is no model to release.
    model = SentenceTransformer(model_path, device=device)

    try:
        # E5 models require "query: " prefix for best performance
        if "e5" in model_key.lower():
            print("Adding 'query: ' prefix for E5 model...")
            texts = [f"query: {t}" for t in texts]
        else:
            # Materialize so len() below works for any iterable (e.g. a generator).
            texts = list(texts)

        print(f"Encoding {len(texts):,} texts on {device}...")
        embeddings = model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )
    finally:
        # Always drop the model and return cached GPU blocks, even on error;
        # otherwise a failed run leaves the weights resident for the next model.
        del model
        gc.collect()
        if use_cuda:
            torch.cuda.empty_cache()

    # Convert to float16 to save storage (halves the file size)
    embeddings = embeddings.astype(np.float16)
    print(f"Shape: {embeddings.shape}, dtype: {embeddings.dtype}")

    return embeddings


def save_embeddings(embeddings, model_key, split, output_dir):
    """
    Save embeddings to disk as ``<output_dir>/<model_key>_<split>.npy``.

    Args:
        embeddings: NumPy array to write
        model_key: Short model name, used as the filename prefix
        split: Split label used as the filename suffix (e.g. "train", "test")
        output_dir: Destination directory as a str or Path; created
            (including parents) if it does not exist

    Returns:
        Path of the written .npy file
    """
    # Accept plain strings as well as Path objects.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    filepath = output_dir / f"{model_key}_{split}.npy"
    np.save(filepath, embeddings)

    size_mb = filepath.stat().st_size / (1024 * 1024)
    print(f"✓ Saved: {filepath} ({size_mb:.1f} MB)")
    return filepath

## Load Data

In [None]:
# Read the training CSV and report its row count and column names
print("Loading training data...")
train_df = pd.read_csv(TRAIN_DATA_PATH)
print(f"Train samples: {train_df.shape[0]:,}")
print(f"Columns: {train_df.columns.tolist()}")

# Pull the sentence column into a plain list, replacing missing values with ""
train_texts = train_df.loc[:, "TOTAL_SENTENCE"].fillna("").tolist()
print(f"\nSample text: {train_texts[0][:200]}...")

In [None]:
# Read the test CSV and report its row count
print("Loading test data...")
test_df = pd.read_csv(TEST_DATA_PATH)
print(f"Test samples: {test_df.shape[0]:,}")

# Pull the sentence column into a plain list, replacing missing values with ""
test_texts = test_df.loc[:, "TOTAL_SENTENCE"].fillna("").tolist()

## Extract Embeddings - e5small

In [None]:
# Extract e5small embeddings for TRAIN and write them to OUTPUT_DIR
e5small_train = extract_embeddings(
    train_texts,
    "e5small",
    MODELS_TO_EXTRACT["e5small"],
    batch_size=BATCH_SIZE,
)
save_embeddings(e5small_train, "e5small", "train", OUTPUT_DIR)

# Free memory: the array is on disk now. The trailing ";" keeps the integer
# returned by gc.collect() from being shown as the cell's output.
del e5small_train
gc.collect();

In [None]:
# Extract e5small embeddings for TEST and write them to OUTPUT_DIR
e5small_test = extract_embeddings(
    test_texts,
    "e5small",
    MODELS_TO_EXTRACT["e5small"],
    batch_size=BATCH_SIZE,
)
save_embeddings(e5small_test, "e5small", "test", OUTPUT_DIR)

# Free memory: the array is on disk now. The trailing ";" keeps the integer
# returned by gc.collect() from being shown as the cell's output.
del e5small_test
gc.collect();

## Extract Embeddings - allmpnet

In [None]:
# Extract allmpnet embeddings for TRAIN and write them to OUTPUT_DIR
allmpnet_train = extract_embeddings(
    train_texts,
    "allmpnet",
    MODELS_TO_EXTRACT["allmpnet"],
    batch_size=BATCH_SIZE,
)
save_embeddings(allmpnet_train, "allmpnet", "train", OUTPUT_DIR)

# Free memory: the array is on disk now. The trailing ";" keeps the integer
# returned by gc.collect() from being shown as the cell's output.
del allmpnet_train
gc.collect();

In [None]:
# Extract allmpnet embeddings for TEST and write them to OUTPUT_DIR
allmpnet_test = extract_embeddings(
    test_texts,
    "allmpnet",
    MODELS_TO_EXTRACT["allmpnet"],
    batch_size=BATCH_SIZE,
)
save_embeddings(allmpnet_test, "allmpnet", "test", OUTPUT_DIR)

# Free memory: the array is on disk now. The trailing ";" keeps the integer
# returned by gc.collect() from being shown as the cell's output.
del allmpnet_test
gc.collect();

## Verify Outputs

In [None]:
# List all generated files with their shape, dtype and on-disk size
print("\n" + "="*60)
print("GENERATED FILES")
print("="*60)

# sorted() gives a stable listing; the order glob() yields depends on the filesystem.
for f in sorted(OUTPUT_DIR.glob("*.npy")):
    size_mb = f.stat().st_size / (1024 * 1024)
    # Memory-map instead of loading: only shape and dtype are needed here,
    # so there is no reason to read the whole array into RAM.
    arr = np.load(f, mmap_mode="r")
    print(f"{f.name}: shape={arr.shape}, dtype={arr.dtype}, size={size_mb:.1f} MB")

print("\n" + "="*60)
print("DONE! Download the .npy files from the Output tab.")
print("="*60)

In [None]:
# Optional sanity check: rows of each saved array should have an L2 norm near 1.0
# (the embeddings written by this notebook are normalized before the float16 cast).
print("\nSanity Check - Embedding Norms (should be ~1.0):")
for npy_path in OUTPUT_DIR.glob("*.npy"):
    # Upcast first so the norms are computed in float32 rather than float16.
    vectors = np.load(npy_path).astype(np.float32)
    row_norms = np.linalg.norm(vectors, axis=1)
    mean_norm, std_norm = row_norms.mean(), row_norms.std()
    print(f"{npy_path.name}: mean_norm={mean_norm:.4f}, std={std_norm:.4f}")