In [1]:
import os
import math
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Configuration -----------------------------------------------------------
# Input dataset, output artifact paths, and encoding options used by the cells
# below. Paths are relative to the notebook's working directory.
INPUT_CSV = "../datasets/cleaned_leetcode_dataset.csv"
OUTPUT_EMB = "../datasets/leetcode_embeddings.npy"
MAP_CSV = "../datasets/cleaned_index_map.csv"
MODEL_NAME = "all-mpnet-base-v2"
BATCH_SIZE = 64
NORMALIZE = True  # when True, each embedding row is scaled to unit L2 norm

# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU.
# On a machine without CUDA this resolves exactly as before ("mps" or "cpu").
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

In [3]:
# Report the run configuration before loading the model.
print("Using device:", DEVICE)
print("Loading model:", MODEL_NAME)

# Instantiate the sentence-embedding model on the selected device.
model = SentenceTransformer(
    model_name_or_path=MODEL_NAME,
    device=DEVICE,
)

Using device: mps
Loading model: all-mpnet-base-v2


In [4]:
# Load the cleaned dataset and extract the text column that will be embedded.
df = pd.read_csv(INPUT_CSV)

# Replace missing values with "" before casting: astype(str) alone would turn
# NaN into the literal string "nan", which would then be embedded as if it
# were real text. The list keeps one entry per row of `df`, in row order.
texts = df["cleaned_text"].fillna("").astype(str).tolist()

In [5]:
# Preallocate the output matrix: one float32 row per text, one column per
# embedding dimension reported by the model. Starts out all zeros.
num = len(texts)
dim = model.get_sentence_embedding_dimension()
embs = np.zeros(shape=(num, dim), dtype=np.float32)

In [6]:
# Encode the texts in chunks of BATCH_SIZE and write each chunk's vectors into
# the matching rows of the preallocated `embs` matrix.
for start in tqdm(range(0, num, BATCH_SIZE), desc="Embedding batches"):
    end = min(start + BATCH_SIZE, num)
    batch = texts[start:end]
    with torch.no_grad():
        # Pass batch_size explicitly: encode() re-batches its input internally
        # using its own default, so without this argument BATCH_SIZE would only
        # control how the list is sliced, not the size of the model's forward
        # passes.
        emb = model.encode(
            batch,
            batch_size=BATCH_SIZE,
            device=DEVICE,
            show_progress_bar=False,
            convert_to_numpy=True,
        )
    if NORMALIZE:
        # Scale every row to unit L2 norm. Rows whose norm is exactly zero are
        # divided by 1 instead, which leaves them as zeros and avoids 0/0.
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        emb = emb / norms
    embs[start:end] = emb

Embedding batches: 100%|██████████| 29/29 [01:25<00:00,  2.94s/it]


In [7]:
# Persist the embedding matrix to disk in NumPy's .npy format.
np.save(OUTPUT_EMB, embs)

# Write a two-column CSV ("index", "cleaned_text") built from the DataFrame's
# index and its text column, without an extra pandas index column.
(
    df.reset_index()
    .loc[:, ["index", "cleaned_text"]]
    .to_csv(MAP_CSV, index=False)
)

# Confirm what was written and where.
print(f"Saved embeddings to {OUTPUT_EMB} shape={embs.shape}")
print(f"Saved index map to {MAP_CSV}")

Saved embeddings to ../datasets/leetcode_embeddings.npy shape=(1825, 768)
Saved index map to ../datasets/cleaned_index_map.csv
