# *Imports*

In [14]:
import os, json, math, time, hashlib
import numpy as np
import pandas as pd
import requests
from pinecone import Pinecone, ServerlessSpec
import os
from dotenv import load_dotenv
load_dotenv()

True

# *Configurations*

In [15]:
# Base URL of the embeddings API; embed_texts_batch appends "/embeddings" to it.
BASE_URL = "https://api.llmod.ai/v1"  
# Model identifier sent as "model" in every embeddings request.
EMBED_MODEL = "RPRTHPB-text-embedding-3-small"
# Vector length: sent as "dimensions" in every embeddings request and used as
# the dimension of the Pinecone index, so both sides must agree on this value.
EMBED_DIMS = 1536

In [16]:
# Chunking restrictions
MAX_TOKENS = 2048
MAX_OVERLAP_RATIO = 0.30

# No tokenizer is used in this notebook, so token counts are approximated from
# word counts: roughly 1 word ≈ 1.3 tokens, hence max_words ≈ MAX_TOKENS / 1.3.
MAX_WORDS = int(MAX_TOKENS / 1.3)
OVERLAP_WORDS = int(MAX_WORDS * MAX_OVERLAP_RATIO)

# Budget-safe dev: start small, then scale up
START_WITH_N_TALKS = 5  # set None to process every talk

# Output artifact files (cached embeddings)
OUT_META_JSONL = "ted_chunks_meta.jsonl"
OUT_EMB_NPY = "ted_chunks_embeds.npy"
OUT_IDMAP_JSON = "ted_chunks_idmap.json"  # chunk ids already embedded, to avoid duplicates across runs

# API key: read from the environment only (load_dotenv() in the imports cell
# also picks it up from a local .env file). Do not paste the key into the
# notebook -- it would be saved with the file and leak when the notebook is shared.
# Surrounding whitespace is stripped so a stray newline cannot corrupt the header.
LLMOD_API_KEY = (os.getenv("LLMOD_API_KEY") or "").strip()
if not LLMOD_API_KEY:
    raise ValueError("Set env var LLMOD_API_KEY (for example in a .env file next to this notebook).")

# Headers sent with every embeddings request.
HEADERS = {"Authorization": f"Bearer {LLMOD_API_KEY}", "Content-Type": "application/json"}


In [17]:
# Pinecone credentials come from the environment, like the embeddings key.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise ValueError("Set env var PINECONE_API_KEY.")

# Name of the Pinecone index that stores the chunk vectors.
INDEX_NAME = "ted"

# EMBED_DIMS is intentionally NOT redefined here: the single definition in the
# configuration cell is the source of truth for both the embeddings request and
# the index dimension, so the two cannot drift apart.

pc = Pinecone(api_key=PINECONE_API_KEY)

# *Chunks*

In [4]:
def stable_hash(text: str) -> str:
    """Return the SHA-256 hex digest of ``text`` encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

def approx_word_chunks(text: str, max_words: int, overlap_words: int):
    """
    Split ``text`` into overlapping windows of whitespace-separated words.

    Each window holds at most ``max_words`` words and consecutive windows
    start ``max_words - overlap_words`` words apart, so neighbours share
    ``overlap_words`` words. Words are re-joined with single spaces.

    Returns a list of chunk strings (empty when ``text`` has no words).
    Raises ValueError when ``overlap_words >= max_words``.
    """
    tokens = text.split()
    total = len(tokens)
    if total == 0:
        return []

    stride = max_words - overlap_words
    if stride <= 0:
        raise ValueError("overlap_words too large; step must be > 0.")

    pieces = []
    for begin in range(0, total, stride):
        stop = min(begin + max_words, total)
        piece = " ".join(tokens[begin:stop]).strip()
        if piece:
            pieces.append(piece)
        # The window reached the end of the text: later windows would only
        # repeat a suffix of this one.
        if stop == total:
            break
    return pieces

def embed_texts_batch(texts, model=EMBED_MODEL, dims=EMBED_DIMS, max_retries=6):
    """
    Embed a batch of texts via the llmod.ai embeddings endpoint (OpenAI-compatible).

    Parameters
    ----------
    texts : list of str
        Texts to embed in a single request.
    model : str
        Model identifier sent in the request.
    dims : int
        Value sent as "dimensions" in the request.
    max_retries : int
        Maximum number of request attempts.

    Returns
    -------
    np.ndarray of float32, one row per input text. An empty input returns a
    (0, dims) array without calling the API.

    Raises
    ------
    RuntimeError
        On a non-retryable HTTP status, on a response whose embedding count
        does not match the input, or when every attempt failed (the message
        carries the last error seen).
    """
    # Nothing to embed: skip the request (np.vstack([]) would fail anyway).
    if len(texts) == 0:
        return np.zeros((0, dims), dtype=np.float32)

    url = f"{BASE_URL}/embeddings"
    payload = {
        "model": model,
        "input": texts,
        "dimensions": dims,  # keep aligned with the model default (1536)
    }

    # Always defined, so the final raise can report it even when every attempt
    # ended on a retryable HTTP status rather than on a network exception.
    last_err = None

    for attempt in range(max_retries):
        try:
            r = requests.post(url, headers=HEADERS, data=json.dumps(payload), timeout=60)
            if r.status_code == 200:
                data = r.json()
                # OpenAI-style: data["data"] is a list of {embedding: [...], index: i}.
                # Sort by index so rows line up with the inputs; the sort is stable,
                # so a response without "index" keeps its original order.
                items = sorted(data["data"], key=lambda item: item.get("index", 0))
                embs = [np.array(item["embedding"], dtype=np.float32) for item in items]
                expected = 1 if isinstance(texts, str) else len(texts)
                if len(embs) != expected:
                    raise RuntimeError(
                        f"Embeddings response has {len(embs)} vectors for {expected} inputs."
                    )
                return np.vstack(embs)
            # Anything other than rate limiting / transient server issues: hard fail with details
            if r.status_code not in (429, 500, 502, 503, 504):
                raise RuntimeError(f"Embeddings error {r.status_code}: {r.text[:500]}")
            last_err = f"HTTP {r.status_code}: {r.text[:500]}"
        except requests.RequestException as e:
            last_err = e
        # Exponential backoff (capped at 30s); no point sleeping after the final attempt.
        if attempt < max_retries - 1:
            time.sleep(min(2 ** attempt, 30))
    raise RuntimeError(f"Embeddings failed after retries. Last error: {last_err}")

In [18]:
# 1) Load the dataset CSV
# CSV_PATH must point at a local copy of the TED talks CSV; adjust it to
# wherever the file lives on your machine.
CSV_PATH = "ted_talks_en.csv"  # <-- change this

df = pd.read_csv(CSV_PATH)

# Optional budget guard: keep only the first START_WITH_N_TALKS rows
if START_WITH_N_TALKS is not None:
    df = df.iloc[:START_WITH_N_TALKS].copy()

df.columns

Index(['talk_id', 'title', 'speaker_1', 'all_speakers', 'occupations',
       'about_speakers', 'views', 'recorded_date', 'published_date', 'event',
       'native_lang', 'available_lang', 'comments', 'duration', 'topics',
       'related_talks', 'url', 'description', 'transcript'],
      dtype='object')

In [6]:
# 2) Build chunk records (metadata + text)
records = []
for _, row in df.iterrows():
    # talk_id and title are stringified exactly as before so chunk_uid values
    # stay compatible with caches written by earlier runs.
    talk_id = str(row.get("talk_id", ""))
    title = str(row.get("title", ""))

    # A missing transcript is read by pandas as NaN, which is truthy: the old
    # `str(value or "")` turned it into the literal text "nan" and embedded it.
    # Treat missing values as empty so the talk is skipped instead.
    raw_transcript = row.get("transcript", "")
    transcript = "" if pd.isna(raw_transcript) else str(raw_transcript or "")
    if not transcript.strip():
        continue

    chunks = approx_word_chunks(transcript, max_words=MAX_WORDS, overlap_words=OVERLAP_WORDS)

    for ci, chunk_text in enumerate(chunks):
        # The id is derived from the content, so the same chunk gets the same id on every run.
        chunk_uid = stable_hash(talk_id + "|" + title + "|" + str(ci) + "|" + chunk_text)
        records.append({
            "chunk_uid": chunk_uid,
            "talk_id": talk_id,
            "title": title,
            "chunk_id": ci,
            "text": chunk_text,
        })

len(records), records[0].keys() if records else None

(14, dict_keys(['chunk_uid', 'talk_id', 'title', 'chunk_id', 'text']))

In [19]:
# 3) Load previous cache (so we don't re-embed)
# `seen` holds the chunk_uid of every chunk embedded by an earlier run.
seen = set()
if os.path.exists(OUT_IDMAP_JSON):
    with open(OUT_IDMAP_JSON, "r", encoding="utf-8") as f:
        seen.update(json.load(f))

# Only chunks whose id is not cached yet need an embeddings call.
new_records = [rec for rec in records if rec["chunk_uid"] not in seen]
print("Total records:", len(records))
print("Already embedded:", len(records) - len(new_records))
print("To embed now:", len(new_records))

Total records: 0
Already embedded: 0
To embed now: 0


In [20]:
# 4) Embed new chunks in batches + append to cache
BATCH_SIZE = 64  # you can tune; smaller batches can be safer

# Load existing embeddings if present
if os.path.exists(OUT_EMB_NPY):
    old_embs = np.load(OUT_EMB_NPY)
else:
    old_embs = None

new_emb_list = []
to_write_meta = []

# The persistence step lives in `finally` so that batches which were already
# embedded (and paid for) are cached even when a later batch fails or the run
# is interrupted; the error still propagates after the cache is written.
try:
    for i in range(0, len(new_records), BATCH_SIZE):
        batch = new_records[i:i+BATCH_SIZE]
        texts = [b["text"] for b in batch]
        embs = embed_texts_batch(texts)

        # Sanity check dimensions
        if embs.shape[1] != EMBED_DIMS:
            raise ValueError(f"Unexpected embedding dims: {embs.shape[1]} (expected {EMBED_DIMS})")
        # Row i of the matrix must describe record i of the batch; a count
        # mismatch would silently misalign embeddings and metadata.
        if embs.shape[0] != len(batch):
            raise ValueError(f"Unexpected embedding count: {embs.shape[0]} (expected {len(batch)})")

        new_emb_list.append(embs)
        to_write_meta.extend(batch)

        print(f"Embedded {min(i+BATCH_SIZE, len(new_records))}/{len(new_records)}")
finally:
    # Append embeddings
    if new_emb_list:
        new_embs = np.vstack(new_emb_list)
        all_embs = new_embs if old_embs is None else np.vstack([old_embs, new_embs])
    else:
        all_embs = old_embs if old_embs is not None else np.zeros((0, EMBED_DIMS), dtype=np.float32)

    # Save embeddings matrix
    np.save(OUT_EMB_NPY, all_embs)

    # Append metadata (jsonl), in the same order as the rows appended above
    if to_write_meta:
        with open(OUT_META_JSONL, "a", encoding="utf-8") as f:
            for r in to_write_meta:
                f.write(json.dumps({
                    "chunk_uid": r["chunk_uid"],
                    "talk_id": r["talk_id"],
                    "title": r["title"],
                    "chunk_id": r["chunk_id"],
                    "text": r["text"],
                }, ensure_ascii=False) + "\n")

    # Update seen set and save
    seen.update(r["chunk_uid"] for r in to_write_meta)
    with open(OUT_IDMAP_JSON, "w", encoding="utf-8") as f:
        json.dump(sorted(seen), f)

print("Done.")
print("Embeddings shape:", all_embs.shape)
print("Saved:", OUT_EMB_NPY, OUT_META_JSONL, OUT_IDMAP_JSON)


Done.
Embeddings shape: (14, 1536)
Saved: ted_chunks_embeds.npy ted_chunks_meta.jsonl ted_chunks_idmap.json


# Pinecone Index

In [11]:
# Names of the indexes reported by the Pinecone client.
existing = [idx["name"] for idx in pc.list_indexes()]

# Create the index only when no index with this name exists, so re-running the
# cell does not try to create it a second time.
if INDEX_NAME not in existing:
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBED_DIMS,  # the upsert cell checks every vector against this same constant
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        )
    )

# Handle used by the upsert cell.
# NOTE(review): whether create_index returns only once the index is ready
# depends on the installed SDK version -- confirm before relying on it.
index = pc.Index(INDEX_NAME)
print("Ready:", INDEX_NAME)

Ready: ted


In [23]:
# ====== CONTINUE HERE: Upsert cached vectors into Pinecone ======

OUT_PINECONE_IDMAP_JSON = "pinecone_upserted_ids.json"

# Cached embedding matrix (one row per chunk)
if not os.path.exists(OUT_EMB_NPY):
    raise FileNotFoundError(f"Missing embeddings file: {OUT_EMB_NPY}")
embs = np.load(OUT_EMB_NPY)

# Cached metadata: one JSON object per non-blank line, expected to be in the
# same order in which the embedding rows were appended
if not os.path.exists(OUT_META_JSONL):
    raise FileNotFoundError(f"Missing metadata file: {OUT_META_JSONL}")

with open(OUT_META_JSONL, "r", encoding="utf-8") as f:
    meta_rows = [json.loads(entry) for entry in map(str.strip, f) if entry]

# Row i of `embs` is paired with meta_rows[i] below, so the counts must agree.
if len(meta_rows) != embs.shape[0]:
    raise ValueError(
        f"Mismatch: meta rows={len(meta_rows)} vs embeddings={embs.shape[0]}. "
        "These must be aligned (same append order)."
    )

print("Loaded embeddings:", embs.shape)
print("Loaded meta rows:", len(meta_rows))

# Ids recorded as upserted by a previous run (local cache to avoid re-upserting)
upserted = set()
if os.path.exists(OUT_PINECONE_IDMAP_JSON):
    with open(OUT_PINECONE_IDMAP_JSON, "r", encoding="utf-8") as f:
        upserted.update(json.load(f))

print("Already upserted IDs (local cache):", len(upserted))

def safe_metadata(m: dict) -> dict:
    """
    Build the metadata dict stored alongside a vector.

    Keeps only talk_id and title (as strings), chunk_id (as int) and the
    first 2000 characters of the chunk text, so the payload uses simple
    types and stays small. Missing keys fall back to "" / 0.
    """
    raw_text = m.get("text", "")
    if not raw_text:
        raw_text = ""
    # Truncate to a snippet to avoid oversized metadata; adjust if needed.
    snippet = raw_text[:2000]

    cleaned = {}
    cleaned["talk_id"] = str(m.get("talk_id", ""))
    cleaned["title"] = str(m.get("title", ""))
    cleaned["chunk_id"] = int(m.get("chunk_id", 0))
    cleaned["text"] = snippet
    return cleaned

# Build items to upsert (skip already upserted IDs)
items = []
for i, m in enumerate(meta_rows):
    _id = m["chunk_uid"]
    if _id in upserted:
        continue

    # Row i of the embedding matrix belongs to metadata row i.
    vec = embs[i]
    if vec.shape[0] != EMBED_DIMS:
        raise ValueError(f"Bad dims at row {i}: got {vec.shape[0]} expected {EMBED_DIMS}")

    items.append((_id, vec.tolist(), safe_metadata(m)))

print("To upsert now:", len(items))

# Upsert in batches
UPSERT_BATCH = 100  # safe default; you can set 50 if you prefer

# The cache is written in `finally` so that batches which were upserted before
# a failure are still recorded locally; the error propagates afterwards.
try:
    for start in range(0, len(items), UPSERT_BATCH):
        batch = items[start:start+UPSERT_BATCH]
        index.upsert(vectors=batch)

        # update local cache
        upserted.update(item[0] for item in batch)

        # Report every 5th batch, and always the final one so the last line
        # shows the full count.
        done = min(start + UPSERT_BATCH, len(items))
        if (start // UPSERT_BATCH) % 5 == 0 or done == len(items):
            print(f"Upserted {done}/{len(items)}")
finally:
    # Persist local upsert cache
    with open(OUT_PINECONE_IDMAP_JSON, "w", encoding="utf-8") as f:
        json.dump(sorted(upserted), f)

print("✅ Upsert complete.")
print("Local upsert cache saved to:", OUT_PINECONE_IDMAP_JSON)

# Optional: quick sanity check by describing index stats.
# Deliberately best-effort: a failure here is reported but does not fail the cell.
try:
    stats = index.describe_index_stats()
    print("Index stats:", stats)
except Exception as e:
    print("describe_index_stats failed (non-fatal):", repr(e))


Loaded embeddings: (14, 1536)
Loaded meta rows: 14
Already upserted IDs (local cache): 0
To upsert now: 14
Upserted 14/14
✅ Upsert complete.
Local upsert cache saved to: pinecone_upserted_ids.json
Index stats: {'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '184',
                                    'content-type': 'application/json',
                                    'date': 'Fri, 26 Dec 2025 22:47:39 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '91',
                                    'x-pinecone-request-id': '1732630235628279066',
                                    'x-pinecone-request-latency-ms': '91',
                                    'x-pinecone-response-duration-ms': '93'}},
 'dimension': 1536,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'n