**Pinecone**

In [None]:
import os
import time
import pandas as pd
import tiktoken
from typing import List
from tqdm.auto import tqdm
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI

# ---------------------------------------------------------
# 1. Configuration & Constants
# ---------------------------------------------------------
# Set your keys here (or load from environment variables)
os.environ['OPENAI_API_KEY'] = 'ENTER YOUR API KEY HERE'
os.environ['PINECONE_API_KEY'] = 'ENTER YOUR PINECONE API KEY HERE'

# Assignment configuration
FILE_PATH = 'ted_talks_en.csv'
INDEX_NAME = "ted-rag-index"
EMBEDDING_MODEL = "RPRTHPB-text-embedding-3-small"   # LLMod compatible
EMBEDDING_DIMENSIONS = 1536

# --- Updated Parameters per our conversation ---
MAX_CHUNK_SIZE = 512   # Updated to 512 for better semantic context in narrative text
OVERLAP_RATIO = 0.2    # 20% overlap
BATCH_SIZE = 100       # Efficiency batch size

# ---------------------------------------------------------
# 2. Client Initialization
# ---------------------------------------------------------
# LLMod.ai API-compatible client
client = OpenAI(
    api_key=os.environ['OPENAI_API_KEY'],
    base_url="https://api.llmod.ai/v1" 
)

# Pinecone
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# ---------------------------------------------------------
# 3. Helper Functions (Tokenization & Safety)
# ---------------------------------------------------------
def get_text_chunks(text: str, chunk_size: int, overlap_ratio: float) -> List[str]:
    """Splits text into chunks based on token count with overlap."""
    if not isinstance(text, str) or not text:
        return []
        
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    total_tokens = len(tokens)
    
    step = int(chunk_size * (1 - overlap_ratio))
    chunks = []
    
    for i in range(0, total_tokens, step):
        chunk_tokens = tokens[i : i + chunk_size]
        chunk_text = encoding.decode(chunk_tokens)
        chunks.append(chunk_text)
    
    return chunks

def safe_str(val):
    """Safely converts value to string, handling NaNs."""
    return str(val) if pd.notna(val) else ""

def safe_int(val):
    """Safely converts value to int, defaulting to 0."""
    try:
        return int(val) if pd.notna(val) else 0
    except:
        return 0

# ---------------------------------------------------------
# 4. Pinecone Index Setup
# ---------------------------------------------------------
def setup_pinecone_index(index_name: str, dimension: int):
    existing_indexes = [i.name for i in pc.list_indexes()]
    
    if index_name not in existing_indexes:
        print(f"Creating new Pinecone index: {index_name}...")
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-east-1')
        )
        time.sleep(10) # Wait for initialization
    else:
        print(f"Index '{index_name}' already exists. Connecting...")
    
    return pc.Index(index_name)

# ---------------------------------------------------------
# 5. Main ETL Pipeline
# ---------------------------------------------------------
def run_pipeline():
    # Load dataset
    print("Loading dataset...")
    if not os.path.exists(FILE_PATH):
        print(f"Error: File {FILE_PATH} not found.")
        return

    df = pd.read_csv(FILE_PATH)
    print(f"Loaded {len(df)} talks.")

    # Prepare chunks with all metadata
    all_chunks_data = []
    print("Processing text into chunks and extracting metadata...")
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Chunking"):
        # Basic fields
        talk_id = safe_str(row.get('talk_id'))
        title = safe_str(row.get('title'))
        transcript = safe_str(row.get('transcript'))
        
        # Skip if empty transcript
        if not transcript:
            continue
            
        chunks = get_text_chunks(transcript, MAX_CHUNK_SIZE, OVERLAP_RATIO)
        
        for i, chunk_text in enumerate(chunks):
            # Create unique ID
            chunk_id = f"{talk_id}_{i}"
            
            # --- Capture all new fields for Metadata ---
            metadata = {
                "talk_id": talk_id,
                "title": title,
                "chunk_text": chunk_text, 
                "chunk_index": i,
                
                # Extended Metadata from CSV
                "url": safe_str(row.get('url')),
                "speaker": safe_str(row.get('speaker_1')),
                "topics": safe_str(row.get('topics')),
                "views": safe_int(row.get('views')),
                "published_date": safe_str(row.get('published_date')),
                "duration": safe_int(row.get('duration')),
                "event": safe_str(row.get('event')),
                "native_language": safe_str(row.get('native_language'))
            }

            all_chunks_data.append({
                "id": chunk_id,
                "text": chunk_text,
                "metadata": metadata
            })
    
    total_chunks = len(all_chunks_data)
    print(f"Total chunks created: {total_chunks}")

    # Initialize Pinecone
    index = setup_pinecone_index(INDEX_NAME, EMBEDDING_DIMENSIONS)

    # Generate Embeddings and Upsert in Batches
    print("Starting Embedding & Upsert process...")
    for i in tqdm(range(0, total_chunks, BATCH_SIZE), desc="Upserting Batches"):
        batch_slice = all_chunks_data[i : i + BATCH_SIZE]
        batch_texts = [item['text'] for item in batch_slice]
        
        try:
            # 1. Generate Embeddings
            response = client.embeddings.create(
                input=batch_texts,
                model=EMBEDDING_MODEL
            )
            embeddings = [data.embedding for data in response.data]
            
            # 2. Prepare vectors
            vectors_to_upsert = []
            for j, item in enumerate(batch_slice):
                vectors_to_upsert.append({
                    "id": item['id'],
                    "values": embeddings[j],
                    "metadata": item['metadata']
                })
            
            # 3. Upsert to Pinecone
            index.upsert(vectors=vectors_to_upsert)
            
        except Exception as e:
            print(f"Error processing batch starting at index {i}: {e}")

    print("\nPipeline completed successfully! Data is ready in Pinecone.")

# ---------------------------------------------------------
# Execution
# ---------------------------------------------------------
if __name__ == "__main__":
    run_pipeline()

Loading dataset...


  df = pd.read_csv(FILE_PATH)


Loaded 4016 talks.
Processing text into chunks and extracting metadata...


Chunking: 100%|██████████| 4016/4016 [00:07<00:00, 550.36it/s]


Total chunks created: 23549
Creating new Pinecone index: ted-rag-index...
Starting Embedding & Upsert process...


Upserting Batches:  99%|█████████▉| 234/236 [31:00<00:17,  8.81s/it]

Error processing batch starting at index 23300: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Sun, 14 Dec 2025 11:31:43 GMT', 'Content-Type': 'application/json', 'Content-Length': '173', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '2189', 'x-pinecone-request-id': '539710761821096387', 'x-envoy-upstream-service-time': '32', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Vector ID must be ASCII, but got 'a lot of cases in children — is that because the children get infected but they don't get symptoms_0'","details":[]}



Upserting Batches: 100%|██████████| 236/236 [31:11<00:00,  7.93s/it]



Pipeline completed successfully! Data is ready in Pinecone.
