In [8]:
import os
import json
import time
import uuid
from pathlib import Path
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# Transcript Processing Pipeline

In [12]:
def _match_video_id(stem, known_ids):
    """Return the video ID that a transcript file stem starts with.

    Several video IDs contain underscores themselves, so cutting the stem at
    the first underscore would truncate them. Known IDs are therefore matched
    as a whole prefix first; only stems that match no known ID fall back to
    the "text before the first underscore" rule.

    Args:
        stem: File name without extension.
        known_ids: Iterable of known video IDs.

    Returns:
        The matching known ID, or the text before the first underscore.
    """
    # Longest IDs first so an ID that happens to prefix another cannot shadow it.
    for candidate in sorted(known_ids, key=len, reverse=True):
        if stem == candidate or stem.startswith(candidate + "_"):
            return candidate
    return stem.split('_')[0]


def preprocess_transcripts(transcripts_dir="transcripts", output_json="pinecone_data.json"):
    """Split transcript text files into chunks and dump them as JSON records.

    Every non-empty ``*.txt`` file in ``transcripts_dir`` is wrapped in a
    ``Document``, split with a ``RecursiveCharacterTextSplitter`` (800-character
    chunks, 80-character overlap) and written to ``output_json`` as a list of
    ``{"id", "text", "metadata"}`` dicts. A short summary is printed at the end.

    Args:
        transcripts_dir: Folder containing the ``*.txt`` transcripts.
        output_json: Path of the JSON file to write.

    Raises:
        FileNotFoundError: If the folder does not exist or holds no ``.txt`` files.
    """
    transcripts_path = Path(transcripts_dir)
    if not transcripts_path.exists():
        raise FileNotFoundError(f"Transcript folder not found: {transcripts_path}")

    # Sorted so the chunk order in the output does not depend on directory listing order.
    transcript_files = sorted(transcripts_path.glob("*.txt"))
    if not transcript_files:
        raise FileNotFoundError(f"No .txt files found in {transcripts_path}")

    # Map video IDs to topics
    topic_map = {
        "4jEad6zxaFk": "The Great Pyramids",
        "BR2ZMj3o5EU": "The Great Pyramids",
        "9yD9GxzKd_Q": "The Great Pyramids",
        "vJucA4FOTSI": "The Great Pyramids",
        "25sBBCPeRvY": "The Great Pyramids",
        "k3QiW0gEpYM": "The Great Pyramids",
        "Fo8issWL-tI": "The Great Pyramids",
        "Us2v5O5EkZM": "Roman Forum",
        # NOTE(review): this key is 10 characters while every other key is 11 —
        # possibly a typo; verify against the actual file name.
        "OWHkpLVskk": "Roman Forum",
        "k4P5W1DKTBI": "Roman Forum",
        "evmyQGmuzqA": "Roman Forum",
        "MGd3BVW3vYs": "Roman Forum",
        "CVyuqIoB7qg": "Roman Forum",
        "zxKPjD8urG4": "Roman Forum",
        "ClUdLS-UAZ0": "Ancient Greece",
        "Y_B0bh7MXgI": "Ancient Greece",
        "OmH4FDs3yl0": "Ancient Greece",
        "sFU-rJXQlxI": "Ancient Greece",
        "Mk-OyRI7c7Q": "Ancient Greece",
        "4M-4M4LyUB0": "Ancient Greece",
        "VB_WUMtdjpU": "Machu Picchu",
        "JMAKRKkdOlw": "Machu Picchu",
        "Z94cpOLx_JQ": "Machu Picchu",
        "ZR_mEpS4Tvw": "Machu Picchu",
        "naEFqvUiU0c": "Machu Picchu",
        "Q-mAWItV2q0": "Mesopotamia",
        "fBOD64ow5eo": "Mesopotamia",
        "9q7r0XZUyAk": "Sangam Tamil",
        "GgalhomEIcc": "Sangam Tamil",
    }

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    all_chunks = []

    for file_path in transcript_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()

        # Empty / whitespace-only transcripts produce no chunks.
        if not content:
            continue

        # Resolve the video ID against the known IDs (they may contain underscores);
        # files that match no known ID get the "Unknown" topic.
        video_id = _match_video_id(file_path.stem, topic_map)
        video_title = file_path.stem.replace('_transcript', '').replace('_', ' ')
        topic = topic_map.get(video_id, "Unknown")

        doc = Document(
            page_content=content,
            metadata={
                "source": file_path.name,
                "video_title": video_title,
                "file_path": str(file_path),
                "topic": topic
            }
        )

        chunks = text_splitter.split_documents([doc])

        # Tag each chunk with its position inside the source transcript.
        for i, chunk in enumerate(chunks):
            chunk.metadata.update({
                "chunk_id": f"{file_path.stem}_chunk_{i}",
                "chunk_index": i,
                "total_chunks": len(chunks)
            })

        all_chunks.extend(chunks)

    # Record ids are random UUIDs, so every run generates a fresh set of ids.
    pinecone_records = [{
        "id": str(uuid.uuid4()),
        "text": chunk.page_content,
        "metadata": chunk.metadata
    } for chunk in all_chunks]

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(pinecone_records, f, indent=2, ensure_ascii=False)

    print(f"✅ Preprocessed {len(transcript_files)} files into {len(all_chunks)} chunks")
    print(f"📁 Saved to {output_json}")
                  
            


# Run the full pipeline
# NOTE(review): run_pipeline is not defined in the cells visible here; it presumably
# wraps preprocess_transcripts (the recorded output matches that function's prints) —
# confirm its defining cell runs before this call on a fresh kernel.
run_pipeline()

✅ Preprocessed 28 files into 1437 chunks
📁 Saved to pinecone_data.json
