### Chunking & Embedding: Transcript Preparation for YouTube QA Bot
This notebook splits the transcript of a YouTube video into overlapping chunks, attaches metadata to each chunk, saves the chunks as JSON, and embeds them into a local FAISS vector store for semantic search.


### Load Transcript Text File


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole transcript into memory as a single string.
# The encoding is pinned so the text decodes the same way on every platform
# instead of depending on the locale default.
# NOTE(review): assumes the transcript file is UTF-8 encoded; confirm.
with open("../data/eleo_transcript.txt", "r", encoding="utf-8") as f:
    transcript_text = f.read()


### Split Transcript into Chunks with Overlap


In [None]:
# Break the transcript into pieces of at most 500 characters; consecutive
# pieces share up to 100 characters so context is not lost at the boundaries.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
chunks = splitter.split_text(transcript_text)

### Add Metadata to Each Chunk


In [None]:
# Identifier stored in every chunk's metadata to tie it back to its video.
video_id = "eleo_video_001"

# Wrap each chunk of text in a record carrying its position in the transcript
# and where it came from.
chunk_docs = [
    {
        "content": text,
        "metadata": {
            "video_id": video_id,
            "chunk_index": position,
            "source": "Eleo's Corner",
        },
    }
    for position, text in enumerate(chunks)
]

print(f"✅ Created {len(chunks)} chunks.")


✅ Created 26 chunks.


### Save Chunked Data with Metadata as JSON


In [53]:
import os
import json

# Make sure the output folder exists, then write the chunk records
# (content + metadata) as indented JSON.
os.makedirs("../data/chunks", exist_ok=True)

serialized_chunks = json.dumps(chunk_docs, indent=2)
with open("../data/chunks/eleo_chunks_meta.json", "w") as out_file:
    out_file.write(serialized_chunks)

print("✅ Chunks saved to ../data/chunks/eleo_chunks_meta.json")


✅ Chunks saved to ../data/chunks/eleo_chunks_meta.json


### Embed Chunks and Save to FAISS Vector Store


In [54]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
import json
import os
from dotenv import load_dotenv

# Load environment variables from a local .env file.
# NOTE(review): presumably this is where the OpenAI API key comes from; confirm.
load_dotenv()

# Convert chunks into LangChain Document objects (required format)
documents = [
    Document(page_content=chunk["content"], metadata=chunk["metadata"])
    for chunk in chunk_docs
]

# Set up the OpenAI Embedding model
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed every document and build the FAISS index.
# This step was missing: `db` was never assigned anywhere in the notebook, so
# the save below only worked when `db` was left over from an earlier kernel
# session and raised NameError on a fresh Restart & Run All.
db = FAISS.from_documents(documents, embedding_model)

# Save it locally so you can reuse it later
os.makedirs("../data/vectorstores", exist_ok=True)
db.save_local("../data/vectorstores/eleo_faiss")

print("✅ Chunks embedded and saved to FAISS at: ../data/vectorstores/eleo_faiss")


✅ Chunks embedded and saved to FAISS at: ../data/vectorstores/eleo_faiss
