### Chunking & Embedding: Transcript Preparation for YouTube QA Bot
This notebook processes the transcript of a YouTube video into chunks, adds metadata, and stores them in a vector database using FAISS for semantic search.


In [1]:
import sys

# Make the parent directory importable so the `utils` package imported by the
# cells below can be resolved (presumably it lives one level up — confirm).
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate '../' entry to sys.path.
if '../' not in sys.path:
    sys.path.append('../')

### Load Transcript Text File


In [2]:
from utils.chunking import load_transcript
# Relative path: it only resolves if the kernel's working directory is the
# folder this notebook lives in (presumably — confirm how load_transcript opens it).
transcript_path = "../data/eleo_transcript.txt"
# `transcript_text` is the input handed to split_into_chunks in the chunking step below.
transcript_text = load_transcript(transcript_path)


### Split Transcript into Chunks with Overlap


In [3]:
from utils.chunking import split_into_chunks

# Chunking parameters, named so they are easy to find and tune in one place.
# NOTE(review): whether these are measured in characters or tokens depends on
# split_into_chunks — confirm in utils.chunking.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

# `chunks` is consumed by add_chunk_metadata in the metadata step below.
chunks = split_into_chunks(transcript_text, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

### Add Metadata to Each Chunk


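Extract general video metadata from the YouTube URL before attaching it to each chunk. (This step is currently commented out; the metadata values are set by hand in the cell after it.)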


In [None]:
# NOTE(review): this metadata-extraction step is disabled. `meta` is not
# referenced by any later cell visible in this notebook; the values passed to
# add_chunk_metadata are set by hand in the next cell instead. Either re-enable
# this and feed `meta` into the next cell, or remove the cell.
#from utils.metadata_extract import extract_youtube_metadata
#youtube_url = "https://www.youtube.com/watch?v=SN-vBnWj6e8&list=PPSV"
#meta = extract_youtube_metadata(youtube_url)



In [None]:
from utils.chunking import add_chunk_metadata

# Hand-written metadata that is attached to every chunk of this video.
video_id = "eleo_video_001"
transcript_title = "Eleo YouTube Video"
author = "Eleo Channel"
tags = ["YouTube", "AI", "Transcript"]

# Arguments are passed positionally, in the order the helper is called with
# throughout this notebook: chunks, id, title, author, tags.
chunk_docs = add_chunk_metadata(
    chunks,
    video_id,
    transcript_title,
    author,
    tags,
)


### Save Chunked Data with Metadata as JSON


In [6]:
from utils.chunking import save_chunks_to_json

# Write the metadata-enriched chunks into the same data folder the transcript
# was read from.
output_path = "../data/eleo_transcript_chunks.json"
save_chunks_to_json(chunk_docs, output_path)

### Convert Chunks to LangChain Documents
Wrap each chunk in a `Document` object together with its metadata.

In [7]:
from utils.embedding import convert_to_documents
# Turn the chunk records produced by add_chunk_metadata into the `documents`
# that the embedding step below takes as input.
# NOTE(review): presumably LangChain `Document` objects, per the section
# heading — confirm in utils.embedding.
documents = convert_to_documents(chunk_docs)

### Embed Chunks and Save to FAISS Vector Store


In [8]:
from utils.embedding import embed_documents_with_openai, save_vectorstore_faiss

# Embedding model to use and the location the resulting index is written to.
embedding_model_name = "text-embedding-3-small"
vectorstore_path = "../data/vectorstores/eleo_faiss"

# Build the index from the documents, then persist it.
# NOTE(review): this presumably calls the OpenAI API and needs credentials in
# the environment — confirm in utils.embedding.
faiss_index = embed_documents_with_openai(documents, model=embedding_model_name)
save_vectorstore_faiss(faiss_index, vectorstore_path)

  embedding_model = OpenAIEmbeddings(model=model)


In [9]:
from utils.embedding import load_vectorstore_faiss
# Reload the index from the path it was saved to in the previous cell.
# Note that this rebinds `faiss_index`, replacing the in-memory index built by
# embed_documents_with_openai with the copy read back from disk — presumably a
# round-trip check that the save worked; confirm.
faiss_index = load_vectorstore_faiss("../data/vectorstores/eleo_faiss")