# Quick Start

In [2]:
import csv

# Load sample data (a restaurant menu of items).
# newline='' is what the csv module documents for files handed to csv.reader,
# so quoted fields that contain line breaks are parsed correctly.
with open('menu_items.csv', newline='') as menu_file:
    reader = csv.reader(menu_file)

    # Skip the header row; the default keeps an empty file from raising StopIteration.
    next(reader, None)

    # Names of the menu items (second CSV column). In Chroma, a "document" is a
    # string, i.e. a name, sentence, paragraph, etc.
    documents = []

    # One metadata dict per document, holding the menu item ID from the first CSV column.
    metadata = []

    # Each "document" needs a unique ID, like the primary key of a relational
    # database. IDs are the strings "1", "2", ... in the order rows are kept.
    ids = []

    for row in reader:
        # csv.reader yields [] for blank lines; skip rows lacking the two columns read below.
        if len(row) < 2:
            continue

        documents.append(row[1])
        metadata.append({"item_id": row[0]})
        # Derive the next ID from the list length instead of a counter named `id`,
        # which would shadow the builtin.
        ids.append(str(len(ids) + 1))

In [3]:
import chromadb
from chromadb.utils import embedding_functions

# Instantiate the Chroma client. Data is stored on disk in a folder named
# 'chromadb_data'; the path is relative, so it resolves against the notebook's
# current working directory.
chroma_client = chromadb.PersistentClient(path="./chromadb_data")

In [9]:
# Select the embedding model to use.
# List of model names can be found here https://www.sbert.net/docs/pretrained_models.html
sentence_transformers_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Open the collection (the vector database), creating it on the first run.
# get_or_create_collection is used because the client persists to disk:
# create_collection would fail once the collection exists from an earlier run.
# The embedding function given here is the one the collection uses to embed text.
collection = chroma_client.get_or_create_collection(
    name="menu_items_collection",
    embedding_function=sentence_transformers_ef
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [10]:
# Store the menu items. upsert (rather than add) keeps this cell idempotent:
# the collection is persisted on disk, so on a re-run the same IDs already
# exist and are refreshed with the current CSV contents.
collection.upsert(
    documents=documents,
    metadatas=metadata,
    ids=ids
)

In [12]:
# Semantic search: ask for the 5 stored menu items closest to "chocolate cake",
# requesting documents, metadata and distances, then show the matched names.
query_results = collection.query(
    include=["documents", "metadatas", "distances"],
    n_results=5,
    query_texts=["chocolate cake"],
)
print(query_results["documents"])

[['Fried Cake', 'Grilled Cake', 'Grilled Cake', 'Roasted Cake', 'Roasted Cake']]


In [13]:
# Semantic search: ask for the 5 stored menu items closest to "donut",
# requesting documents, metadata and distances, then show the matched names.
query_results = collection.query(
    include=["documents", "metadatas", "distances"],
    n_results=5,
    query_texts=["donut"],
)
print(query_results["documents"])

[['Fried Smoothie', 'Fried Smoothie', 'Fried Cake', 'Spicy Cake', 'Spicy Cake']]


In [14]:
# Semantic search: ask for the 5 stored menu items closest to "Chicken",
# requesting documents, metadata and distances, then show the matched names.
query_results = collection.query(
    include=["documents", "metadatas", "distances"],
    n_results=5,
    query_texts=["Chicken"],
)
print(query_results["documents"])

[['Fried Chicken', 'Sweet Chicken', 'Sweet Chicken', 'Spicy Chicken', 'Crispy Chicken']]


# YouTube Notes

In [54]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
from sentence_transformers import SentenceTransformer
import re

import google.generativeai as genai
from chromadb.utils import embedding_functions

import os
from dotenv import load_dotenv

## Configure Gemini and the Vector Database (ChromaDB)

In [43]:
# Load environment variables via python-dotenv, then read the Gemini API key.
# os.environ.get returns None when GEMINI_API_KEY is not set.
load_dotenv()
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Persistent Chroma client for the YouTube notes, stored under ./chromadb_youtube_data.
chroma_client = chromadb.PersistentClient(path="./chromadb_youtube_data")

# Configure the Gemini SDK and pick the generative model used for summarising and answering.
genai.configure(api_key=GEMINI_API_KEY)
genai_model = genai.GenerativeModel('gemini-2.5-flash')

# Gemini-backed embedding function for Chroma.
gemini_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    api_key=GEMINI_API_KEY
)

# NOTE(review): a later cell re-opens this same collection name with a
# SentenceTransformer embedding function - confirm which one is intended.
chroma_collection = chroma_client.get_or_create_collection(
    embedding_function=gemini_ef,
    name="youtube_transcripts_collection",
)

## Extract YouTube Video ID

In [55]:
def extract_youtube_video_id(url: str) -> str:
    """
    Extract the 11-character YouTube video ID from a URL.

    Supported URL shapes: 'youtu.be/<id>', 'youtube.com/watch?v=<id>' (the
    'v' parameter may sit anywhere in the query string),
    'youtube.com/shorts/<id>', 'youtube.com/embed/<id>' and
    'youtube.com/live/<id>'.

    Args:
        url: The URL (or URL fragment) to search.

    Returns:
        The ID captured by the first pattern that matches, trying the
        patterns in the order they are listed below.

    Raises:
        ValueError: If none of the patterns match.
    """
    patterns = [
        r"youtu\.be/([a-zA-Z0-9_-]{11})",
        # \b stops parameters that merely end in "v" (e.g. "rev=") from being
        # mistaken for the video parameter.
        r"\bv=([a-zA-Z0-9_-]{11})",
        r"shorts/([a-zA-Z0-9_-]{11})",
        r"embed/([a-zA-Z0-9_-]{11})",
        r"live/([a-zA-Z0-9_-]{11})",
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)

    raise ValueError(f"Invalid YouTube URL: {url}")

In [None]:
# Some sample YouTube videos:
# https://youtu.be/IdLSZEYlWVo
# https://youtu.be/tL-wnMVyTQI
# https://youtu.be/etSdP9CFmko
# https://youtu.be/rgRIZDsEwCk
# https://youtu.be/_EA-74yr5D4

url = "https://www.youtube.com/watch?v=hQH4-5o0BMM"

# Derive the video ID from `url` rather than hard-coding it a second time,
# so the ID cannot drift out of sync with the URL.
yt_video_id = extract_youtube_video_id(url)

# Adjust prompt as needed
prompt = "Extract key notes from video transcript: "

## Create Embedding (Vector Representation)

In [46]:
# ----------------------------
# Use SentenceTransformer instead of Gemini embedding
# ----------------------------
from chromadb.utils import embedding_functions

# The embedding function takes the model name as a plain string, not a model object.
st_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Rebind chroma_collection to the transcripts collection, this time passing the
# SentenceTransformer embedding function.
chroma_collection = chroma_client.get_or_create_collection(
    embedding_function=st_ef,
    name="youtube_transcripts_collection",
)

## Get Transcript from YouTube

In [47]:
# ----------------------------
# Fetch transcript
# ----------------------------
# Uses the yt_video_id set in the URL cell earlier in the notebook; it is
# deliberately not re-assigned here, so a second hard-coded ID cannot silently
# override the one derived from `url`.
ytt_api = YouTubeTranscriptApi()
fetched_transcript = ytt_api.fetch(yt_video_id, languages=['en', 'en-US', 'en-GB'])

# Flatten the fetched transcript into a single plain-text string.
formatter = TextFormatter()
transcript = formatter.format_transcript(fetched_transcript)

# Keep a copy of the raw transcript on disk.
with open("temp_transcript.txt", "w", encoding="utf-8") as f:
    f.write(transcript)


## Summarize with Gemini

In [48]:
# ----------------------------
# Summarize using Gemini (optional)
# ----------------------------
# The instruction text is prepended directly to the transcript to form the request.
prompt = "Extract key notes from video transcript: "
full_prompt = prompt + transcript
response = genai_model.generate_content(full_prompt)
summary = response.text

# Keep a copy of the generated summary on disk.
with open("summary.txt", mode="w", encoding="utf-8") as summary_file:
    summary_file.write(summary)

In [49]:
# ----------------------------
# Upsert into ChromaDB
# ----------------------------
# The video ID is used as the record ID and the Gemini summary as the document.
chroma_collection.upsert(ids=[yt_video_id], documents=[summary])

print("Transcript summarized and stored in ChromaDB successfully.")

Transcript summarized and stored in ChromaDB successfully.


## Validate Data in ChromaDB

In [50]:
# ----------------------------
# Validate Data in Chroma
# ----------------------------
# Read the record back by its ID, requesting only the stored document text.
result = chroma_collection.get(ids=yt_video_id, include=["documents"])
print("=== Stored Document ===", result, sep="\n")

=== Stored Document ===
{'ids': ['hQH4-5o0BMM'], 'embeddings': None, 'documents': ['Here are the key notes from the video transcript:\n\n**Recipe Title:** Spaghetti and Meat Sauce (Claimed to be the "only one you\'ll ever make again" and "best ever")\n\n**Key Features:**\n*   **Time:** 30 minutes total preparation and cooking.\n*   **Health:** Healthy, loaded with veggies.\n*   **Flavor:** Delicious, full of flavor, deep and rich taste.\n*   **Convenience:** Easy weeknight dinner.\n*   **Secret:** Veggies chopped fine enough that "allergic to vegetables" family members won\'t notice.\n*   **Technique:** Pasta finishes cooking *in the sauce* for maximum flavor.\n\n**Ingredients:**\n\n*   **Veggies:**\n    *   Onion (rough chop)\n    *   Garlic (5 cloves)\n    *   Carrot (large, peeled or not, good wash)\n    *   Celery (similar size chunks, including leaves)\n*   **Fat:** Olive oil (generous "glug")\n*   **Meats:**\n    *   1 lb Ground Beef (85/15 fat content recommended for balance)\n 

## Query & Search

In [51]:
# ----------------------------
# Query the Vector DB
# ----------------------------
query_text = "How much beef do I need for the beef ribs recipe?"
n_results = 5

results = chroma_collection.query(
    include=['documents', 'distances', 'metadatas'],
    n_results=n_results,
    query_texts=[query_text],
)

# Results are nested per query text; index 0 selects the single query issued above.
banner = "************************************************************************"
matched_ids = results["ids"][0]
for rank, vid in enumerate(matched_ids, start=1):
    doc = results['documents'][0][rank - 1]

    print(banner)
    print(f"{rank}.  https://youtu.be/{vid}")
    print(banner)
    print(doc)
    print()

************************************************************************
1.  https://youtu.be/hQH4-5o0BMM
************************************************************************
Here are the key notes from the video transcript:

**Recipe Title:** Spaghetti and Meat Sauce (Claimed to be the "only one you'll ever make again" and "best ever")

**Key Features:**
*   **Time:** 30 minutes total preparation and cooking.
*   **Health:** Healthy, loaded with veggies.
*   **Flavor:** Delicious, full of flavor, deep and rich taste.
*   **Convenience:** Easy weeknight dinner.
*   **Secret:** Veggies chopped fine enough that "allergic to vegetables" family members won't notice.
*   **Technique:** Pasta finishes cooking *in the sauce* for maximum flavor.

**Ingredients:**

*   **Veggies:**
    *   Onion (rough chop)
    *   Garlic (5 cloves)
    *   Carrot (large, peeled or not, good wash)
    *   Celery (similar size chunks, including leaves)
*   **Fat:** Olive oil (generous "glug")
*   **Meats:**

In [52]:
# ----------------------------
# Use Gemini to Answer Based on Context
# ----------------------------
# The first document returned for the query is used as the only context.
top_document = results['documents'][0][0]
prompt = "\n".join([
    "Answer the following QUESTION using DOCUMENT as context.",
    f"QUESTION: {query_text}",
    f"DOCUMENT: {top_document}",
])

response = genai_model.generate_content(prompt, stream=False)
print("=== Gemini Response ===")
print(response.text)

=== Gemini Response ===
The provided document details a recipe for "Spaghetti and Meat Sauce." It does not contain any information about a beef ribs recipe.

For the spaghetti and meat sauce recipe, it calls for **1 lb Ground Beef** (85/15 fat content recommended).
