<a href="https://colab.research.google.com/github/byu-cs-452/byu-cs-452-class-content/blob/main/embed/VectorDB_Lab_CS452_(starter).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download dataset from GitHub releases
# Total download size: ~613 MB (raw_data: 29 MB, embeddings: 584 MB)

import os

print("Downloading dataset files...")
if not os.path.exists("raw_data.zip"):
  print("  → Downloading raw_data.zip (29 MB)...")
  !wget -q --show-progress https://github.com/byu-cs-452/byu-cs-452-class-content/releases/download/v1.0-lex-fridman-dataset/raw_data.zip
  print("  ✓ raw_data.zip downloaded")

if not os.path.exists("embeddings.zip"):
  print("  → Downloading embeddings.zip (584 MB, this may take a minute)...")
  !wget -q --show-progress https://github.com/byu-cs-452/byu-cs-452-class-content/releases/download/v1.0-lex-fridman-dataset/embeddings.zip
  print("  ✓ embeddings.zip downloaded")

print("✓ All files downloaded!")

In [None]:
# Unzip data files
# This will extract batch_request.jsonl and embeddings.jsonl

print("Extracting data files...")
!unzip -n raw_data.zip
!unzip -n embeddings.zip
print("✓ Extraction complete!")
print("\nExtracted files:")
!ls -lh *.jsonl

In [None]:
# Install required libraries
# Note: Specific versions ensure compatibility with the lab

!pip install -q datasets==2.20.0 psycopg2-binary==2.9.9

print("✓ Libraries installed successfully!")

In [None]:
# Import libraries
import json
from typing import List

import pandas as pd
import psycopg2

print("✓ Libraries imported successfully!")

In [None]:
# Option 1: Run this to set up a LOCAL PostgreSQL database in Colab
# This will take ~2-3 minutes to complete
#
# Steps performed, in order:
#   1. install PostgreSQL with apt
#   2. start the PostgreSQL service
#   3. create a superuser role "root" with password "root"
#   4. create the database named by DATABASE_NAME
#   5. build the pgvector extension (git tag v0.8.0) from source and enable it
# The cell ends by defining CONNECTION, the connection string that later cells
# pass to psycopg2.connect().
#
# NOTE: most commands below send their output to /dev/null, and the final
# "setup complete" message is printed unconditionally. If a later cell cannot
# connect or cannot use the vector type, re-run the failing command without the
# redirection to see its error output.

# Configure database name
DATABASE_NAME = "embedding_project"

print("Setting up PostgreSQL locally (this takes ~2-3 minutes)...")
print("=" * 60)

print("\n[1/5] Installing PostgreSQL...")
# stdout and stderr of both apt commands are discarded
!apt update > /dev/null 2>&1
!apt install -y postgresql postgresql-contrib > /dev/null 2>&1

print("[2/5] Starting PostgreSQL service...")
!service postgresql start

print("[3/5] Creating database user...")
# stderr is discarded and the `|| echo` fallback runs on ANY non-zero exit from psql,
# so "(User already exists)" is also what you will see if the command failed for another reason.
!sudo -u postgres psql -c "CREATE USER root WITH SUPERUSER PASSWORD 'root'" 2>/dev/null || echo "  (User already exists)"

print(f"[4/5] Creating database '{DATABASE_NAME}'...")
# {DATABASE_NAME} is filled in by IPython from the Python variable before the shell runs.
# Same caveat as step 3: the fallback message is printed for any psql failure.
!sudo -u postgres psql -c "CREATE DATABASE {DATABASE_NAME}" 2>/dev/null || echo "  (Database already exists)"

print("[5/5] Installing pgvector extension...")
# Build prerequisites for compiling the extension
!apt install -y postgresql-server-dev-all build-essential > /dev/null 2>&1
# Pinned to the v0.8.0 tag; "(Already cloned)" is printed whenever git clone exits non-zero
!git clone --quiet --branch v0.8.0 https://github.com/pgvector/pgvector.git 2>/dev/null || echo "  (Already cloned)"
# Compile and install; build output (including any compiler errors) is discarded
!cd pgvector && make > /dev/null 2>&1 && make install > /dev/null 2>&1
# Enable the extension in the lab database; errors from this command are hidden (2>/dev/null)
!sudo -u postgres psql -d {DATABASE_NAME} -c "CREATE EXTENSION IF NOT EXISTS vector;" 2>/dev/null

# Connects as the root/root role created in step 3 to the database created in step 4
CONNECTION = f"postgresql://root:root@localhost:5432/{DATABASE_NAME}"

print("\n" + "=" * 60)
print("✓ PostgreSQL setup complete!")
print(f"✓ Connection string: {CONNECTION}")
print("\nYou can now proceed with creating tables and loading data.")

In [None]:
# Option 2: Use TimescaleDB cloud service instead of local PostgreSQL
# 
# If you prefer to use TimescaleDB (cloud-hosted):
# 1. Sign up for a free trial at https://www.timescale.com/
# 2. Create a new service
# 3. Copy your connection string and paste it below
# 4. Run this cell (and skip Cell 5 above)

# Uncomment and add your connection string:
# CONNECTION = "postgresql://username:password@host:port/database"

In [None]:
# ⚠️  CAUTION: This cell resets your database (all loaded data is deleted!)
# Run it only when you want to start over from empty tables.

import psycopg2

# Drops both lab tables in a single statement; IF EXISTS makes it safe to run
# before the tables have been created.
DROP_TABLES = "DROP TABLE IF EXISTS segment, podcast CASCADE"

# To actually perform the reset, uncomment the block below:
# with psycopg2.connect(CONNECTION) as conn:
#     cursor = conn.cursor()
#     cursor.execute(DROP_TABLES)
#     conn.commit()
#     print("✓ Tables dropped successfully. You can now recreate them.")

print(
    "⚠️  Drop command is commented out for safety.",
    "   Uncomment the code above if you really want to reset the database.",
    sep="\n",
)

In [None]:
def fast_pg_insert(df: pd.DataFrame, connection: str, table_name: str, columns: List[str]) -> None:
    """
    Bulk-load rows from a pandas DataFrame into a PostgreSQL table using COPY.

    The requested columns are written to ``<table_name>_temp.csv`` in the current
    working directory (semicolon-delimited, no header, UTF-8). That file is then
    streamed to the server with ``COPY ... FROM STDIN`` in CSV format, so values
    that contain the delimiter, double quotes or line breaks are loaded intact.
    Going through a file avoids building the whole payload in memory.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data to be inserted.
    connection (str): The connection string to the PostgreSQL database.
    table_name (str): The name of the target table in the PostgreSQL database.
    columns (List[str]): A list of column names that must exist in the DataFrame
                        and in the table. Data is inserted in this exact order.

    Returns:
    None

    Raises:
    ValueError: If ``columns`` is empty or names a column missing from ``df``.
    psycopg2.Error: If connecting or the COPY itself fails. The connection is
                    closed without committing in that case.
    """
    if not columns:
        raise ValueError("columns parameter cannot be empty")

    # Validate that all required columns exist in the DataFrame
    missing_cols = set(columns) - set(df.columns)
    if missing_cols:
        raise ValueError(f"DataFrame is missing requested columns: {missing_cols}\n"
                        f"DataFrame has: {list(df.columns)}")

    # Imported here (after validation) because only the COPY step needs it
    from psycopg2 import sql

    print(f"Inserting {len(df):,} rows into '{table_name}' table...")
    print(f"  Columns: {columns}")

    csv_file = f"{table_name}_temp.csv"

    # Write only the specified columns to CSV file in the exact order specified.
    # Missing values (NaN/None) are written as empty fields, which COPY maps to NULL below.
    print("  → Writing to CSV file...")
    df[columns].to_csv(csv_file, sep=";", index=False, header=False, encoding="utf-8")

    # FORMAT csv makes the server parse the file with the same quoting rules pandas
    # used to write it. Identifiers are quoted by psycopg2 rather than spliced in as text.
    copy_statement = sql.SQL(
        "COPY {table} ({fields}) FROM STDIN WITH (FORMAT csv, DELIMITER ';', NULL '')"
    ).format(
        table=sql.Identifier(table_name),
        fields=sql.SQL(", ").join(sql.Identifier(name) for name in columns),
    )

    # Copy from file to database
    print("  → Copying data to PostgreSQL...")
    conn = psycopg2.connect(connection)
    try:
        # Read back with the encoding used for writing; newline="" keeps line breaks
        # inside quoted fields exactly as written.
        with open(csv_file, "r", encoding="utf-8", newline="") as f:
            with conn.cursor() as c:
                c.copy_expert(copy_statement, f)
        conn.commit()
    finally:
        # Always release the connection; closing without commit discards a failed load
        conn.close()

    print(f"✓ Successfully inserted {len(df):,} rows into '{table_name}'")
    print(f"  (CSV file saved as '{csv_file}' for reference)")

## Database Schema
We will create a database with two tables, `podcast` and `segment`:

**podcast**

- PK: id
 - The unique podcast id found in the Hugging Face data (e.g., TRdL6ZzWBS0 is the ID for Jed Buchwald: Isaac Newton and the Philosophy of Science | Lex Fridman Podcast #214)
- title
 - The title of the podcast (e.g., Jed Buchwald: Isaac Newton and the Philosophy of Science | Lex Fridman Podcast #214)

**segment**

- PK: id
 - The unique identifier for the podcast segment. It was created by concatenating the podcast index and the segment index with a colon (e.g., "0:1" is the 0th podcast and the 1st segment).
This is present as the "custom_id" field in the `embedding.jsonl` and `batch_request.jsonl` files.
- start_time
 - The start timestamp of the segment
- end_time
 - The end timestamp of the segment
- content
 - The raw text transcription of the podcast
- embedding
 - The 128-dimensional vector representation of the text
- FK: podcast_id
 - foreign key to podcast.id

In [None]:
# Sample document:
# {
#   "custom_id": "89:115",
#   "url": "/v1/embeddings",
#   "method": "POST",
#   "body": {
#     "input": " have been possible without these approaches?",
#     "model": "text-embedding-3-large",
#     "dimensions": 128,
#     "metadata": {
#       "title": "Podcast: Boris Sofman: Waymo, Cozmo, Self-Driving Cars, and the Future of Robotics | Lex Fridman Podcast #241",
#       "podcast_id": "U_AREIyd0Fc",
#       "start_time": 484.52,
#       "stop_time": 487.08
#     }
#   }
# }

# Sample embedding:
# {
#   "id": "batch_req_QZBmHS7FBiVABxcsGiDx2THJ",
#   "custom_id": "89:115",
#   "response": {
#     "status_code": 200,
#     "request_id": "7a55eba082c70aca9e7872d2b694f095",
#     "body": {
#       "object": "list",
#       "data": [
#         {
#           "object": "embedding",
#           "index": 0,
#           "embedding": [
#             0.0035960325,
#             126 more lines....
#             -0.093248844
#           ]
#         }
#       ],
#       "model": "text-embedding-3-large",
#       "usage": {
#         "prompt_tokens": 7,
#         "total_tokens": 7
#       }
#     }
#   },
#   "error": null
# }

In [None]:
# Create table statements that you'll write

# TODO: Add create table statement
CREATE_PODCAST_TABLE = """

"""

# TODO: Add create table statement
CREATE_SEGMENT_TABLE = """

"""

conn = psycopg2.connect(CONNECTION)
try:
    # TODO: Create tables with psycopg2 (example: https://www.geeksforgeeks.org/executing-sql-query-with-psycopg2-in-python/)

    conn.commit()
finally:
    # Close the connection even if a statement above raises. Otherwise the failed
    # transaction stays open on the server and can block later statements on the
    # same tables (for example the DROP TABLE in the reset cell).
    conn.close()


In [None]:
# Learn about the data and extract it!
# TODO: What data do we need?
# TODO: What data is in the documents jsonl files?
# TODO: What data is in the embedding jsonl files?
# OPTIONAL: Take a look at the original hugging face dataset
# from datasets import load_dataset
# ds = load_dataset("Whispering-GPT/lex-fridman-podcast")


In [None]:
# Transform the data into a pandas data frame
# TODO: Get some pandas data frames for our two tables so we can copy the data in!

In [None]:
# Load the data into the database using fast_pg_insert (otherwise inserting the 800k documents will take a very, very long time)
# TODO Copy all the "podcast" data into the podcast postgres table!
# TODO Copy all the "segment" data into the segment postgres table!

In [None]:
## This script is used to query the database
import os
import psycopg2


# Write your queries
# Q1) What are the five most similar segments to segment "267:476"
# Input: "that if we were to meet alien life at some point"
# For each result return the segment raw text, embedding distance, the segment id, and the podcast name

conn = psycopg2.connect(CONNECTION)
try:
    with conn.cursor() as cur:
        # Write the Q1 SQL between the triple quotes below
        cur.execute("""

""")
        for row in cur.fetchall():
            print(row)
    conn.commit()
finally:
    # Close the connection even when the query raises (an empty or invalid query does),
    # so a failed attempt does not leave an open transaction on the server.
    conn.close()

In [None]:
# Q2) What are the five most dissimilar segments to segment "267:476"
# Input: "that if we were to meet alien life at some point"
# For each result return the segment raw text, embedding distance, the segment id, and the podcast name


In [None]:
# Q3) What are the five most similar segments to segment '48:511'
# Input: "Is it is there something especially interesting and profound to you in terms of our current deep learning neural network, artificial neural network approaches and the whatever we do understand about the biological neural network."
# For each result return the segment raw text, embedding distance, the segment id, and the podcast name


In [None]:
# Q4) What are the five most similar segments to segment '51:56'
# Input: "But what about like the fundamental physics of dark energy? Is there any understanding of what the heck it is?"
# For each result return the segment raw text, embedding distance, the segment id, and the podcast name


In [None]:
# Q5) For each of the following podcast segments, find the five most similar podcast episodes. Hint: You can do this by averaging over the embedding vectors within a podcast episode.

#     a) Segment "267:476"

#     b) Segment '48:511'

#     c) Segment '51:56'

# For each result return the Podcast title and the embedding distance


In [None]:
# Q6) For podcast episode id = VeH7qKZr0WI, find the five most similar podcast episodes. Hint: you can do a similar averaging procedure as Q5

# Input Episode: "Balaji Srinivasan: How to Fix Government, Twitter, Science, and the FDA | Lex Fridman Podcast #331"
# For each result return the Podcast title and the embedding distance


# Deliverables

Submit a **single PDF file** containing:

1. **Your code** - Include all cells with your solutions (you can use File → Print in Colab)
2. **Query results** - For each question (Q1-Q6), include:
   - The SQL query you wrote
   - The output/results from running the query
   - A brief explanation if needed
