In [2]:
# Download datasets from kaggle

import json
import os

def getPath(fname):
  return os.path.join('C://Users/Sarah/Code/CS_452/byu-cs-452-class-content/vectordb/', fname)

if not os.path.exists("lex-fridman-text-embedding-3-large-128.zip"):
  configpath = getPath('config.json')
  with open(configpath) as configfile:
    config = json.load(configfile)

  kaggle_dir = os.path.join(os.path.expanduser("~"), ".kaggle")
  os.makedirs(kaggle_dir, exist_ok=True)
  kaggle_config_path = os.path.join(kaggle_dir, "kaggle.json")
  with open(kaggle_config_path, 'w') as f:
    json.dump(config, f)

  !kaggle datasets download -d michaeltreynolds/lex-fridman-text-embedding-3-large-128


In [None]:
# Unzip kaggle data

!unzip lex-fridman-text-embedding-3-large-128.zip
!unzip lex-fridman-text-embedding-3-large-128/*.zip


'Expand-Archive' is not recognized as an internal or external command,
operable program or batch file.
'Expand-Archive' is not recognized as an internal or external command,
operable program or batch file.


In [10]:
# Use specific libraries
%pip install datasets==2.20.0 psycopg2==2.9.9 pgcopy==1.6.0
import psycopg2

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\Sarah\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
# Get your own trial account at timescaledb and paste your own connection string

CONNECTION = "postgres://tsdbadmin:x36cj6tn7tnmdie8@kbfzfqp2mf.mg8e7uyy7l.tsdb.cloud.timescale.com:32262/tsdb?sslmode=require"

In [71]:
# Use this if you want to start over on your postgres table!

DROP_TABLE = "DROP TABLE IF EXISTS podcast, segment"
with psycopg2.connect(CONNECTION) as conn:
    cursor = conn.cursor()
    cursor.execute(DROP_TABLE)
    conn.commit() # Commit the changes


In [2]:
# Useful function that takes a pd.DataFrame and copies it directly into a table.

import pandas as pd
import io
import psycopg2

from typing import List

def fast_pg_insert(df: pd.DataFrame, connection: str, table_name: str, columns: List[str]) -> None:
    """
        Inserts data from a pandas DataFrame into a PostgreSQL table using the COPY command for fast insertion.

        Parameters:
        df (pd.DataFrame): The DataFrame containing the data to be inserted.
        connection (str): The connection string to the PostgreSQL database.
        table_name (str): The name of the target table in the PostgreSQL database.
        columns (List[str]): A list of column names in the target table that correspond to the DataFrame columns.

        Returns:
        None
    """
    conn = psycopg2.connect(connection)
    _buffer = io.StringIO()
    df.to_csv(_buffer, sep=";", index=False, header=False)
    _buffer.seek(0)
    with conn.cursor() as c:
        c.copy_from(
            file=_buffer,
            table=table_name,
            sep=";",
            columns=columns,
            null=''
        )
    conn.commit()
    conn.close()

Database Schema
We will create a database with two tables: podcast and segment:

**podcast**

-   PK: id
-   The unique podcast id found in the huggingface data (i,e., TRdL6ZzWBS0 is the ID for Jed Buchwald: Isaac Newton and the Philosophy of Science | Lex Fridman Podcast #214)
-   title
-   The title of podcast (ie., Jed Buchwald: Isaac Newton and the Philosophy of Science | Lex Fridman Podcast #214)

**segment**

-   PK: id
-   the unique identifier for the podcast segment. This was created by concatenating the podcast idx and the segment index together (ie., "0;1") is the 0th podcast and the 1st segment
    This is present in the as the "custom_id" field in the `embedding.jsonl` and batch_request.jsonl files
-   start_time
-   The start timestamp of the segment
-   end_time
-   The end timestamp of the segment
-   content
-   The raw text transcription of the podcast
-   embedding
-   the 128 dimensional vector representation of the text
-   FK: podcast_id
-   foreign key to podcast.id


In [None]:
# Sample document:
# {
#   "custom_id": "89:115",
#   "url": "/v1/embeddings",
#   "method": "POST",
#   "body": {
#     "input": " have been possible without these approaches?",
#     "model": "text-embedding-3-large",
#     "dimensions": 128,
#     "metadata": {
#       "title": "Podcast: Boris Sofman: Waymo, Cozmo, Self-Driving Cars, and the Future of Robotics | Lex Fridman Podcast #241",
#       "podcast_id": "U_AREIyd0Fc",
#       "start_time": 484.52,
#       "stop_time": 487.08
#     }
#   }
# }

# Sample embedding:
# {
#   "id": "batch_req_QZBmHS7FBiVABxcsGiDx2THJ",
#   "custom_id": "89:115",
#   "response": {
#     "status_code": 200,
#     "request_id": "7a55eba082c70aca9e7872d2b694f095",
#     "body": {
#       "object": "list",
#       "data": [
#         {
#           "object": "embedding",
#           "index": 0,
#           "embedding": [
#             0.0035960325,
#             126 more lines....
#             -0.093248844
#           ]
#         }
#       ],
#       "model": "text-embedding-3-large",
#       "usage": {
#         "prompt_tokens": 7,
#         "total_tokens": 7
#       }
#     }
#   },
#   "error": null
# }

In [73]:
# Create table statements that you'll write

# may need to run this to enable vector data type if you didn't select AI in service
CREATE_EXTENSION = "CREATE EXTENSION if not exists vector;"

CREATE_PODCAST_TABLE = """
  CREATE TABLE if not exists podcast (
    id varchar(30) PRIMARY KEY,
    title varchar(255)
  );
"""

CREATE_SEGMENT_TABLE = """
  CREATE TABLE if not exists segment (
    id varchar(30) PRIMARY KEY,
    start_time float,
    end_time float,
    content varchar(255),
    embedding vector(128),
    podcast_id varchar(30) references podcast (id)
  );
"""

conn = psycopg2.connect(CONNECTION)
conn.autocommit = True
cursor = conn.cursor()

# Create tables with psycopg2 (example: https://www.geeksforgeeks.org/executing-sql-query-with-psycopg2-in-python/)
cursor.execute(CREATE_EXTENSION)
cursor.execute(CREATE_PODCAST_TABLE)
cursor.execute(CREATE_SEGMENT_TABLE)

conn.commit()
conn.close()


In [74]:
## Extract needed data out of JSONL files. This may be the hard part!

podcast_dict = dict()
segment_df_list = []

documents_dir = getPath('documents/')
embeddings_path = getPath('embedding/embedding/')

# Loop through the document directory, getting the corresponding embedding
for entry in os.scandir(documents_dir):
  document_path = entry.path
  embedding_path = document_path.replace('documents/batch_request_', 'embedding/embedding/')
  print(document_path)

  doc_data = []

  # open document file
  with open(document_path, 'r', encoding='utf-8') as file:
    for line in file:
      try:
        # read data, adding new information to podcast_dict in the process
        curr = json.loads(line)
        metadata = curr["body"]["metadata"]
        doc_data.append({ 
          "id": curr["custom_id"], 
          "start_time": metadata["start_time"], 
          "end_time": metadata["stop_time"], 
          "content": curr["body"]["input"][0:255], 
          "podcast_id": metadata["podcast_id"]
        })
        if(metadata["podcast_id"] not in podcast_dict):
          podcast_dict[metadata["podcast_id"]] = metadata["title"][0:255]
          print("added ", metadata["podcast_id"])
      except json.JSONDecodeError:
        continue

  normalized_doc = pd.json_normalize(doc_data)

  embed_data = []

  # open embedding file
  with open(embedding_path, 'r', encoding='utf-8') as file:
    for line in file:
      try:
        curr_embed = json.loads(line)
        embedding = curr_embed["response"]["body"]["data"][0]["embedding"]
        embed_data.append({ "id": curr_embed["custom_id"], "embedding": embedding })
      except json.JSONDecodeError:
        continue

  normalized_embed = pd.json_normalize(embed_data)

  merged = normalized_doc.merge(normalized_embed, "inner", "id")
  
  segment_df = merged[['id', 'start_time', 'end_time', 'content', 'embedding', 'podcast_id']]
  segment_df_list.append(segment_df)

  # ['id', 'start_time', 'end_time', 'content', 'embedding', 'podcast_id']
  


C://Users/Sarah/Code/CS_452/byu-cs-452-class-content/vectordb/documents/batch_request_0lw3vrQqdWbdBRurTGNMHU76.jsonl
added  U_AREIyd0Fc
added  1C2tPFCGL1U
added  98HZanvAJ8Y
added  IHg6ixt3CKc
added  3FIo6evmweo
added  4oDZyOf6CW4
added  Whtt2H5_isM
added  qfKyNxfyWbo
added  KZkYSSE8HHI
added  A22Ej6kb2wo
added  4dC_nRYIDZU
added  RL4j4KPwNGM
added  ImKkaeUx1MU
added  OaeYUm06in0
added  OJQepiqSWvg
added  Z1KwkpTUbkg
added  ew8U43IXTfk
added  M1-v-dXIzho
added  CY_LEa9xQtg
added  dSVLjAdo8UA
added  ICj8p5jPd3Y
C://Users/Sarah/Code/CS_452/byu-cs-452-class-content/vectordb/documents/batch_request_3GozevpriRRzieX4za9xfNmY.jsonl
added  VeH7qKZr0WI
added  _AGPbvCDBCk
added  ktuw6Ow4sd0
added  fIPxfzfOTxk
added  6z1JwZbX4dQ
added  0ZO28NtkwwQ
added  KsZI5oXBC0k
added  a3Wpy6gE4So
added  urdNsyZBqhQ
added  B2tXN7ZnSfU
added  KOwm7GUjcg8
added  nvBEXXnNaNQ
added  Tj6NOfdfa4o
added  xlMTWfkQqbY
added  bQa7hpUpMzM
added  YUYagvESisE
added  FKh8hjJNhWc
added  IUHkhB366tE
added  EE1R8FYUJm0
added 

In [None]:
#### Optional #####
# In addition to the embedding and document files you might like to load
# the full podcast raw data via the hugging face datasets library

# from datasets import load_dataset
# ds = load_dataset("Whispering-GPT/lex-fridman-podcast")


In [75]:
# TODO Copy all the "podcast" data into the podcast postgres table!
# podcast_df = pd.DataFrame.from_dict(podcast_dict, orient='index', columns=['id', 'title'])

podcast_df = pd.DataFrame(data=podcast_dict.items(), columns=['id', 'title'])
# print(podcast_df)

fast_pg_insert(podcast_df, CONNECTION, 'podcast', ['id','title'])

In [76]:
# TODO Copy all the "segment" data into the segment postgres table!
# HINT 1: use the recommender.utils.fast_pg_insert function to insert data into the database
# otherwise inserting the 800k documents will take a very, very long time
# HINT 2: if you don't want to use all your memory and crash
# colab, you'll need to either send the data up in chunks
# or write your own function for copying it up. Alternative to chunking maybe start
# with writing it to a CSV and then copy it up?

for segment in segment_df_list:
  fast_pg_insert(segment, CONNECTION, 'segment', ['id', 'start_time', 'end_time', 'content', 'embedding', 'podcast_id'])

In [38]:
## This script is used to query the database
import os
import psycopg2


# Write your queries
# Q1) What are the five most similar segments to segment "267:476"
# Input: "that if we were to meet alien life at some point"
# For each result return the podcast name, the segment id, segment raw text,  the start time, stop time, and embedding distance

conn = psycopg2.connect(CONNECTION)
cur = conn.cursor()

cur.execute("""
  WITH topfive AS (
    SELECT *, embedding <-> (SELECT embedding from segment where id = '267:476') as distance FROM segment
    WHERE segment.id != '267:476'
    ORDER BY embedding <-> (SELECT embedding from segment where id = '267:476')
    LIMIT 5
  )
  SELECT podcast.title, topfive.id, content, start_time, end_time, topfive.distance
  FROM topfive
  JOIN podcast ON topfive.podcast_id = podcast.id
  ORDER BY distance;
""")
for row in cur.fetchall():
  print(row)

conn.commit()
conn.close()

('Podcast: Ryan Graves: UFOs, Fighter Jets, and Aliens | Lex Fridman Podcast #308', '113:2792', ' encounters, human beings, if we were to meet another alien', 6725.62, 6729.86, 0.6483450674336982)
('Podcast: Richard Dawkins: Evolution, Intelligence, Simulation, and Memes | Lex Fridman Podcast #87', '268:1019', ' Suppose we did meet an alien from outer space', 2900.04, 2903.0800000000004, 0.6558106859320757)
('Podcast: Jeffrey Shainline: Neuromorphic Computing and Optoelectronic Intelligence | Lex Fridman Podcast #225', '305:3600', ' but if we think of alien civilizations out there', 9479.960000000001, 9484.04, 0.6595433115268592)
('Podcast: Michio Kaku: Future of Humans, Aliens, Space Travel & Physics | Lex Fridman Podcast #45', '18:464', ' So I think when we meet alien life from outer space,', 1316.8600000000001, 1319.5800000000002, 0.6662026419636159)
('Podcast: Alien Debate: Sara Walker and Lee Cronin | Lex Fridman Podcast #279', '71:989', ' because if aliens come to us', 2342.34, 2

In [39]:
# Q2) What are the five most dissimilar segments to segment "267:476"
# Input: "that if we were to meet alien life at some point"
# For each result return the podcast name, the segment id, segment raw text, the start time, stop time, and embedding distance

conn = psycopg2.connect(CONNECTION)
cur = conn.cursor()

cur.execute("""
  WITH bottomfive AS (
    SELECT *, embedding <-> (SELECT embedding from segment where id = '267:476') as distance FROM segment
    WHERE segment.id != '267:476'
    ORDER BY embedding <-> (SELECT embedding from segment where id = '267:476') DESC
    LIMIT 5
  )
  SELECT podcast.title, bottomfive.id, content, start_time, end_time, distance
  FROM bottomfive
  JOIN podcast ON bottomfive.podcast_id = podcast.id
  ORDER BY distance DESC;
""")
for row in cur.fetchall():
  print(row)

conn.commit()
conn.close()

('Podcast: Jason Calacanis: Startups, Angel Investing, Capitalism, and Friendship | Lex Fridman Podcast #161', '119:218', ' a 73 Mustang Grande in gold?', 519.96, 523.8000000000001, 1.6157687685840119)
('Podcast: Rana el Kaliouby: Emotion AI, Social Robots, and Self-Driving Cars | Lex Fridman Podcast #322', '133:2006', ' for 94 car models.', 5818.62, 5820.82, 1.5863359073014982)
('Podcast: Travis Stevens: Judo, Olympics, and Mental Toughness | Lex Fridman Podcast #223', '283:1488', ' when I called down to get the sauna.', 3709.34, 3711.1000000000004, 1.572552805197421)
('Podcast: Jeremy Howard: fast.ai Deep Learning Courses and Research | Lex Fridman Podcast #35', '241:1436', ' which has all the courses pre-installed.', 4068.9, 4071.1400000000003, 1.5663319710412156)
('Podcast: Joscha Bach: Nature of Reality, Dreams, and Consciousness | Lex Fridman Podcast #212', '307:3933', ' and very few are first class and some are budget.', 10648.64, 10650.960000000001, 1.5616341289820461)


In [40]:
# Q3) What are the five most similar segments to segment '48:511'

# Input: "Is it is there something especially interesting and profound to you in terms of our current deep learning neural network, artificial neural network approaches and the whatever we do understand about the biological neural network."
# For each result return the podcast name, the segment id, segment raw text,  the start time, stop time, and embedding distance

conn = psycopg2.connect(CONNECTION)
cur = conn.cursor()

cur.execute("""
  WITH topfive AS (
    SELECT *, embedding <-> (SELECT embedding from segment where id = '48:511') as distance FROM segment
    WHERE segment.id != '48:511'
    ORDER BY embedding <-> (SELECT embedding from segment where id = '48:511')
    LIMIT 5
  )
  SELECT podcast.title, topfive.id, content, start_time, end_time, distance
  FROM topfive
  JOIN podcast ON topfive.podcast_id = podcast.id
  ORDER BY distance;
""")
for row in cur.fetchall():
  print(row)

conn.commit()
conn.close()


('Podcast: Andrew Huberman: Neuroscience of Optimal Performance | Lex Fridman Podcast #139', '155:648', ' Is there something interesting to you or fundamental to you about the circuitry of the brain', 3798.48, 3805.84, 0.652299685331962)
('Podcast: Cal Newport: Deep Work, Focus, Productivity, Email, and Social Media | Lex Fridman Podcast #166', '61:3707', ' of what we might discover about neural networks?', 8498.02, 8500.1, 0.7121050124628524)
('Podcast: Matt Botvinick: Neuroscience, Psychology, and AI at DeepMind | Lex Fridman Podcast #106', '48:512', " And our brain is there. There's some there's quite a few differences. Are some of them to you either interesting or perhaps profound in terms of in terms of the gap we might want to try to close in trying to create a human level intelligence.", 1846.84, 1865.84, 0.7195603322334674)
('Podcast: Yann LeCun: Dark Matter of Intelligence and Self-Supervised Learning | Lex Fridman Podcast #258', '276:2642', ' Have these, I mean, small pockets

In [41]:
# Q4) What are the five most similar segments to segment '51:56'

# Input: "But what about like the fundamental physics of dark energy? Is there any understanding of what the heck it is?"
# For each result return the podcast name, the segment id, segment raw text,  the start time, stop time, and embedding distance

conn = psycopg2.connect(CONNECTION)
cur = conn.cursor()

cur.execute("""
  WITH topfive AS (
    SELECT *, embedding <-> (SELECT embedding from segment where id = '51:56') as distance FROM segment
    WHERE segment.id != '51:56'
    ORDER BY embedding <-> (SELECT embedding from segment where id = '51:56')
    LIMIT 5
  )
  SELECT podcast.title, topfive.id, content, start_time, end_time, distance
  FROM topfive
  JOIN podcast ON topfive.podcast_id = podcast.id
  ORDER BY distance;
""")
for row in cur.fetchall():
  print(row)

conn.commit()
conn.close()

('Podcast: George Hotz: Hacking the Simulation & Learning to Drive with Neural Nets | Lex Fridman Podcast #132', '308:144', " I mean, we don't understand dark energy, right?", 500.44, 502.6, 0.6681965222094363)
('Podcast: Lex Fridman: Ask Me Anything - AMA January 2021 | Lex Fridman Podcast', '243:273', " Like, what's up with this dark matter and dark energy stuff?", 946.22, 950.12, 0.7355511762966292)
('Podcast: Katherine de Kleer: Planets, Moons, Asteroids & Life in Our Solar System | Lex Fridman Podcast #184', '196:685', ' being like, what the hell is dark matter and dark energy?', 2591.72, 2595.9599999999996, 0.7631141596843518)
('Podcast: Alex Filippenko: Supernovae, Dark Energy, Aliens & the Expanding Universe | Lex Fridman Podcast #137', '51:36', ' Do we have any understanding of what the heck that thing is?', 216.0, 219.0, 0.7922019445543276)
('Podcast: Leonard Susskind: Quantum Mechanics, String Theory and Black Holes | Lex Fridman Podcast #41', '122:831', ' That is a big ques

In [43]:
# Q5) For each of the following podcast segments, find the five most similar podcast episodes. Hint: You can do this by averaging over the embedding vectors within a podcast episode.

#     a) Segment "267:476"
conn = psycopg2.connect(CONNECTION)
cur = conn.cursor()

cur.execute("""
  WITH podEp AS (
    SELECT podcast_id from segment
    WHERE segment.id = '267:476'
  ),
  podEmbed AS (
    SELECT AVG(embedding) as avg_embed FROM segment
    WHERE podcast_id = (SELECT podcast_id from podEp)
    GROUP BY podcast_id
  ),
  topfive AS (
    SELECT AVG(embedding) as avg_embed, podcast_id, AVG(embedding) <-> (SELECT podEmbed.avg_embed from podEmbed) as distance FROM segment
    WHERE podcast_id NOT IN (SELECT podcast_id from podEp)
    GROUP BY podcast_id
    ORDER BY AVG(embedding) <-> (SELECT podEmbed.avg_embed from podEmbed)
    LIMIT 5
  )
  SELECT podcast.title, topfive.podcast_id, distance
  FROM topfive
  JOIN podcast ON topfive.podcast_id = podcast.id
  ORDER BY distance;
""")
print("\nTop 5 most similar to segment '267:476'")
for row in cur.fetchall():
  print(row)

conn.commit()
conn.close()

#     b) Segment '48:511'
conn = psycopg2.connect(CONNECTION)
cur = conn.cursor()

cur.execute("""
  WITH podEp AS (
    SELECT podcast_id from segment
    WHERE segment.id = '48:511'
  ),
  podEmbed AS (
    SELECT AVG(embedding) as avg_embed FROM segment
    WHERE podcast_id = (SELECT podcast_id from podEp)
    GROUP BY podcast_id
  ),
  topfive AS (
    SELECT AVG(embedding) as avg_embed, podcast_id, AVG(embedding) <-> (SELECT podEmbed.avg_embed from podEmbed) as distance FROM segment
    WHERE podcast_id NOT IN (SELECT podcast_id from podEp)
    GROUP BY podcast_id
    ORDER BY AVG(embedding) <-> (SELECT podEmbed.avg_embed from podEmbed)
    LIMIT 5
  )
  SELECT podcast.title, topfive.podcast_id, distance
  FROM topfive
  JOIN podcast ON topfive.podcast_id = podcast.id
  ORDER BY distance;
""")
print("\nTop 5 most similar to segment '48:511'")
for row in cur.fetchall():
  print(row)

conn.commit()
conn.close()

#     c) Segment '51:56'
conn = psycopg2.connect(CONNECTION)
cur = conn.cursor()

cur.execute("""
  WITH podEp AS (
    SELECT podcast_id from segment
    WHERE segment.id = '51:56'
  ),
  podEmbed AS (
    SELECT AVG(embedding) as avg_embed FROM segment
    WHERE podcast_id = (SELECT podcast_id from podEp)
    GROUP BY podcast_id
  ),
  topfive AS (
    SELECT AVG(embedding) as avg_embed, podcast_id, AVG(embedding) <-> (SELECT podEmbed.avg_embed from podEmbed) as distance FROM segment
    WHERE podcast_id NOT IN (SELECT podcast_id from podEp)
    GROUP BY podcast_id
    ORDER BY AVG(embedding) <-> (SELECT podEmbed.avg_embed from podEmbed)
    LIMIT 5
  )
  SELECT podcast.title, topfive.podcast_id, distance
  FROM topfive
  JOIN podcast ON topfive.podcast_id = podcast.id
  ORDER BY distance;
""")
print("\nTop 5 most similar to segment '51:56'")
for row in cur.fetchall():
  print(row)

conn.commit()
conn.close()

# For each result return the Podcast title and the embedding distance



Top 5 most similar to segment '267:476'
('Podcast: Demis Hassabis: DeepMind - AI, Superintelligence & the Future of Humanity | Lex Fridman Podcast #299', 'Gfr50f6ZBvo', 0.11301193616129587)
('Podcast: Oriol Vinyals: DeepMind AlphaStar, StarCraft, and Language | Lex Fridman Podcast #20', 'Kedt2or9xlo', 0.11327739413045972)
('Podcast: Ilya Sutskever: Deep Learning | Lex Fridman Podcast #94', '13CZPWmke6A', 0.12476937431692706)
('Podcast: Greg Brockman: OpenAI and AGI | Lex Fridman Podcast #17', 'bIrEM2FbOLU', 0.13182197779463417)
('Podcast: Gary Marcus: Toward a Hybrid of Deep Learning and Symbolic AI | Lex Fridman Podcast #43', 'vNOTDn3D_RI', 0.13208838417321903)

Top 5 most similar to segment '48:511'
('Podcast: Tomaso Poggio: Brains, Minds, and Machines | Lex Fridman Podcast #13', 'aSyZvBrPAyk', 0.12102781315668082)
('Podcast: Jay McClelland: Neural Networks and the Emergence of Cognition | Lex Fridman Podcast #222', 'Ui38ZzTymDY', 0.12674422316158854)
('Podcast: Dileep George: Brain

In [44]:
# Q6) For podcast episode id = VeH7qKZr0WI, find the five most similar podcast episodes. Hint: you can do a similar averaging procedure as Q5

# Input Episode: "Balaji Srinivasan: How to Fix Government, Twitter, Science, and the FDA | Lex Fridman Podcast #331"
# For each result return the Podcast title and the embedding distance

conn = psycopg2.connect(CONNECTION)
cur = conn.cursor()

cur.execute("""
  WITH podEmbed AS (
    SELECT AVG(embedding) as avg_embed FROM segment
    WHERE podcast_id = 'VeH7qKZr0WI'
    GROUP BY podcast_id
  ),
  topfive AS (
    SELECT AVG(embedding) as avg_embed, podcast_id, AVG(embedding) <-> (SELECT podEmbed.avg_embed from podEmbed) as distance FROM segment
    WHERE podcast_id != 'VeH7qKZr0WI'
    GROUP BY podcast_id
    ORDER BY AVG(embedding) <-> (SELECT podEmbed.avg_embed from podEmbed)
    LIMIT 5
  )
  SELECT podcast.title, topfive.podcast_id, distance
  FROM topfive
  JOIN podcast ON topfive.podcast_id = podcast.id
  ORDER BY distance;
""")
print("\nTop 5 most similar to podcast 'VeH7qKZr0WI'")
for row in cur.fetchall():
  print(row)

conn.commit()
conn.close()


Top 5 most similar to podcast 'VeH7qKZr0WI'
('Podcast: Tyler Cowen: Economic Growth & the Fight Against Conformity & Mediocrity | Lex Fridman Podcast #174', '7Grseeycor4', 0.11950103776872197)
('Podcast: Eric Weinstein: Difficult Conversations, Freedom of Speech, and Physics | Lex Fridman Podcast #163', 'ifX_JnBfxTY', 0.1257139025632404)
('Podcast: Michael Malice and Yaron Brook: Ayn Rand, Human Nature, and Anarchy | Lex Fridman Podcast #178', 'Pl3x4GINtBQ', 0.12842690324343972)
('Podcast: Steve Keen: Marxism, Capitalism, and Economics | Lex Fridman Podcast #303', '1XGiTDWfdpM', 0.12916269225753493)
('Podcast: Michael Malice: The White Pill, Freedom, Hope, and Happiness Amidst Chaos | Lex Fridman Podcast #150', 'uykM3NhJbso', 0.13040864953585687)


# Deliverables

You will turn in a ZIP or PDF file containing all your code and a PDF file with the queries and results for questions 1-7.
