In [1]:
import requests
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
from dotenv import load_dotenv
import urllib.parse
import os
import openai
import json

load_dotenv()

YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

openai.api_key = OPENAI_API_KEY


# Video Transcript Search

This notebooks goal is to find the most relevant timestamp in a video given a user's natural language description. With online video lectures, tutorials, skimming through can be tedious.

This takes the following steps:

1. Parsing input from youtube api - i.e, given a playlist url or video url, get the transcript
2. Chunking the transcript + associating chunks with metadata like timestamps
3. Doing a similarity search across those chunks given a user query

Ultimately, we want to do this on a playlist level, but a single long video for now will suffice. I included some boilerplate for grabbing playlist info to extend this later.

### Parse the playlist id from the url + read the playlist contents

In [2]:
# Official tutorial playlist on pinecone
pinecone_playlist_url = "https://www.youtube.com/playlist?list=PLRLVhGQeJDTLiw-ZJpgUtZW-bseS2gq9-"

# Deep Dive long form tutorial on langchain agents
langchain_video_url = "https://www.youtube.com/watch?v=jSP-gSEyVeI"

def get_playlist_from_url(playlist_url):

    parsed_url = urllib.parse.urlparse(playlist_url)
    query_params = urllib.parse.parse_qs(parsed_url.query)
    playlist_id = query_params["list"][0]

    # Get the playlist ID
    # playlist_id = "PLRLVhGQeJDTLiw-ZJpgUtZW-bseS2gq9-"

    url = "https://www.googleapis.com/youtube/v3/playlistItems"

    params = {
        'part': 'snippet',
        'maxResults': 25,
        'playlistId': playlist_id,
        'key': YOUTUBE_API_KEY
    }

    response = requests.get(url, params=params)
    return response.json()

def get_video_from_url(video_url):

    parsed_url = urllib.parse.urlparse(video_url)
    query_params = urllib.parse.parse_qs(parsed_url.query)
    video_id = query_params["v"][0]

    return YouTubeTranscriptApi.get_transcript(video_id)


playlist_data = get_playlist_from_url(pinecone_playlist_url)


### Parsing a playlist

To search transcripts across a playlist, we would read all the playlist transcripts and create embeddings

For now, we will focus on our longer video example, to ensure that intra-video search is solid

In [3]:
video_transcripts = {}

for item in playlist_data['items']:

    video_id = item['snippet']['resourceId']['videoId']
    title = item['snippet']['title']
    print(f"Video ID: {video_id}, Title: {title}")

    # read transcipt of a given video
        
    # transcript = YouTubeTranscriptApi.get_transcript(video_id)
    # video_transcripts[video_id] = transcript

Video ID: Q6616MuRmKU, Title: Pinecone #1 - Getting Started
Video ID: DCQrrnFbLt8, Title: Pinecone #2 - Managing Indexes
Video ID: HjeW6ed2dmI, Title: Pinecone #3 - Inserting Data
Video ID: cqzWyNWU8oo, Title: Pinecone #4 - Managing Data
Video ID: iWzjI0ubQEU, Title: Pinecone #5 - Querying Data
Video ID: tn_Y19oB5bs, Title: Pinecone #6 - Metadata Filters


### Example Video Transcript: formatted vs raw & timestamped

In [4]:
transcript = get_video_from_url(langchain_video_url)

formatter = TextFormatter()
formatted_transcript = formatter.format_transcript(transcript).replace("\n", " ")

f"length of formatted transcript string: {len(formatted_transcript)}"

'length of formatted transcript string: 26782'

In [5]:
string_idx = 0

for i,obj in enumerate(transcript):

    obj["string_index"] = string_idx
    string_idx += len(obj["text"]) + 1 # this + 1 is to account for the whitespace during the join

    transcript[i] = obj

print("raw transcript obj")
transcript[0]

raw transcript obj


{'text': 'large language models are incredibly',
 'start': 0.0,
 'duration': 6.12,
 'string_index': 0}

# Search Strategy:


Given a user query, i.e "i want to learn how to setup a pinecone index", return the most relevant chunk of text + a timestamp.

## Chunking


- Chunk the transcript text ~100 words?, associate metadata with each chunk like {chunk, beginning timestamp, video id}
- Create vector embeddings from the chunk, and store the embedding + associated metadata in pinecone
- On a user query, retrieve the most relevant chunk and meta data - call a tool to 1) Give a short answer to the user query and 2) play the video @ timestamp

However, this is restricted to timstamps at each chunk. If we want the timestamp search to be more exact, we'll have to chunk smaller, or get creative. 


## Fine grain search?

Approach 1: Search within chunk

- Once the most relevant chunk is retrieved, do another search, possibly calling the completions api - i.e, where in this chunk is this query answered
- Use an exact text output to retrieve the timestamp within the chunk

Approach 2:

- Build chunk with timestamp. I.e, every 30 seconds of video will correspond to a chunk

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# chunk by an arbitrary chunk size - a potential improvement is using spacy or NLTK as the splitter
def chunk_by_text(text, chunk_size = 500, chunk_overlap = 20):

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap  = chunk_overlap
    )

    docs = []  # List holding all the documents

    for i,chunk in enumerate(text_splitter.split_text(text)):
        # Generate documents
        string_index = i * (chunk_size - chunk_overlap)

        docs.append(Document(
            page_content=chunk, 
            metadata={
                # "string_index": i * (chunk_size - chunk_overlap)
                "chunk_timestamp": match_timestamp(string_index, transcript)
            }))

    return docs

# this function basically walks through the timestamps, and finds the closest one before the given chunk
# definitely room for optimization here - I think its o(n^2) when there is o(n) solution

def match_timestamp(string_index, raw_transcript):

    for i,timestamp_obj in enumerate(raw_transcript):

        if string_index == 0:
            return raw_transcript[0]["start"]
            

        elif timestamp_obj['string_index'] == string_index:
            return raw_transcript[i-1]["start"]
             

        elif timestamp_obj['string_index'] > string_index:
            return raw_transcript[i-1]["start"]
            

        elif i == len(raw_transcript) - 1:
            return raw_transcript[-1]["start"]
            
        

chunked_text = chunk_by_text(formatted_transcript)

[Document(page_content="large language models are incredibly powerful as we've seen but they lack some of the abilities that even the dumbest computer programs can handle with ease logic calculations and search are just a few examples of where large language models fail and really dumb computer programs um can actually perform very well we've been using computers to solve incredibly complex calculations for a very long time yet if we ask gbt4 to tell us the answer to what is 4.1 multiplied by 7.9 it actually fails", metadata={'chunk_timestamp': 0.0}),
 Document(page_content="it actually fails isn't it fascinating that simple calculator program can do this but what is probably the most sophisticated AI program in the world right now that is accessible by us cannot and that's not all if I ask GT4 my small overused example by now of how do I use the LM in line chain it struggles again it's true that line chain was a blockchain project yeah there didn't seem to be any nlm chain component n

In [7]:
chunked_text[0]

Document(page_content="large language models are incredibly powerful as we've seen but they lack some of the abilities that even the dumbest computer programs can handle with ease logic calculations and search are just a few examples of where large language models fail and really dumb computer programs um can actually perform very well we've been using computers to solve incredibly complex calculations for a very long time yet if we ask gbt4 to tell us the answer to what is 4.1 multiplied by 7.9 it actually fails", metadata={'chunk_timestamp': 0.0})

### Create embeddings and do similarity search

In [8]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
import tiktoken

embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(chunked_text, embeddings)

# Experiment

To experiment, change the query and run the following two cells - you can go to the reference video here:

https://www.youtube.com/watch?v=jSP-gSEyVeI

The last cell will return a link to the predicted timestamp, and say the timestamp too. Double check, because sometimes
youtube caches your video progress and overrides the timestamp parameter

In [9]:
query = "How do I create a langchain agent"
similar_docs = db.similarity_search_with_score(query)

similar_docs

[(Document(page_content="and there are things you can do you can create your own agent you can use agents with several other tools and another thing worth mentioning is that you can use a tracing UI tool that is within Lang chain which will allow you to understand within a beautiful UI how the agent is thinking on what different calls to different llms it did within its thought process so that is really convenient when you're using complex agents with several tools and it might be tricky to track what the whole thought", metadata={'chunk_timestamp': 1857.72}),
  0.3301338),
 (Document(page_content="and it can even perform SQL queries let's start with a very simple example of this what we're going to do is build a calculator agent that can also handle some general knowledge queries now to use agents in line chain we need three key components that is a large language model or multiple large language models a tool that we will be interacting with and an Asian to control the interaction le

In [10]:
best_doc = similar_docs[-1][0]
seconds = best_doc.metadata["chunk_timestamp"]
s = f"https://www.youtube.com/watch?v=jSP-gSEyVeI&t={seconds}"

def convert_seconds(seconds):
    minutes, seconds = divmod(seconds, 60)
    return int(minutes), int(seconds)

# Example usage:
minutes, seconds = convert_seconds(seconds)


print(s)
print(f"exact timestamp - {minutes}:{seconds}")

https://www.youtube.com/watch?v=jSP-gSEyVeI&t=992.699
exact timestamp - 16:32
