In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import whisper
from langchain_community.embeddings import OllamaEmbeddings
from pytube import YouTube

In [None]:
# Load the pretrained Whisper checkpoint used by the transcription cell.
model = whisper.load_model(name='base')

In [None]:
# Video to transcribe and index.
youtube_video_url = "https://www.youtube.com/watch?v=Q2u0dKRA1cA"
# pytube handle for the video; exposes its id, watch URL and streams.
youtube_video = YouTube(url=youtube_video_url)

In [None]:
# Watch URL reported by pytube; later inserted into the export table as the
# base of the per-segment links.
watch_url = youtube_video.watch_url
watch_url

In [None]:
# Video id reported by pytube; used to name every file this notebook writes
# (mp4, csv, xlsx) and as the Excel sheet name.
video_id = youtube_video.video_id
video_id

In [None]:
# Take the first audio-only stream pytube lists for this video.
stream = youtube_video.streams.filter(only_audio=True).first()
# `first()` returns None when the filter matches nothing. Fail here with a
# clear message instead of an AttributeError in the download cell.
if stream is None:
    raise RuntimeError(
        f"No audio-only stream available for {youtube_video.watch_url!r}"
    )
stream

In [None]:
# Create the download folder up front so the cell also works on a fresh
# checkout where ./mp4 does not exist yet.
Path("./mp4").mkdir(parents=True, exist_ok=True)
# Save the audio stream under the video id; the transcription cell reads this path.
stream.download(filename=f"./mp4/{video_id}.mp4")

In [None]:
# Transcribe the downloaded audio with Whisper.
# fp16=False is forwarded as a decoding option (presumably to force full
# precision on CPU -- confirm against the Whisper docs).
output = model.transcribe(
    audio=f"./mp4/{video_id}.mp4",
    fp16=False,
)

In [None]:
# Show a compact preview only: the full result also contains every segment
# and would flood the cell output.
{"language": output.get("language"), "text_preview": output["text"][:300]}

In [None]:
# Segment-level records from the Whisper result; one DataFrame row each below.
segments = output["segments"]
# Display only the first few segments instead of the whole list.
segments[:3]

In [None]:
# Tabulate the segments: one row per segment, one column per segment field.
df = pd.DataFrame(data=segments)
df.head()

In [None]:
# Cache the transcription so later cells can reload it from disk.
# to_csv does not create missing directories, so make sure ./csv exists first.
Path("./csv").mkdir(parents=True, exist_ok=True)
df.to_csv(f"./csv/{video_id}.csv", index=False)

In [None]:
# Reload the cached transcription from disk.
df_backup = pd.read_csv(filepath_or_buffer=f"./csv/{video_id}.csv")
df_backup.head(n=3)

In [None]:
# Keep only the columns needed for the export table.
# `.copy()` makes df_export an independent frame, so the insert / `.loc`
# assignment cells that follow do not raise SettingWithCopyWarning.
df_export = df_backup[['id', 'start', 'text']].copy()
df_export.head(5)

In [None]:
# Prepend the video id to every row. The guard keeps the cell re-runnable:
# DataFrame.insert raises ValueError when the column already exists.
if 'video_id' not in df_export.columns:
    df_export.insert(0, 'video_id', video_id)

In [None]:
# Prepend the base watch URL to every row. The guard keeps the cell
# re-runnable: DataFrame.insert raises ValueError when the column already exists.
if 'watch_url' not in df_export.columns:
    df_export.insert(0, 'watch_url', watch_url)

In [None]:
def add_time(row):
    """Return the row's watch URL with a `&t=<seconds>` start offset appended.

    Args:
        row: Mapping-like row providing `watch_url` (str) and `start`
            (number of seconds; truncated to a whole second with `int`).

    Returns:
        The URL followed by `&t=` and the truncated start offset.

    Any `&t=` suffix already on the URL is dropped first, so applying the
    function to an already-stamped column does not stack offsets.
    NOTE(review): this also drops anything after an existing `&t=`; fine for
    the plain watch URLs used in this notebook.
    """
    base_url = row['watch_url'].partition('&t=')[0]
    return base_url + '&t=' + str(int(row['start']))

In [None]:
# Replace each row's URL with its timestamped variant (row-wise apply).
df_export.loc[:, 'watch_url'] = df_export.apply(func=add_time, axis='columns')

In [None]:
# Preview the export table instead of rendering every row.
df_export.head(5)

In [None]:
# Persist the export table; the embedding section reloads it from this file.
df_export.to_csv(path_or_buf=f"./csv/{video_id}_export.csv", index=False)

In [None]:
# Start the embedding stage from the exported table on disk.
df_embed = pd.read_csv(filepath_or_buffer=f"./csv/{video_id}_export.csv")
df_embed.head(n=3)

In [None]:
# Embedding client for the `nomic-embed-text` model, accessed through Ollama.
ollama_emb = OllamaEmbeddings(model="nomic-embed-text")

In [None]:
def add_embed(x):
    """Return the embedding of a row's `text` value.

    Args:
        x: Mapping-like row providing a `text` entry.

    Returns:
        Whatever `ollama_emb.embed_query` returns for that text (the
        notebook later treats it as a sequence of numbers).
    """
    return ollama_emb.embed_query(x['text'])

In [None]:
# Embed every segment's text, one call per row, and store it in a new column.
df_embed.loc[:, 'text_embed'] = df_embed.apply(func=add_embed, axis='columns')

In [None]:
# Quick look at the table now that it carries the embedding column.
df_embed.head(n=3)

In [None]:
# Length of the segment embeddings, measured on the first row.
len_embed = len(df_embed['text_embed'].iat[0])
len_embed

In [None]:
# Create the output folder on first run so the write does not fail on a
# missing ./xlsx directory.
Path("./xlsx").mkdir(parents=True, exist_ok=True)
# One workbook per video, with the sheet named after the video id.
df_embed.to_excel(f"./xlsx/{video_id}_embed.xlsx", sheet_name=video_id, index=False)

In [None]:
# Free-text query to look up in the transcript.
search_term = "yearly return without fees"
search_term

In [None]:
# Embed the query with the same client used for the segments so the two are comparable.
search_term_embed = ollama_emb.embed_query(search_term)
# Show only the leading components; the full vector is long and adds nothing visually.
search_term_embed[:5]

In [None]:
# Length of the query embedding.
len_search_term = len(search_term_embed)
# The similarity step takes a dot product of the query and segment embeddings,
# so their lengths must match. Check it here rather than comparing by eye.
assert len_search_term == len_embed, (
    f"embedding size mismatch: query={len_search_term}, segments={len_embed}"
)
len_search_term

In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between two equal-length numeric vectors.

    Args:
        a: Sequence or array of numbers.
        b: Sequence or array of numbers with the same length as `a`.

    Returns:
        dot(a, b) / (||a|| * ||b||). If either vector has zero norm the
        cosine is undefined; 0.0 is returned instead of dividing by zero
        (which would yield NaN and a RuntimeWarning).
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [None]:
# Score every segment embedding against the query embedding.
df_embed.loc[:, 'cosine_similarity'] = df_embed['text_embed'].apply(
    cosine_similarity, args=(search_term_embed,)
)

In [None]:
# First rows of the table, now including the similarity score.
df_embed.head()

In [None]:
# Rank segments from most to least similar to the search term.
df_sorted = df_embed.sort_values(by='cosine_similarity', ascending=False)
# Show only the best matches rather than rendering the whole table.
df_sorted.head(10)

In [None]:
# Timestamped link of the top-ranked segment.
jump_to_location = df_sorted.iloc[0]['watch_url']
jump_to_location