In [1]:
# for fetching list of youtube videos
from youtubesearchpython import *
import pandas as pd

# for downloading audio and extracting text
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

# for saving embedding to vectordb
import os
import math
import pinecone
import pandas as pd
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

import openai
import requests

import unicodedata
import difflib # for fuzzy string matching

  from tqdm.autonotebook import tqdm


In [2]:
# The channel id below was pulled out of a one-off channel search:
#   channels = ChannelsSearch('JacksonGalaxy')
#   print(channels.result())
channel_id = "UCheL-cUqfzUB8dfM_rFOfDQ"

# Build the playlist object from the channel id
playlist = Playlist(playlist_from_channel_id(channel_id))

# Keep asking for more videos until the playlist reports there are none left,
# printing the running total after each request
while playlist.hasMoreVideos:
    print('Getting more videos...')
    playlist.getNextVideos()
    retrieved = len(playlist.videos)
    print(f'Videos retrieved: {retrieved}')

Getting more videos...
Videos retrieved: 200
Getting more videos...
Videos retrieved: 300
Getting more videos...
Videos retrieved: 400
Getting more videos...
Videos retrieved: 500
Getting more videos...
Videos retrieved: 599
Getting more videos...
Videos retrieved: 699
Getting more videos...
Videos retrieved: 764


In [3]:
# Build `stor_metadata`: one row per playlist video longer than one minute,
# indexed by video title, with columns title / link / img / id.

# Excluding 'shorts' based on duration of videos
import time
time_format = '%H:%M:%S'
one_min = time.strptime('00:01:00', time_format)

# Rows are collected in a dict keyed by title and turned into a DataFrame once
# at the end, instead of growing the frame cell-by-cell with .loc. Keying by
# title keeps the previous semantics: a repeated title replaces the earlier
# row's values but keeps that row's position.
rows_by_title = {}
duplicate_titles = []

count = 1
for v in playlist.videos:
    duration_str = v['duration']
    if not duration_str:
        # No duration text to compare (presumably live/upcoming entries --
        # TODO confirm); previously this crashed on len(None).
        continue
    if len(duration_str) < 6:
        # Short forms such as 'M:SS' / 'MM:SS' get an hour field so they
        # match time_format.
        duration_str = '0:' + duration_str
    duration = time.strptime(duration_str, time_format)
    if duration <= one_min:
        continue

    # Prefer the 4th thumbnail as before; fall back to the last one available
    # when the list is shorter instead of raising IndexError.
    thumbnails = v['thumbnails']
    thumbnail = thumbnails[3] if len(thumbnails) > 3 else thumbnails[-1]

    title = v['title']
    if title in rows_by_title:
        duplicate_titles.append(title)
    rows_by_title[title] = {
        'title': title,
        'link': v['link'],
        'img': thumbnail['url'],
        # Stored as float: the .loc-built frame ended up with a float id
        # column (ids display as e.g. 382.0), and later cells call str() on it.
        'id': float(count),
    }
    count = count + 1

stor_metadata = pd.DataFrame(list(rows_by_title.values()), index=list(rows_by_title.keys()))

if duplicate_titles:
    # Surface what used to happen silently: only the last video of each
    # repeated title survives, and the id sequence has gaps.
    print(f"WARNING: {len(duplicate_titles)} video(s) share a title with an earlier video "
          f"and replaced it in stor_metadata: {duplicate_titles}")

In [4]:
# Number of videos that are candidates for ingestion
print(len(stor_metadata))

# Download, transcribe, and embed in batches to avoid connection errors
batch_size = 10
num_batches = math.ceil(len(stor_metadata) / batch_size)

# 1-based number of the batch handled by this run
batch_number = 39  # manually updated
# batch 13, numbers 120-130 skipped

# Row positions covered by this batch; the end is clamped to the table length
start_idx = batch_size * (batch_number - 1)
end_idx = min(len(stor_metadata), start_idx + batch_size)

# Pull out the rows of the current batch by position
current_batch = stor_metadata.iloc[start_idx:end_idx]
print(start_idx)
print(end_idx)
current_batch

385
380
385


Unnamed: 0,title,link,img,id
What's Your Cat's Story?,What's Your Cat's Story?,https://www.youtube.com/watch?v=iMSQg8eEXHw&li...,https://i.ytimg.com/vi/iMSQg8eEXHw/hqdefault.j...,382.0
Meet Jackson Galaxy's Cats (and dog),Meet Jackson Galaxy's Cats (and dog),https://www.youtube.com/watch?v=gYoRNwIMAV8&li...,https://i.ytimg.com/vi/gYoRNwIMAV8/hqdefault.j...,383.0
The Story of Mojo The Cat,The Story of Mojo The Cat,https://www.youtube.com/watch?v=BjmqAazOdow&li...,https://i.ytimg.com/vi/BjmqAazOdow/hqdefault.j...,384.0
How To Make Your Cat Not Afraid of Strangers,How To Make Your Cat Not Afraid of Strangers,https://www.youtube.com/watch?v=ihPERHMKQeg&li...,https://i.ytimg.com/vi/ihPERHMKQeg/hqdefault.j...,385.0
The Politics of Litter,The Politics of Litter,https://www.youtube.com/watch?v=34cHfvYOYS0&li...,https://i.ytimg.com/vi/34cHfvYOYS0/hqdefault.j...,386.0


In [5]:
# Links of the videos in the current batch, and a per-batch download folder
urls = current_batch["link"].tolist()
save_dir = f"../Downloads/JacksonGalaxy_{batch_number}"

# Take the OpenAI key from the environment (raises KeyError when it is unset)
openai.api_key = os.environ["OPENAI_API_KEY"]

# Pair the YouTube audio loader with the Whisper parser and load the documents
audio_loader = YoutubeAudioLoader(urls, save_dir)
loader = GenericLoader(audio_loader, OpenAIWhisperParser())
docs = loader.load()

[youtube:tab] Extracting URL: https://www.youtube.com/watch?v=iMSQg8eEXHw&list=UUheL-cUqfzUB8dfM_rFOfDQ&index=760&pp=iAQB
[youtube:tab] Downloading just the video iMSQg8eEXHw because of --no-playlist
[youtube] Extracting URL: https://www.youtube.com/watch?v=iMSQg8eEXHw
[youtube] iMSQg8eEXHw: Downloading webpage
[youtube] iMSQg8eEXHw: Downloading ios player API JSON
[youtube] iMSQg8eEXHw: Downloading android player API JSON
[youtube] iMSQg8eEXHw: Downloading m3u8 information
[info] iMSQg8eEXHw: Downloading 1 format(s): 140
[download] Destination: ../Downloads/JacksonGalaxy_39/What's Your Cat's Story？.m4a
[download] 100% of    2.99MiB in 00:00:00 at 8.15MiB/s   
[FixupM4a] Correcting container of "../Downloads/JacksonGalaxy_39/What's Your Cat's Story？.m4a"
[ExtractAudio] Not converting audio ../Downloads/JacksonGalaxy_39/What's Your Cat's Story？.m4a; file is already in target format m4a
[youtube:tab] Extracting URL: https://www.youtube.com/watch?v=gYoRNwIMAV8&list=UUheL-cUqfzUB8dfM_rFOfD

In [6]:
# Merge the text of all documents that share the same metadata 'source' into
# one string per source, joining consecutive pieces with a single space
concatenated_text_by_source = {}
for doc in docs:
    source = doc.metadata['source']
    if source not in concatenated_text_by_source:
        # First piece seen for this source
        concatenated_text_by_source[source] = doc.page_content
        continue
    concatenated_text_by_source[source] = (
        concatenated_text_by_source[source] + ' ' + doc.page_content
    )

In [7]:
# Split each video and add metadata we'll use in UI
#
# For every transcribed audio file this cell:
#   1. derives the downloaded title from the file name,
#   2. fuzzy-matches it against the titles in stor_metadata,
#   3. splits the transcript into overlapping chunks,
#   4. saves the video thumbnail for the UI, and
#   5. records one metadata dict per chunk.
# `splits` and `metadatas` are parallel lists with one entry per episode.
splits=[]
metadatas=[]

# Split parameters
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500, chunk_overlap = 150)

# Candidate titles for fuzzy matching; built once instead of on every iteration
known_titles = list(stor_metadata['title'])

# Build splits
for k, episode_text in concatenated_text_by_source.items():

    # Get downloaded title
    directory, filename = os.path.split(k)
    video_name = filename.split(".m4a")[0]

    # finding closest name instead of exact match to circumvent special character issues
    closest_name = difflib.get_close_matches(video_name, known_titles)
    print(video_name)
    print(closest_name)
    if not closest_name:
        # get_close_matches returns [] when nothing clears its similarity
        # cutoff; fail with a message that names the file instead of a bare
        # "list index out of range".
        raise IndexError(
            f"No title in stor_metadata is close enough to downloaded file {filename!r}"
        )
    video_name_corrected = closest_name[0]
    print(video_name_corrected)

    # Make splits (appended only after a successful match so that splits and
    # metadatas always stay the same length)
    episode_splits = text_splitter.split_text(episode_text)
    splits.append(episode_splits)

    # With title, we can fetch associated metadata we wrote earlier to stor_metadata
    episode_number = str(stor_metadata.loc[video_name_corrected,"id"])
    episode_link = stor_metadata.loc[video_name_corrected,"link"]
    img_url = stor_metadata.loc[video_name_corrected,"img"]

    # Save the video thumbnail for later use the UI.
    # The image is fetched before the file is opened, so a failed download no
    # longer leaves an empty .jpg or writes an HTTP error page as an image. A
    # thumbnail problem is reported but does not abort the batch.
    try:
        response = requests.get(img_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"WARNING: could not download thumbnail for {video_name_corrected!r}: {err}")
    else:
        with open("../nextjs/public/%s.jpg"%str(episode_number), 'wb') as f:
            f.write(response.content)
    print(" ")

    # Save metadata: one independent dict per chunk of this episode.
    # NOTE(review): "source" and "title" use the file-derived video_name, not
    # video_name_corrected -- left unchanged here; confirm this is intended.
    episode_metadatas = [
        {
            "source": video_name + " " + episode_number,
            "id": episode_number,
            "link": episode_link,
            "title": video_name,
        }
        for _ in episode_splits
    ]
    metadatas.append(episode_metadatas)

Meet Jackson Galaxy's Cats (and dog)
["Meet Jackson Galaxy's Cats (and dog)", "Jackson Galaxy's Star Search!"]
Meet Jackson Galaxy's Cats (and dog)
 
The Story of Mojo The Cat
['The Story of Mojo The Cat']
The Story of Mojo The Cat
 
What's Your Cat's Story？
["What's Your Cat's Story?"]
What's Your Cat's Story?
 
The Politics of Litter
['The Politics of Litter']
The Politics of Litter
 
How To Make Your Cat Not Afraid of Strangers
['How To Make Your Cat Not Afraid of Strangers', 'My Cat is Afraid of Strangers', 'How To Get Your Cat Into a Pet Carrier']
How To Make Your Cat Not Afraid of Strangers
 


In [8]:
# Flatten the per-episode lists of lists into two flat lists, keeping order
splits_all = [chunk for episode_chunks in splits for chunk in episode_chunks]
metadatas_all = [meta for episode_metas in metadatas for meta in episode_metas]

In [9]:
# Initialise the Pinecone client; the key is looked up in the
# PINECONE_API_KEY environment variable (None is passed when it is unset)
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment="us-west4-gcp-free",
)

# Open the already-existing index as a vector store backed by OpenAI embeddings
index_name = "cat-gpt"
embeddings = OpenAIEmbeddings()
p = Pinecone.from_existing_index(embedding=embeddings, index_name=index_name)

# Left disabled: when enabled, this deletes everything in the index if
# batch_number is 2
#if batch_number == 2:
    #p.delete(delete_all=True)

In [10]:
# Add data in chunk to avoid data ingest errors
chunk_size = 100
last_chunk = 0  # index of the first chunk to send
total_texts = len(splits_all)
num_chunks = math.ceil(total_texts / chunk_size)

for chunk_no in range(last_chunk, num_chunks):
    # Bounds of this chunk; the upper bound is clamped to the list length
    lo = chunk_no * chunk_size
    hi = min(total_texts, lo + chunk_size)

    # Send the texts together with their matching metadata to the vector database
    p.add_texts(
        texts=splits_all[lo:hi],
        metadatas=metadatas_all[lo:hi],
    )

Upserted vectors:   0%|          | 0/12 [00:00<?, ?it/s]

In [11]:
# Final cell: prints a completion marker
print('done')

done
