In [1]:
# for fetching list of youtube videos
from youtubesearchpython import *
import pandas as pd

# for downloading audio and extracting text
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

# for saving embedding to vectordb
import os
import math
import pinecone
import pandas as pd
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

import openai
import requests

import unicodedata
import difflib # for fuzzy string matching

  from tqdm.autonotebook import tqdm


In [2]:
# Build the playlist object from the channel id via playlist_from_channel_id.
# The id itself can be looked up with:
#   channels = ChannelsSearch('JacksonGalaxy')
#   print(channels.result())
channel_id = "UCheL-cUqfzUB8dfM_rFOfDQ"
playlist = Playlist(playlist_from_channel_id(channel_id))

# Keep requesting further videos for as long as the playlist reports that more
# are available, logging the running total after every request.
while True:
    if not playlist.hasMoreVideos:
        break
    print('Getting more videos...')
    playlist.getNextVideos()
    print(f'Videos retrieved: {len(playlist.videos)}')

Getting more videos...
Videos retrieved: 200
Getting more videos...
Videos retrieved: 300
Getting more videos...
Videos retrieved: 400
Getting more videos...
Videos retrieved: 500
Getting more videos...
Videos retrieved: 599
Getting more videos...
Videos retrieved: 699
Getting more videos...
Videos retrieved: 761


In [3]:
# Build the per-video metadata table (title, link, thumbnail URL, running id),
# excluding 'shorts' by keeping only videos strictly longer than one minute.
import time

time_format = '%H:%M:%S'
one_min = time.strptime('00:01:00', time_format)

# Rows are collected in a dict keyed by title and converted to a DataFrame once
# at the end, instead of enlarging the frame cell by cell. Keying by title keeps
# the earlier semantics: a repeated title overwrites the existing row in place
# while still consuming an id.
records = {}
count = 1
for v in playlist.videos:
    duration_str = v['duration']
    if not duration_str:
        # Nothing to parse; skip the entry rather than fail on len()/strptime.
        print(f"Skipping video without a duration: {v['title']}")
        continue
    if len(duration_str) < 6:
        # Short strings carry no hours field (e.g. 'MM:SS'); add one so that
        # they fit time_format.
        duration_str = '0:' + duration_str
    duration = time.strptime(duration_str, time_format)
    if duration > one_min:
        records[v['title']] = {
            'title': v['title'],
            'link': v['link'],
            'img': v['thumbnails'][3]['url'],
            # Stored as float on purpose: the previous row-wise assignment
            # produced a float column, and later cells stringify this value.
            # NOTE(review): switch to int only together with its consumers.
            'id': float(count),
        }
        count = count + 1

stor_metadata = pd.DataFrame(
    list(records.values()),
    index=list(records.keys()),
    columns=['title', 'link', 'img', 'id'],
)

In [4]:
# Download, transcribe, and embed in batches to avoid connection errors.
batch_size = 10
num_batches = math.ceil(len(stor_metadata) / batch_size)

# 1-based number of the batch handled by this run; updated by hand between runs.
# (batch 13, numbers 120-130 skipped)
batch_number = 15

# Row window of stor_metadata covered by this batch, clamped to the table length.
start_idx = batch_size * (batch_number - 1)
end_idx = min(len(stor_metadata), start_idx + batch_size)
print(start_idx, end_idx, sep='\n')

# Rows of the current batch; shown as the cell output.
current_batch = stor_metadata[start_idx:end_idx]
current_batch

140
150


Unnamed: 0,title,link,img,id
Post Vet Cat Aggression Explained,Post Vet Cat Aggression Explained,https://www.youtube.com/watch?v=-OmMDhK2FxQ&li...,https://i.ytimg.com/vi/-OmMDhK2FxQ/hqdefault.j...,142.0
How your mask-wearing affects cats!,How your mask-wearing affects cats!,https://www.youtube.com/watch?v=RVGAlC9Z_VQ&li...,https://i.ytimg.com/vi/RVGAlC9Z_VQ/hqdefault.j...,143.0
Can a Laser Toy Make Your Cat Crazy?!,Can a Laser Toy Make Your Cat Crazy?!,https://www.youtube.com/watch?v=iMk0QDjQ9Wk&li...,https://i.ytimg.com/vi/iMk0QDjQ9Wk/hqdefault.j...,144.0
Why Does My Cat Hump Everything?,Why Does My Cat Hump Everything?,https://www.youtube.com/watch?v=p_V3inwZHt8&li...,https://i.ytimg.com/vi/p_V3inwZHt8/hqdefault.j...,145.0
Single Kitten Syndrome is a Thing,Single Kitten Syndrome is a Thing,https://www.youtube.com/watch?v=_3rh9DdY0yk&li...,https://i.ytimg.com/vi/_3rh9DdY0yk/hqdefault.j...,146.0
Does Your Cat RUN OUT of the litter box?,Does Your Cat RUN OUT of the litter box?,https://www.youtube.com/watch?v=MdGZmbvfl5k&li...,https://i.ytimg.com/vi/MdGZmbvfl5k/hqdefault.j...,147.0
Should I Brush My Cat’s Teeth?,Should I Brush My Cat’s Teeth?,https://www.youtube.com/watch?v=V9oo8mu2K30&li...,https://i.ytimg.com/vi/V9oo8mu2K30/hqdefault.j...,148.0
My Cat Doesn't Like to Be Touched!,My Cat Doesn't Like to Be Touched!,https://www.youtube.com/watch?v=gi3z3sy7twk&li...,https://i.ytimg.com/vi/gi3z3sy7twk/hqdefault.j...,149.0
Why Do Cats Put Their Butt in Your Face?,Why Do Cats Put Their Butt in Your Face?,https://www.youtube.com/watch?v=cHxCRxiaaYA&li...,https://i.ytimg.com/vi/cHxCRxiaaYA/hqdefault.j...,150.0
How to Stop Your Kitten Chewing: Tips & Toys,How to Stop Your Kitten Chewing: Tips & Toys,https://www.youtube.com/watch?v=ERku0zqNatE&li...,https://i.ytimg.com/vi/ERku0zqNatE/hqdefault.j...,151.0


In [5]:
# Download the audio of every video in the current batch into a per-batch
# folder and parse it with OpenAIWhisperParser.
urls = list(current_batch["link"])
save_dir = f"../Downloads/JacksonGalaxy_{batch_number}"
openai.api_key = os.environ["OPENAI_API_KEY"]

audio_loader = YoutubeAudioLoader(urls, save_dir)
loader = GenericLoader(audio_loader, OpenAIWhisperParser())
docs = loader.load()

[youtube:tab] Extracting URL: https://www.youtube.com/watch?v=-OmMDhK2FxQ&list=UUheL-cUqfzUB8dfM_rFOfDQ&index=509&pp=iAQB
[youtube:tab] Downloading just the video -OmMDhK2FxQ because of --no-playlist
[youtube] Extracting URL: https://www.youtube.com/watch?v=-OmMDhK2FxQ
[youtube] -OmMDhK2FxQ: Downloading webpage
[youtube] -OmMDhK2FxQ: Downloading ios player API JSON
[youtube] -OmMDhK2FxQ: Downloading android player API JSON
[youtube] -OmMDhK2FxQ: Downloading m3u8 information
[info] -OmMDhK2FxQ: Downloading 1 format(s): 140
[download] Destination: ../Downloads/JacksonGalaxy_15/Post Vet Cat Aggression Explained.m4a
[download] 100% of    7.84MiB in 00:00:00 at 21.50MiB/s  
[FixupM4a] Correcting container of "../Downloads/JacksonGalaxy_15/Post Vet Cat Aggression Explained.m4a"
[ExtractAudio] Not converting audio ../Downloads/JacksonGalaxy_15/Post Vet Cat Aggression Explained.m4a; file is already in target format m4a
[youtube:tab] Extracting URL: https://www.youtube.com/watch?v=RVGAlC9Z_VQ&l

[ExtractAudio] Not converting audio ../Downloads/JacksonGalaxy_15/Why Do Cats Put Their Butt in Your Face？.m4a; file is already in target format m4a
[youtube:tab] Extracting URL: https://www.youtube.com/watch?v=ERku0zqNatE&list=UUheL-cUqfzUB8dfM_rFOfDQ&index=518&pp=iAQB
[youtube:tab] Downloading just the video ERku0zqNatE because of --no-playlist
[youtube] Extracting URL: https://www.youtube.com/watch?v=ERku0zqNatE
[youtube] ERku0zqNatE: Downloading webpage
[youtube] ERku0zqNatE: Downloading ios player API JSON
[youtube] ERku0zqNatE: Downloading android player API JSON
[youtube] ERku0zqNatE: Downloading m3u8 information
[info] ERku0zqNatE: Downloading 1 format(s): 140
[download] Destination: ../Downloads/JacksonGalaxy_15/How to Stop Your Kitten Chewing： Tips & Toys.m4a
[download] 100% of    9.14MiB in 00:00:00 at 25.94MiB/s  
[FixupM4a] Correcting container of "../Downloads/JacksonGalaxy_15/How to Stop Your Kitten Chewing： Tips & Toys.m4a"
[ExtractAudio] Not converting audio ../Downloa

In [6]:
# Merge the documents that share a 'source' into one text per episode:
# group the pieces by source in arrival order, then join them with single spaces.
pieces_by_source = {}
for doc in docs:
    pieces_by_source.setdefault(doc.metadata['source'], []).append(doc.page_content)

concatenated_text_by_source = {
    source: ' '.join(pieces) for source, pieces in pieces_by_source.items()
}

In [7]:
# Split each video transcript into chunks and build, per chunk, the metadata
# used in the UI. Also saves each video's thumbnail for the UI.
splits = []
metadatas = []

# Split parameters
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

# Downloaded file names differ from the YouTube titles in a few characters
# (e.g. fullwidth '？' / '：' where the title has '?' / ':'), so both sides are
# compared in NFKC-normalised form; fuzzy matching covers what normalisation
# does not. The map leads from the normalised form back to the real title.
title_by_key = {unicodedata.normalize('NFKC', t): t for t in stor_metadata['title']}

for k, episode_text in concatenated_text_by_source.items():

    # Get downloaded title: file name of the source path without '.m4a'
    video_name = os.path.basename(k).split(".m4a")[0]

    # Resolve the matching title before recording anything for this episode, so
    # a failed lookup cannot leave `splits` and `metadatas` out of step.
    matches = difflib.get_close_matches(
        unicodedata.normalize('NFKC', video_name), list(title_by_key)
    )
    closest_name = [title_by_key[m] for m in matches]
    print(video_name)
    print(closest_name)
    if not closest_name:
        raise ValueError(
            f"No title in stor_metadata resembles downloaded file {video_name!r}"
        )
    video_name_corrected = closest_name[0]
    print(video_name_corrected)

    # With title, we can fetch associated metadata we wrote earlier to stor_metadata
    # NOTE(review): the id column holds floats, so this string looks like
    # '142.0' and the thumbnail below is written as '142.0.jpg'. Left unchanged
    # here; confirm what the UI expects before altering it.
    episode_number = str(stor_metadata.loc[video_name_corrected, "id"])
    episode_link = stor_metadata.loc[video_name_corrected, "link"]
    img_url = stor_metadata.loc[video_name_corrected, "img"]

    # Save the video thumbnail for later use in the UI. The request is made and
    # checked before the file is opened, so a failed or hanging download does
    # not leave an empty or bogus .jpg behind.
    response = requests.get(img_url, timeout=30)
    response.raise_for_status()
    with open("../nextjs/public/%s.jpg" % episode_number, 'wb') as f:
        f.write(response.content)
    print(" ")

    # Make splits
    episode_splits = text_splitter.split_text(episode_text)

    # Save metadata: one separate dict per chunk.
    # NOTE(review): "title" carries the downloaded file name (video_name), not
    # the matched YouTube title; kept as is, confirm whether that is intended.
    episode_metadatas = [
        {
            "source": video_name + " " + episode_number,
            "id": episode_number,
            "link": episode_link,
            "title": video_name,
        }
        for _ in episode_splits
    ]

    splits.append(episode_splits)
    metadatas.append(episode_metadatas)

Can a Laser Toy Make Your Cat Crazy？!
['Can a Laser Toy Make Your Cat Crazy?!', '3 Steps To Make Your Cat More Social']
Can a Laser Toy Make Your Cat Crazy?!
 
Should I Brush My Cat’s Teeth？
['Should I Brush My Cat’s Teeth?']
Should I Brush My Cat’s Teeth?
 
How your mask-wearing affects cats!
['How your mask-wearing affects cats!']
How your mask-wearing affects cats!
 
Post Vet Cat Aggression Explained
['Post Vet Cat Aggression Explained', '8 Types of Cat Aggression Explained!']
Post Vet Cat Aggression Explained
 
Why Does My Cat Hump Everything？
['Why Does My Cat Hump Everything?']
Why Does My Cat Hump Everything?
 
How to Stop Your Kitten Chewing： Tips & Toys
['How to Stop Your Kitten Chewing: Tips & Toys']
How to Stop Your Kitten Chewing: Tips & Toys
 
Single Kitten Syndrome is a Thing
['Single Kitten Syndrome is a Thing']
Single Kitten Syndrome is a Thing
 
My Cat Doesn't Like to Be Touched!
["My Cat Doesn't Like to Be Touched!"]
My Cat Doesn't Like to Be Touched!
 
Does Your Cat 

In [8]:
# Flatten the per-episode lists into one flat list of chunks and one flat list
# of metadata dicts, keeping episode order and chunk order.
splits_all = [chunk for episode_chunks in splits for chunk in episode_chunks]
metadatas_all = [meta for episode_metas in metadatas for meta in episode_metas]

In [9]:
# Initialise the Pinecone client; the API key is read from the environment.
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment="us-west4-gcp-free",
)

# Wrap the existing index as a vector store that uses OpenAIEmbeddings.
index_name = "cat-gpt"
embeddings = OpenAIEmbeddings()
p = Pinecone.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

# Disabled on purpose: this would delete everything stored in the index.
#if batch_number == 2:
    #p.delete(delete_all=True)

In [10]:
# Upload texts and their metadata in fixed-size chunks to avoid data ingest errors.
chunk_size = 100
last_chunk = 0  # index of the first chunk to send
num_chunks = math.ceil(len(splits_all) / chunk_size)

# Step through the chunk start offsets directly; the final chunk may be shorter.
for start_idx in range(last_chunk * chunk_size, num_chunks * chunk_size, chunk_size):
    end_idx = min(start_idx + chunk_size, len(splits_all))

    # Add the current chunk to the vector database
    p.add_texts(
        texts=splits_all[start_idx:end_idx],
        metadatas=metadatas_all[start_idx:end_idx],
    )

Upserted vectors:   0%|          | 0/69 [00:00<?, ?it/s]

In [11]:
# Completion marker: printed once all preceding cells have run.
print("done")

done
