In [None]:
#!pip install youtube-transcript-api
#!pip install google-api-python-client
#!pip install pinecone-client
#!pip install sentence-transformers
!pip install numpy
#!pip install transformers -U

In [42]:
from googleapiclient.discovery import build
from datetime import timedelta
from datasets import load_dataset
from youtube_transcript_api import YouTubeTranscriptApi

import pandas as pd
import time
import pinecone
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
from pinecone import Pinecone, ServerlessSpec


In [None]:
#Google youtube video id downloading API
# Replace with your API key
#Go to the Google Cloud Console.
#Create a new project.
#Enable the YouTube Data API v3 for your project.
#Create API credentials (API key) under APIs & Services > Credentials.
api_key = {'your api key'}

# Initialize the YouTube Data API v3
youtube = build('youtube', 'v3', developerKey=api_key)

def search_videos():
    videos = []
    next_page_token = None
    total_videos = 0
    
    while total_videos < 5000:
        search_response = youtube.search().list(
            q='tech',
            part='snippet',
            type='video',
            maxResults=min(50, 5000 - total_videos),  # Ensure we don't exceed 5000 videos
            videoDuration='short',  # Filter for short videos (< 4 minutes)
            relevanceLanguage='en',  # Videos in English
            videoEmbeddable='true',  # Filter for embeddable videos
            videoCategoryId='28',  # Category ID for Science & Technology
            pageToken=next_page_token
        ).execute()

        for search_result in search_response.get('items', []):
            video_id = search_result['id']['videoId']
            title = search_result['snippet']['title']
            description = search_result['snippet']['description']
            published_at = search_result['snippet']['publishedAt']
            
            video_info = {
                'video_id': video_id,
                'title': title,
                'description': description,
                'published_at': published_at
            }
            
            videos.append(video_info)
            total_videos += 1

            if total_videos >= 5000:
                break

        next_page_token = search_response.get('nextPageToken')

        if not next_page_token:
            break

        # YouTube Data API has a rate limit of 10,000 units per day per project
        # We need to handle rate limiting by waiting before making the next request
        time.sleep(1)  # Wait for 1 second to avoid hitting rate limits

    return videos[:5000]  # Return up to 5000 videos

video_data = search_videos()
video_ids=[]
for i in video_data:
    video_ids.append(i['video_id'])

In [167]:

# retrieve the available transcripts
df = pd.DataFrame(columns=['id','text','duration'])
transcripts = {}
for i in range(0,len(video_ids)):
    try:
        df.loc[i,'id'] = video_ids[i]
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_ids[i])
        text_field = ''
        for transcript in transcript_list:
            list_trans = transcript.translate('en').fetch()
            for k in range(0,len(list_trans)):
                text_field = text_field +' '+ list_trans[k]['text']
                duration = list_trans[k]['start'] +list_trans[k]['duration']
        df.loc[i,'text'] = text_field
        df.loc[i,'duration'] = duration
    except:
        pass



In [168]:
df = df.dropna()
df.shape

(522, 3)

In [169]:
df.head()

Unnamed: 0,id,text,duration
0,EL5GxUuvFak,"This is the world's smallest cyber truck, thi...",52.429
1,11OQcqP4JDc,this is what happens when I turn on notifica...,3.32
2,hGYS-NCNon0,$50 fake iPhone $400 fake iPhone how close c...,4.319
3,wGQHHpemIjk,the redmi note 12 Discovery is one of the fa...,3.51
5,jbe6yA0xipU,All three of us bought the most expensive sma...,61.16


In [172]:
#Average duration of the video files
df.duration.mean()

np.float64(26.055358237547917)

In [49]:
#pinecone
pc = Pinecone(api_key={'your api key'})

In [None]:
pc.create_index(
    name="quickstart",
    dimension=384, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [50]:
pc.list_indexes()

{'indexes': [{'dimension': 384,
              'host': 'quickstart-xfckm4t.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'quickstart',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [51]:
idx = pc.Index('quickstart')

In [52]:
df = df.rename(columns = {'id':'video_id'})

In [53]:
df = df.reset_index().drop('index',axis=1)

In [54]:
#Sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2') # cuda or cpu

In [55]:
df['values'] = df['text'].map(
    lambda x: (model.encode(x)).tolist())
df['id'] = df.reset_index(drop = 'index').index
df['metadata'] = df.apply(lambda x: {
    'text' : x['text'],
    'duration': x['duration'],
    'video_id': x['video_id']
    
}, axis=1)
df_upsert = df[['id', 'values', 'metadata']]

In [187]:
df_upsert['id'] = df_upsert['id'].map(lambda x: str(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_upsert['id'] = df_upsert['id'].map(lambda x: str(x))


In [192]:
idx.upsert_from_dataframe(df_upsert) 

sending upsert requests: 100%|██████████| 522/522 [00:03<00:00, 154.97it/s]


{'upserted_count': 522}

In [56]:
xc = idx.query(vector=(model.encode("tell me about the iphone")).tolist(), # python list
           top_k=10,
           include_metadata=True) 

In [57]:
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}: {result['metadata']['video_id']} ")

0.64:  This is a pretty big iPhone 14 plus.  But how big do you think the biggest iPhone in the world is?  So I wasn't expecting this, but YouTuber Matthew Beem told me he was building the world's largest iPhone a few days ago and then he showed up with it.  This thing is fully functional, stands 7 feet tall, and weighs almost 500 pounds. There's an on/off button here.  There are also volume buttons and a mute button.  It's so high that you have to climb a ladder to take a selfie with the camera because it's right up here.  But you know, I'm the tech YouTuber, so I had to do a little digging into how he did it.  And first of all, watch his video to see my full reaction to it.  But it turns out there's no iPhone here.  It's actually a Mac Mini here, which simulates running a virtual Android phone, and then that Android phone is running a launcher that makes it look like an iPhone, all with some improvised technology around the edges to make it see where your  finger is to make it work a