<a href="https://colab.research.google.com/github/atilatech/atlas-service/blob/master/notebooks/search_youtube_transcripts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Youtube Search
First, we'll convert a list of videos to mp3 using [pytube](https://github.com/pytube/pytube)

In [1]:
!pip install pytube
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytube
  Downloading pytube-12.1.2-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 3.5 MB/s 
[?25hInstalling collected packages: pytube
Successfully installed pytube-12.1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# where to save
save_path = "./mp3"

# videos = [
#     {
#         "id": "8rWIsc34N9Y",
#         "title": "The Network State with Balaji Srinivasan",
#     },
#     {
#         "id": "ycPr5-27vSI",
#         "title": "Joe Rogan Experience #1169 - Elon Musk",
#     }
# ]

videos = [
    {
        'id': 'ndnmyBaKVd8',
        'title': 'The Network State with Balaji Srinivasan | #𝗦𝗔𝗟𝗧𝗡𝗬',
    },
    {
        'id': 'bGk8qcHc1A0',
        'title': 'Joe Rogan & Lex Fridman: Lionel Messi Is The GOAT Over Cristiano Ronaldo',
    },
]

In [3]:
from pytube import YouTube  # !pip install pytube
from pytube.exceptions import RegexMatchError
from tqdm.auto import tqdm  # !pip install tqdm
from pathlib import Path
# inspired by https://www.pinecone.io/learn/openai-whisper/

yt_videos = []
for video in tqdm(videos):
    # url of video to be downloaded
    url = f"https://www.youtube.com/watch?v={video['id']}"

    # try to create a YouTube vid object
    try:
        yt = YouTube(url)
        yt_videos.append(yt)
    except RegexMatchError:
        print(f"RegexMatchError for '{url}'")
        continue

    itag = None
    # we only want audio files
    files = yt.streams.filter(only_audio=True)
    for file in files:
        # from audio files we grab the first audio for mp4 (eg mp3)
        if file.mime_type == 'audio/mp4':
            itag = file.itag
            break
    if itag is None:
        # just incase no MP3 audio is found (shouldn't happen)
        print("NO MP3 AUDIO FOUND")
        continue

    # get the correct mp3 'stream'
    stream = yt.streams.get_by_itag(itag)
    # downloading the audio
    stream.download(
        output_path=save_path,
        filename=f"{video['id']}.mp3"
    )

  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
yt.views

182578

# Convert MP3 to Transcripts Using Whisper

In [5]:
!pip install git+https://github.com/openai/whisper.git 
# whisper also needs ffmpeg
!apt install ffmpeg

# optional install pytorch so you can use a gpu for faster transcription
# command below is for Linux. See instructions for mac and windows: https://pytorch.org/get-started/locally/
!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-ciwc4gdd
  Running command git clone -q https://github.com/openai/whisper.git /tmp/pip-req-build-ciwc4gdd
Collecting transformers>=4.19.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 5.7 MB/s 
[?25hCollecting ffmpeg-python==0.2.0
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 49.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 52.1 MB/s 
Building wheels for collected packages: whisper
  Build

In [11]:
import whisper
import torch  # install steps: pytorch.org

# if you have access to a gpu you can include this check, else just use cpu
device = "cuda" if torch.cuda.is_available() else "cpu"
whisper.available_models()
tiny_model_en = whisper.load_model("tiny").to(device)
large_model = whisper.load_model("large").to(device)

100%|█████████████████████████████████████| 2.87G/2.87G [00:50<00:00, 60.6MiB/s]


In [None]:


large_model = whisper.load_model("large").to(device)
tiny_model = whisper.load_model("tiny").to(device)
tiny_model_en = whisper.load_model("tiny").to(device)

100%|██████████████████████████████████████| 72.1M/72.1M [00:00<00:00, 120MiB/s]


In [8]:
paths = [str(x) for x in Path('./mp3').glob('*.mp3')]
paths

['mp3/ndnmyBaKVd8.mp3', 'mp3/bGk8qcHc1A0.mp3']

In [12]:
from tqdm.auto import tqdm  # !pip install tqdm
# get list of MP3 audio files
paths = [str(x) for x in Path('./mp3').glob('*.mp3')]
print(len(paths))
print(paths[:5])

videos_dict = {video['id']: video for video in videos}
video_transcripts = []
for i, path in enumerate(tqdm(paths)):
    _id = path.split('/')[-1][:-4]
    # transcribe to get speech-to-text data
    result = large_model.transcribe(path)
    # get the video metadata...
    segments = result['segments']
    video_meta = videos_dict[_id]
    for segment_index, segment in enumerate(segments):
        # merge segments data and videos_meta data
        transcript_meta = {
            **video_meta,
            **{
                "id": f"{_id}-t{segments[segment_index]['start']}",
                "text": segment["text"].strip(),
                "start": segment['start'],
                "end": segment['end']
            }
        }
        video_transcripts.append(transcript_meta)

2
['mp3/ndnmyBaKVd8.mp3', 'mp3/bGk8qcHc1A0.mp3']


  0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
import json

with open("video-transcripts-large.jsonl", "w", encoding="utf-8") as fp:
    for line in tqdm(video_transcripts):
        json.dump(line, fp)
        fp.write('\n')

  0%|          | 0/260 [00:00<?, ?it/s]

# Transcripts to Vectors

Now we'll convert those transcripts into embeddings that will allow them to be searched.

# Stride Text (Overlap Text)

Each chunk itself is too short to be useful, so we'll combine the text together using a technique called striding.

## Without striding
![without striding](https://d33wubrfki0l68.cloudfront.net/a0373a295dbbbe3c3ca686687b47a4e6c1aba11b/0c6f2/images/openai-whisper-4.png)



---


## With striding

![With striding](https://d33wubrfki0l68.cloudfront.net/06f2cfe89c666aeddb1204b9741ae8a964460fb3/f6190/images/openai-whisper-5.png)


In [16]:
video_transcripts[0]

{'id': 'ndnmyBaKVd8-t0.0',
 'title': 'The Network State with Balaji Srinivasan | #𝗦𝗔𝗟𝗧𝗡𝗬',
 'text': "I'm here to talk to you today about something I'm calling the Network State, something you",
 'start': 0.0,
 'end': 13.32}

In [21]:
from tqdm.auto import tqdm

video_transcripts_overlap = []

window = 6  # number of sentences to combine
stride = 3  # number of sentences to 'stride'/overlap over

for i in tqdm(range(0, len(video_transcripts), stride)):
    i_end = min(len(video_transcripts)-1, i+window)
    if video_transcripts[i]['title'] != video_transcripts[i_end]['title']:
        # in this case we skip this entry as we have start/end of two videos
        continue
    text = ' '.join(transcript['text'] 
                    for transcript in
                    video_transcripts[i:i_end])
    video_transcripts_overlap.append({
        'start': video_transcripts[i]['start'],
        'end': video_transcripts[i_end]['end'],
        'title': video_transcripts[i]['title'],
        'text': text,
        'id': video_transcripts[i]['id'],
    })

video_transcripts_overlap[0]

  0%|          | 0/87 [00:00<?, ?it/s]

[{'start': 0.0,
  'end': 29.560000000000002,
  'title': 'The Network State with Balaji Srinivasan | #𝗦𝗔𝗟𝗧𝗡𝗬',
  'text': "I'm here to talk to you today about something I'm calling the Network State, something you may have already seen in the book. But the basic premise is actually very simple, which is we've started new currencies. Could we start new countries? How do we actually go about that? Well, as I mentioned, there's a long-form version.",
  'id': 'ndnmyBaKVd8-t0.0'}]

In [22]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 15.2 MB/s 
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=7390c4c7e8c86d2d176e59c8874e6dafa309bc2aa1de00ce92b1ddb316e79828
  Stored in directory: /root/.cache/pip/wheels/5e/6f/8c/d88aec621f3f542d26fac0342bef5e693335d125f4e54aeffe
Successfully built sentence-transformers
Installing collected packages: sentencepiece, sentence-transformers
Successfully installed sentence

In [31]:
from sentence_transformers import SentenceTransformer

# model_id = "all-MiniLM-L6-v2"
# See other available models:
# https://www.sbert.net/docs/pretrained_models.html
# https://huggingface.co/models?library=sentence-transformers

model_id = "multi-qa-MiniLM-L6-cos-v1"

sentence_transformer_model = SentenceTransformer(model_id)
sentence_transformer_model

mpnet_model_id = "all-mpnet-base-v2"

mpnet_sentence_transformer_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [30]:
dim = sentence_transformer_model.get_sentence_embedding_dimension()
dim


384

In [52]:
from sentence_transformers import SentenceTransformer

mpnet_model_id = "all-mpnet-base-v2"

mpnet_sentence_transformer_model = SentenceTransformer(mpnet_model_id)

In [53]:
mpnet_dim = mpnet_sentence_transformer_model.get_sentence_embedding_dimension()
mpnet_dim

768

# Save Transcript Vectors to Vector Database

In [27]:
!pip install pinecone-client

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pinecone-client
  Downloading pinecone_client-2.0.13-py3-none-any.whl (175 kB)
[K     |████████████████████████████████| 175 kB 6.2 MB/s 
Collecting loguru>=0.5.0
  Downloading loguru-0.6.0-py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 6.3 MB/s 
Installing collected packages: loguru, pinecone-client
Successfully installed loguru-0.6.0 pinecone-client-2.0.13


In [57]:
import pinecone  # !pip install pinecone-client

index_id = "youtube-search"

pinecone.init(
    # get api key at app.pinecone.io, note, it should be kept as a secret in an environment variable
    api_key="<pinecone_api_key>",  
    environment="us-west1-gcp"
)

if index_id not in pinecone.list_indexes():
    pinecone.create_index(
        index_id,
        mpnet_dim,
        metric="dotproduct"
    )

index = pinecone.Index(index_id)
index.describe_index_stats()


ForbiddenException: ignored

In [58]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [59]:
# we encode and insert in batches of 64
batch_size = 64

# loop through in batches of 64
for i in tqdm(range(0, len(video_transcripts_overlap), batch_size)):
    # find end position of batch (for when we hit end of data)
    i_end = min(len(video_transcripts_overlap)-1, i+batch_size)
    # extract the metadata like text, start/end positions, etc
    batch_meta = [{
        "text": video_transcripts_overlap[x]["text"],
        "start": video_transcripts_overlap[x]["start"],
        "end": video_transcripts_overlap[x]["end"],
        # "url": video_transcripts_overlap[x]["url"],
        "title": video_transcripts_overlap[x]["title"]
    } for x in range(i, i_end)]
    # extract only text to be encoded by embedding model
    batch_text = [
        row['text'] for row in video_transcripts_overlap[i:i_end]
    ]
    # create the embedding vectors
    batch_embeds = mpnet_sentence_transformer_model.encode(batch_text, 
                                      show_progress_bar=True).tolist()
    # extract IDs to be attached to each embedding and metadata
    batch_ids = [
        row['id'] for row in video_transcripts_overlap[i:i_end]
    ]
    # 'upsert' (insert) IDs, embeddings, and metadata to index
    to_upsert = list(zip(
        batch_ids, batch_embeds, batch_meta
    ))
    index.upsert(to_upsert)

  0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [60]:
# check that everything has been added
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 84}},
 'total_vector_count': 84}

# Try Search

In [76]:
query = "basketball"

xq = mpnet_sentence_transformer_model.encode(query).tolist()

xq
index.query(xq, top_k=5, include_metadata=True)

{'matches': [{'id': 'bGk8qcHc1A0-t12.96',
              'metadata': {'end': 48.66,
                           'start': 12.96,
                           'text': 'things that divide the populace with the '
                                   "vaccines it's probably Messi versus "
                                   "Cristiano Ronaldo who's the greatest "
                                   'player of all time or currently playing '
                                   'that divides people too, but what is the '
                                   "division? There's just both incredible, "
                                   "but what is it like who's better LeBron or "
                                   'Michael Jordan? Yeah, like that kind of '
                                   'people get very passionate about the '
                                   'extremely passionate Who do you think is '
                                   'better Messi by far?',
                           'title': 'Jo