<a href="https://colab.research.google.com/github/atilatech/atlas-service/blob/master/notebooks/deploy_whisper_huggingface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deploy Whisper Model to HuggingFace

### Install Dependencies

In [None]:
!pip install transformers pytube

# optional install pytorch so you can use a gpu for faster transcription
# command below is for Linux. See instructions for mac and windows: https://pytorch.org/get-started/locally/
!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

!pip install git+https://github.com/openai/whisper.git -q
!apt install ffmpeg # https://stackoverflow.com/questions/51856340/how-to-install-package-ffmpeg-in-google-colab

# Create a Custom inference handler

In [43]:
import whisper
import torch
import pytube
from typing import  Dict
import time


class EndpointHandler():
    def __init__(self, path=""):
        # load the model
        MODEL_NAME = "tiny.en"
        
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f'whisper will use: {device}')
        
        t0 = time.time()
        self.model = whisper.load_model(MODEL_NAME).to(device)
        t1 = time.time()
        
        total = t1-t0
        print(f'Finished loading model in {total} seconds')


    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """
        Args:
            data (:obj:):
                includes the URL to video for transcription
        Return:
            A :obj:`dict`:. transcribed dict
        """
        # process input
        print('data', data)
        video_url = data.pop("inputs", data)
        decode_options = {
            # Set language to None to support multilingual, 
            # but it will take longer to process while it detects the language.
            # Realized this by running in verbose mode and seeing how much time
            # was spent on the decoding language step
            "language":"en",
            "verbose": True
        }
        yt = pytube.YouTube(video_url)
        video_info = {
            'id': yt.video_id,
            'thumbnail': yt.thumbnail_url,
            'title': yt.title,
            'views': yt.views,
            'length': yt.length,
            # redundant since we already have id but it allows the 
            # link to the video be accessed in 1-click in the API response
            'url': f"https://www.youtube.com/watch?v={yt.video_id}"
        }
        stream = yt.streams.filter(only_audio=True)[0]
        path_to_audio = f"{yt.video_id}.mp3"
        stream.download(filename=path_to_audio)
        t0 = time.time()
        transcript = self.model.transcribe(path_to_audio, **decode_options)
        t1 = time.time()
        for segment in transcript['segments']:
          # Remove the tokens field
          segment.pop('tokens', None)
        
        total = t1-t0
        print(f'Finished transcription in {total} seconds')

        # postprocess the prediction
        return {"transcript": transcript, 'video': video_info}


In [None]:
# Use Handler

my_handler = EndpointHandler(path=".")

# prepare sample payload
payload = {"inputs": "https://www.youtube.com/watch?v=aNxigRg1yEQ"}

# test the handler
payload_pred=my_handler(payload)
payload_pred

In [None]:
payload_pred

# Vectorize Transcripts

1. Call the deployed API we created in the last step

1. Combine 6 segments together to create more meaningful sentences

1. Embed sentences into vectors using transformers

1. Save vectors into a vector database

1. Query phrases using vector database

1. [Fixing YouTube Search with OpenAI's Whisper](https://www.pinecone.io/learn/openai-whisper/)

In [36]:
!pip install requests

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [49]:
from getpass import getpass

HUGGING_FACE_ENDPOINT_URL = getpass('Enter HUGGING_FACE_ENDPOINT_URL')
HUGGING_FACE_API_KEY = getpass('Enter HUGGING_FACE_API_KEY')


Enter HUGGING_FACE_ENDPOINT_URL··········
Enter HUGGING_FACE_API_KEY··········


In [50]:
import json
from typing import List
import requests
import base64
import mimetypes

def send_transcription_request(url:str=None):
    payload = json.dumps({
      "inputs": video_url
    })
    headers = {
      'Authorization': f'Bearer {HUGGING_FACE_API_KEY}',
      'Content-Type': 'application/json'
    }

    response = requests.request("POST", HUGGING_FACE_ENDPOINT_URL, headers=headers, data=payload)
    return response.json()

video_url="https://www.youtube.com/watch?v=bGk8qcHc1A0" # Joe Rogan & Lex Fridman: Lionel Messi Is The GOAT Over Cristiano Ronaldo
video_data = send_transcription_request(video_url)

In [52]:
# verify that it worked
video_data['transcript']['segments'][0]

{'id': 0,
 'seek': 0,
 'start': 0.0,
 'end': 1.56,
 'text': " You're a part of mixed martial arts.",
 'temperature': 0.0,
 'avg_logprob': -0.3029073941505561,
 'compression_ratio': 1.6091549295774648,
 'no_speech_prob': 0.13397082686424255}

## Combine Transcript Segments

In [40]:
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [61]:
from tqdm.auto import tqdm

new_transcript_segments = []

def combine_transcripts(video):
  window = 6  # number of sentences to combine
  stride = 3  # number of sentences to 'stride' over, used to create overlap

  video_info=video['video']
  transcript_segments=video['transcript']['segments']
  for i in tqdm(range(0, len(transcript_segments), stride)):
      i_end = min(len(transcript_segments)-1, i+window)
      text = ' '.join(transcript['text'] 
                    for transcript in
                    transcript_segments[i:i_end])
      # TODO: Should int (float to seconds) conversion happen at the API level?
      start=int(transcript_segments[i]['start'])
      end=int(transcript_segments[i]['end'])
      new_transcript_segments.append({
          **video_info,
          **{
          'start': start,
          'end': end,
          'title': video_info['title'],
          'text': text,
          'id': f"{video_info['id']}-t{start}",
          'url': f"https://youtu.be/{video_info['id']}?t={start}",
          }
      })
  return new_transcript_segments
combined_transcripts = combine_transcripts(video_data)

  0%|          | 0/31 [00:00<?, ?it/s]

In [75]:
combined_transcripts[3]

{'id': 'bGk8qcHc1A0-t25',
 'thumbnail': 'https://i.ytimg.com/vi/bGk8qcHc1A0/sddefault.jpg',
 'title': 'Joe Rogan & Lex Fridman: Lionel Messi Is The GOAT Over Cristiano Ronaldo',
 'views': 186879,
 'length': 218,
 'url': 'https://youtu.be/bGk8qcHc1A0?t=25',
 'start': 25,
 'end': 28,
 'text': " But what is the division?  They're just both incredible.  But what is it like who's better LeBron or Michael Jordan?  Is it like that kind of thing?  People get very passionate about that.  They extremely passionate."}

# Convert Transcripts to Vectors

1. Use Sentence Tranformers



In [None]:
!pip install -U sentence-transformers

## Save Transcript Vectors to Vector Database



In [None]:
!pip install pinecone-client

In [70]:
from getpass import getpass

PINECONE_API_KEY = getpass('Enter PINECONE_API_KEY')

Enter PINECONE_API_KEY··········


In [71]:
from sentence_transformers import SentenceTransformer

model_id = "multi-qa-mpnet-base-dot-v1"

sentence_transformer_model = SentenceTransformer(model_id)
sentence_transformer_model

dimensions = sentence_transformer_model.get_sentence_embedding_dimension()

In [72]:
import pinecone  # !pip install pinecone-client

index_id = "youtube-search"

pinecone.init(
    api_key=PINECONE_API_KEY,  # app.pinecone.io
    environment="us-west1-gcp"
)

if index_id not in pinecone.list_indexes():
    pinecone.create_index(
        index_id,
        dimensions,
        metric="dotproduct"
    )

index = pinecone.Index(index_id)
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Upload to Vector Database

In [78]:
# we encode and insert in batches of 64
batch_size = 64

# loop through in batches of 64
for i in tqdm(range(0, len(combined_transcripts), batch_size)):
    # find end position of batch (for when we hit end of data)
    i_end = min(len(combined_transcripts)-1, i+batch_size)
    # extract the metadata like text, start/end positions, etc
    batch_meta = [{
        **combined_transcripts[x]
    } for x in range(i, i_end)]
    # extract only text to be encoded by embedding model
    batch_text = [
        row['text'] for row in combined_transcripts[i:i_end]
    ]
    # create the embedding vectors
    batch_embeds = sentence_transformer_model.encode(batch_text).tolist()
    # extract IDs to be attached to each embedding and metadata
    batch_ids = [
        row['id'] for row in combined_transcripts[i:i_end]
    ]
    # 'upsert' (insert) IDs, embeddings, and metadata to index
    to_upsert = list(zip(
        batch_ids, batch_embeds, batch_meta
    ))
    index.upsert(to_upsert)

  0%|          | 0/1 [00:00<?, ?it/s]

# Make Queries

In [79]:
query = "basketball"

xq = sentence_transformer_model.encode(query).tolist()

index.query(xq, top_k=5, include_metadata=True)

{'matches': [{'id': 'bGk8qcHc1A0-t25',
              'metadata': {'end': 28.0,
                           'id': 'bGk8qcHc1A0-t25',
                           'length': 218.0,
                           'start': 25.0,
                           'text': " But what is the division?  They're just "
                                   'both incredible.  But what is it like '
                                   "who's better LeBron or Michael Jordan?  Is "
                                   'it like that kind of thing?  People get '
                                   'very passionate about that.  They '
                                   'extremely passionate.',
                           'thumbnail': 'https://i.ytimg.com/vi/bGk8qcHc1A0/sddefault.jpg',
                           'title': 'Joe Rogan & Lex Fridman: Lionel Messi Is '
                                    'The GOAT Over Cristiano Ronaldo',
                           'url': 'https://youtu.be/bGk8qcHc1A0?t=25',
                       