<a href="https://colab.research.google.com/github/amrindersingh03/Unstructured-Machine-Learning-/blob/main/Langchain_transcription_and_Semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook transcribes a YouTube video with OpenAI's Whisper model and then performs semantic search over the transcription using LangChain.

In [1]:
# Make sure you are connected to a GPU runtime

### Install pytube: a library for downloading audio from YouTube

In [2]:
pip install pytube # For audio downloading

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytube
  Downloading pytube-12.1.2-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 KB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-12.1.2


In [None]:
# Get Whisper. Whisper is a speech recognition and translation model from OpenAI.

In [24]:
pip install git+https://github.com/openai/whisper.git -q # Whisper from OpenAI transcription model

  Preparing metadata (setup.py) ... [?25l[?25hdone


In [4]:
import whisper 
import pytube 

### Pick any YouTube video that you wish to transcribe. Here, I use a video of a Steve Jobs speech.

In [5]:
# Address of the YouTube video to transcribe.
url = "https://www.youtube.com/watch?v=Tuw8hxrFBH8"
video = pytube.YouTube(url) # pytube handle for that video; its streams are queried in the next cell

In [6]:
# NOTE(review): pytube's get_audio_only() is documented to default to the mp4
# subtype, so the saved file may be MP4 audio despite its ".mp3" name - confirm.
audio = video.streams.get_audio_only() # Select an audio-only stream of the video
audio.download(filename='tmp.mp3') # Download that stream and save it as 'tmp.mp3'; returns the saved file's path

'/content/tmp.mp3'

In [25]:
# Load the whisper model

In [7]:
# Load Whisper's "small" checkpoint (the recorded run downloaded about 461 MB for it).
model = whisper.load_model("small")

100%|███████████████████████████████████████| 461M/461M [00:07<00:00, 67.1MiB/s]


In [None]:
# Perform transcription on the audio using the Whisper model `model`

In [8]:
# Transcribe the audio file written by the download cell. The path is relative,
# matching the `filename='tmp.mp3'` used there, so it resolves against the same
# working directory instead of being tied to Colab's /content folder.
# The result is stored in `transcription`.
transcription = model.transcribe('tmp.mp3')

### Let's see what this transcription looks like

In [26]:
# Display the raw Whisper result (its 'text' entry holds the full transcript).
transcription

{'text': " Today, I want to tell you three stories from my life. That's it. No big deal. Just three stories. The first story is about connecting the dots. I dropped out of Reed College after the first six months, but then stayed around as a drop-in for another 18 months or so before I really quit. So why'd I drop out? It started before I was born. My biological mother was a young unwed graduate student, and she decided to put me up for adoption. She felt very strongly that I should be adopted by college graduates, so everything was all set for me to be adopted at birth by a lawyer and his wife. Except that when I popped out, they decided at the last minute that they really wanted a girl. So my parents, who were on a waiting list, got a call in the middle of the night asking, we've got an unexpected baby boy. Do you want him? They said, of course. My biological mother found out later that my mother had never graduated from college and that my father had never graduated from high school.

In [None]:
# As shown above, the transcription is a dictionary

In [9]:
res = transcription['segments'] # Keep only the 'segments' entry of the `transcription` dictionary

In [27]:
# Display the segments: a list of dicts carrying 'start'/'end' times, 'text', and decoding metadata.
res

[{'id': 0,
  'seek': 0,
  'start': 0.0,
  'end': 3.2800000000000002,
  'text': ' Today, I want to tell you three stories from my life.',
  'tokens': [2692, 11, 286, 528, 281, 980, 291, 1045, 3676, 490, 452, 993, 13],
  'temperature': 0.0,
  'avg_logprob': -0.16006608850815718,
  'compression_ratio': 1.4879227053140096,
  'no_speech_prob': 0.023234836757183075},
 {'id': 1,
  'seek': 0,
  'start': 3.2800000000000002,
  'end': 6.24,
  'text': " That's it. No big deal. Just three stories.",
  'tokens': [663, 311, 309, 13, 883, 955, 2028, 13, 1449, 1045, 3676, 13],
  'temperature': 0.0,
  'avg_logprob': -0.16006608850815718,
  'compression_ratio': 1.4879227053140096,
  'no_speech_prob': 0.023234836757183075},
 {'id': 2,
  'seek': 0,
  'start': 10.16,
  'end': 13.120000000000001,
  'text': ' The first story is about connecting the dots.',
  'tokens': [440, 700, 1657, 307, 466, 11015, 264, 15026, 13],
  'temperature': 0.0,
  'avg_logprob': -0.16006608850815718,
  'compression_ratio': 1.487922

In [None]:
# Let's arrange the data in a more organised and readable manner.

In [10]:
from datetime import datetime

def store_segments(segments):
  """Split transcription segments into parallel lists of text and start time.

  Args:
    segments: Iterable of dicts, each holding a 'text' string and a 'start'
      offset in seconds measured from the beginning of the audio.

  Returns:
    A `(texts, start_times)` tuple of equal-length lists. `texts` holds each
    segment's text unchanged; `start_times` holds the matching start offset
    formatted as "HH:MM:SS" (fractional seconds are dropped).
  """
  texts = []
  start_times = []

  for segment in segments:
    # `start` is an elapsed duration, not a wall-clock instant, so format it
    # with plain arithmetic. Routing it through datetime.fromtimestamp() would
    # apply the machine's local UTC offset (0 s is "00:00:00" only on a UTC
    # host) and would wrap around for offsets of 24 hours or more.
    total_seconds = int(segment['start'])
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    texts.append(segment['text'])
    start_times.append(f"{hours:02d}:{minutes:02d}:{seconds:02d}")

  return texts, start_times

In [11]:
# Preview the (texts, start_times) tuple produced from the Whisper segments.
store_segments(res)

([' Today, I want to tell you three stories from my life.',
  " That's it. No big deal. Just three stories.",
  ' The first story is about connecting the dots.',
  ' I dropped out of Reed College after the first six months, but then stayed around as a drop-in',
  " for another 18 months or so before I really quit. So why'd I drop out?",
  ' It started before I was born. My biological mother was a young unwed graduate student,',
  ' and she decided to put me up for adoption. She felt very strongly that I should be adopted by',
  ' college graduates, so everything was all set for me to be adopted at birth by a lawyer and his wife.',
  ' Except that when I popped out, they decided at the last minute that they really wanted a girl.',
  ' So my parents, who were on a waiting list, got a call in the middle of the night asking,',
  " we've got an unexpected baby boy. Do you want him? They said, of course.",
  ' My biological mother found out later that my mother had never graduated from colle

In [12]:
# Keep the two parallel lists; the chunking cell further down pairs each text with its start time.
texts, start_times = store_segments(res)

In [None]:
# Install langchain, to perform semantic search

In [13]:
pip install langchain

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langchain
  Downloading langchain-0.0.86-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.2/250.2 KB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.6.0,>=0.5.7
  Downloading dataclasses_json-0.5.7-py3-none-any.whl (25 kB)
Collecting marshmallow-enum<2.0.0,>=1.5.1
  Downloading marshmallow_enum-1.5.1-py2.py3-none-any.whl (4.2 kB)
Collecting typing-inspect>=0.4.0
  Downloading typing_inspect-0.8.0-py3-none-any.whl (8.7 kB)
Collecting mypy-extensions>=0.3.0
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensions, typing-inspect, marshmallow-enum, dataclasses-json, langchain
Successfully installed dataclasses-json-0.5.7 langchain-0.0.86 marshmallow-enum-1.5.1 mypy-extensions-1.0.0 typing-inspect-0.8.0


In [14]:
pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.26.5.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai
  Building wheel for openai (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai: filename=openai-0.26.5-py3-none-any.whl size=67620 sha256=9c113343e472f27ca090e744ff769e1d478c170cb61a0b64ac89869d1ab3a11c
  Stored in directory: /root/.cache/pip/wheels/a7/47/99/8273a59fbd59c303e8ff175416d5c1c9c03a2e83ebf7525a99
Successfully built openai
Installing collected packages: openai
Successfully installed openai-0.26.5


In [17]:
pip install --upgrade faiss-gpu==1.7.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu==1.7.1
  Downloading faiss_gpu-1.7.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (89.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.1


In [19]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.chains import VectorDBQAWithSourcesChain
from langchain import OpenAI
import openai
import faiss

In [20]:
import os
from getpass import getpass

# SECURITY: never store a literal API key in the notebook. The key that used to
# be hardcoded in this cell was published with the file and should be revoked.
# Use the key already present in the environment, or prompt for it with hidden
# input when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [21]:
# Split every transcript segment on newlines (chunk_size=1500) and tag each
# resulting chunk with its segment's start time, so that answers can cite the
# position in the video they came from.
text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
docs = []
metadatas = []
for segment_text, start_time in zip(texts, start_times):
    splits = text_splitter.split_text(segment_text)
    docs.extend(splits)
    # Build one fresh dict per chunk: `[{...}] * len(splits)` would repeat a
    # single shared dict object, so mutating one entry would change them all.
    metadatas.extend({"source": start_time} for _ in splits)
embeddings = OpenAIEmbeddings()

In [None]:
# !apt install libomp-dev
# !python -m pip install --upgrade faiss faiss-gpu
# import faiss

In [22]:
# Build a FAISS vector store from the chunks, their embeddings and their
# start-time metadata, then write the underlying FAISS index to "docs.index".
# NOTE(review): the recorded run of this cell ended in a RateLimitError, so
# `store` was presumably never created in that session - confirm and re-run.
store = FAISS.from_texts(docs, embeddings, metadatas=metadatas)
faiss.write_index(store.index, "docs.index")

RateLimitError: ignored

In [20]:
# Question-answering chain with source citations, backed by `store` and an OpenAI LLM at temperature 0.
# NOTE(review): the recorded NameError presumably comes from `store` being undefined
# after the vector-store cell failed - confirm.
chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(temperature=0), vectorstore=store)

NameError: ignored

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): none of the visible cells read from or write to the mounted
# drive - confirm this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Ask the chain a question; the 'answer' and 'sources' entries of the result are printed in the next cell.
result = chain({"question": "How old was Steve Jobs when started Apple?"})

In [None]:
# Show the answer together with the start times of the segments it was drawn from.
answer, sources = result['answer'], result['sources']
print(f"Answer: {answer}  Sources: {sources}")

Answer:  Steve Jobs was 20 when he started Apple.  Sources: 00:05:47, 00:05:59
