## Develop a versatile Q&A chatbot for seamless interaction with YouTube videos, using LlamaIndex, Astra DB (Apache Cassandra), Gradient's open-source models such as Llama 2, and Gradio

[**Link to my YouTube Channel**](https://www.youtube.com/BhaveshBhatt8791?sub_confirmation=1)

Click on the link below to open a Colab version of the notebook. You will be able to create your own version.

<a href="https://colab.research.google.com/github/bhattbhavesh91/youtube-q-a-gradient-astradb/blob/main/youtube-q-a-gradio-notebook.ipynb" target="_blank"><img height="40" alt="Run your own notebook in Colab" src="https://colab.research.google.com/assets/colab-badge.svg"></a>

# Installation

In [None]:
# Install the notebook's dependencies (quiet mode).
!pip install -q gradio
!pip install -Uq yt-dlp
!pip install -Uq openai-whisper
!pip install -q cassandra-driver
# The specifier must be quoted: unquoted, the shell reads `>=0.1.1` as an
# output redirection, drops the version constraint and creates a file
# named "=0.1.1".
!pip install -q "cassio>=0.1.1"
!pip install -q gradientai --upgrade
!pip install -q llama-index
!pip install -q pypdf
!pip install -q tiktoken==0.4.0

# Imports

In [None]:
import gradio as gr
import random
import os
import time
import re
import yt_dlp
import whisper
import os
import json
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster
from llama_index import ServiceContext
from llama_index import set_global_service_context
from llama_index import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.embeddings import GradientEmbedding
from llama_index.llms import GradientBaseModelLLM
from llama_index.vector_stores import CassandraVectorStore

# Environment Variables

In [None]:
# Gradient credentials. Replace both placeholder strings with real values
# before running; they are read back from os.environ when the Gradient
# embedding model is configured later in this notebook.
os.environ['GRADIENT_ACCESS_TOKEN'] = "Enter your GRADIENT ACCESS TOKEN"
os.environ['GRADIENT_WORKSPACE_ID'] = "Enter your GRADIENT WORKSPACE ID"

# Download Audio from YouTube video function

In [None]:
def download_audio(link):
    """Download the best available audio stream of a YouTube video.

    The file is written using the yt-dlp output template ``%(title)s.mp3``.

    Args:
        link: URL of the YouTube video to download.

    Returns:
        The video title taken from the metadata returned by yt-dlp.
    """
    options = {'extract_audio': True,
               'format': 'bestaudio',
               'outtmpl': '%(title)s.mp3'}
    with yt_dlp.YoutubeDL(options) as video:
        # extract_info(..., download=True) already fetches the media, so a
        # separate video.download() call would only repeat the work.
        info_dict = video.extract_info(link, download=True)
    # NOTE(review): yt-dlp may sanitise characters when building the file
    # name, so "<title>.mp3" is presumably not guaranteed to match the file
    # on disk for titles with special characters -- confirm against callers.
    return info_dict['title']

# Transcribe Audio from mp3 file

In [None]:
def transcribe(model, audio):
    """Transcribe an audio file and store the text for later indexing.

    Args:
        model: Object exposing ``transcribe(audio)`` that returns a mapping
            with a ``"text"`` entry (e.g. a loaded Whisper model).
        audio: Audio input forwarded unchanged to ``model.transcribe``.

    Returns:
        1 once the transcription has been written to
        ``text_files/transcription.txt`` (any previous content is replaced).
    """
    result = model.transcribe(audio)
    # Make sure the target folder exists so the function does not depend on
    # a separate setup step having been run first.
    os.makedirs("text_files", exist_ok=True)
    # Explicit UTF-8 keeps non-ASCII transcripts intact regardless of the
    # platform's default encoding.
    with open("text_files/transcription.txt", 'w', encoding="utf-8") as f:
        f.write(result["text"])
    return 1

# Set up the folder and load the Whisper Model

In [None]:
# Folder that receives the transcription text; exist_ok=True lets this cell
# be re-run without raising FileExistsError.
os.makedirs("text_files", exist_ok=True)

In [None]:
# Load the Whisper "small" model once at notebook level; the chat handler
# passes this `model` to transcribe() for each downloaded video.
model = whisper.load_model("small")

# Set up the DataStax Vector DB Connection

In [None]:
# This secure connect bundle is autogenerated when you download your SCB,
# if yours is different update the file name below
cloud_config= {
  'secure_connect_bundle': 'secure-connect-bhavesh-astra-test.zip'
}

# This token json file is autogenerated when you download your token,
# if yours is different update the file name below
with open("bhavesh_astra_test-token.json") as f:
    secrets = json.load(f)

# The token file is expected to contain "clientId" and "secret" entries.
CLIENT_ID = secrets["clientId"]
CLIENT_SECRET = secrets["secret"]

# Authenticate with the client id / secret pair and open a session against
# the cluster described by the secure connect bundle.
auth_provider = PlainTextAuthProvider(CLIENT_ID, CLIENT_SECRET)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()

# Smoke test: read the release version from system.local to confirm that
# the connection works.
row = session.execute("select release_version from system.local").one()
if row:
  print(row[0])
else:
  print("An error occurred.")

# Define Gradient's Model Adapter for Llama 2

In [None]:
# LLM handed to the service context below: the Gradient base model
# "llama2-7b-chat".
# NOTE(review): max_tokens presumably caps the length of each generated
# answer -- confirm against the Gradient LLM documentation.
llm = GradientBaseModelLLM(
    base_model_slug = "llama2-7b-chat",
    max_tokens = 400,
)

# Configure Gradient embeddings

In [None]:
# Embedding model handed to the service context below: Gradient's
# "bge-large", authenticated with the credentials stored in os.environ.
embed_model = GradientEmbedding(
    gradient_access_token = os.environ["GRADIENT_ACCESS_TOKEN"],
    gradient_workspace_id = os.environ["GRADIENT_WORKSPACE_ID"],
    gradient_model_slug ="bge-large",
)

# Set up the LlamaIndex Service Context

In [None]:
# Bundle the LLM and the embedding model into one service context, with a
# chunk_size of 256.
service_context = ServiceContext.from_defaults(
    llm = llm,
    embed_model = embed_model,
    chunk_size = 256,
)

# Register it as the global default as well; the chat handler still passes
# it explicitly when building the index.
set_global_service_context(service_context)

# Define the ChatBot Interface

In [None]:
# Recognises the common YouTube URL shapes and captures the 11-character
# video id. Compiled once instead of on every chat message.
_YOUTUBE_URL_PATTERN = re.compile(
    r'^(?:https?://)?(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/v/|youtube\.com/user/[^/]+/u/[^/]+/|youtube\.com/attribution_link\?a=|youtube\.com/attribution_user\?u=)([a-zA-Z0-9_-]{11})'
)

# Same relative folder that transcribe() writes to, so the chatbot does not
# depend on the working directory being Colab's /content.
_DOC_PATH = "text_files"


def _has_documents(doc_path):
    """Return True if doc_path is an existing directory with at least one non-directory entry."""
    if not os.path.isdir(doc_path):
        return False
    with os.scandir(doc_path) as entries:
        return any(not entry.is_dir() for entry in entries)


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, chat_history):
        """Handle one chat turn.

        A message that is a YouTube link triggers download + transcription
        of that video. Any other message is answered from the transcribed
        text, or with a hint when nothing has been transcribed yet.

        Args:
            message: Text submitted by the user.
            chat_history: List of (user, bot) pairs; the new exchange is
                appended to it in place.

        Returns:
            A tuple ``("", chat_history)``: the empty string clears the
            textbox and the history refreshes the chatbot widget.
        """
        # Tolerate stray whitespace around a pasted link.
        link = message.strip()
        if _YOUTUBE_URL_PATTERN.match(link):
            vid_title = download_audio(link)
            file_name_audio = str(vid_title) + ".mp3"
            transcribe(model, file_name_audio)
            bot_message = "Transcribed the audio for the following YouTube video {}".format(vid_title)
        elif _has_documents(_DOC_PATH):
            # Build the index from the transcription files and query it.
            documents = SimpleDirectoryReader(_DOC_PATH).load_data()
            index = VectorStoreIndex.from_documents(documents,
                                                    service_context=service_context)
            query_engine = index.as_query_engine()
            bot_message = query_engine.query(message).response
        else:
            # A missing or empty folder is reported to the user instead of
            # raising from the directory walk.
            bot_message = "No data Found, please add a YouTube video"

        chat_history.append((message, bot_message))
        time.sleep(1)
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])

demo.launch()