## **Install required dependencies to run this notebook**

In [None]:
#Make following parameter 1 if you have restarted using this colab notebook
#after a restart of kernel or re-logging into gmail account
running_after_kernel_restart=1
if(running_after_kernel_restart==1):
  ! pip install gradio
  ! pip install langchain
  ! pip install dotenv
  ! pip install
  ! pip install sentence_transformers
  ! pip install chromadb
  !pip install unstructured
  ! pip install youtube-transcript-api
  ! pip install faiss
  ! pip install faiss-cpu


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.34.0-py3-none-any.whl (20.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles (from gradio)
  Downloading aiofiles-23.1.0-py3-none-any.whl (14 kB)
Collecting aiohttp (from gradio)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi (from gradio)
  Downloading fastapi-0.96.1-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client>

# **Model Development**

In [None]:
#import openai
#from langchain.chat_models import ChatOpenAI
import os
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import LLMChain
from dotenv import find_dotenv, load_dotenv
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
import textwrap
# Also passed as the generation `max_length` when the LLM pipeline is built.
chunk_size = 1000

# Load variables from a .env file (if one is found) into the environment.
load_dotenv(find_dotenv())

# Text2text-generation model used to answer questions.
# Alternatives tried earlier: "google/flan-t5-base", "google/flan-t5-xl".
model_name = "google/flan-t5-large"

# Alternative sentence-embedding model name, kept for experiments; it is not
# passed to the embeddings object built below.
# Others tried: "sentence-transformers/all-mpnet-base-v2",
# "intfloat/e5-large-v2", "all-MiniLM-L6-v2".
model_name1 = "sentence-transformers/LaBSE"

model_kwargs = {'device': 'cpu'}
# Kept for experiments with explicit encode options; unused by the object below.
encode_kwargs = {'normalize_embeddings': False}

# Embedding model used to vectorise transcript chunks. No `model_name` is
# passed, so the library's default embedding model is used, on CPU.
# (Previously a LaBSE-based instance was constructed first and immediately
# overwritten by this one; that discarded construction has been removed.)
embeddings = HuggingFaceEmbeddings(model_kwargs=model_kwargs)

#establishing the key
#api_key = os.environ['OPENAI_API_KEY']


#creating a database
def creating_db(video_url, chunk_size=1000, chunk_overlap=100):
    """Build a FAISS vector store from the transcript of a YouTube video.

    The transcript is loaded with ``YoutubeLoader``, split into overlapping
    chunks, and each chunk is embedded with the module-level ``embeddings``
    object. The returned store is what ``get_response`` searches when a user
    asks a question.

    Args:
        video_url: URL of the YouTube video whose transcript is indexed.
        chunk_size: Maximum size of each chunk, passed to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, passed to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        A FAISS vector store. If splitting yields no chunks, the store
        contains a single placeholder text ``"NA"``.
    """
    loader = YoutubeLoader.from_youtube_url(video_url)
    transcript = loader.load()

    # Split the transcript so that each piece handed to the model stays small.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    docs = text_splitter.split_documents(transcript)

    if docs:
        return FAISS.from_documents(docs, embeddings)

    # Nothing to index: fall back to a one-entry placeholder store.
    # `from_texts` takes raw strings; the previous code passed the string "NA"
    # to `from_documents`, which expects Document objects.
    return FAISS.from_texts(["NA"], embeddings)



#creating another function to get response from querying the above database
# Pipelines already loaded in this session, keyed by (model id, max_length),
# so that repeated questions do not reload the model weights.
_LLM_CACHE = {}


def _load_llm(model_id, max_length):
    """Return the text2text-generation pipeline for `model_id`, loading it once."""
    cache_key = (model_id, max_length)
    if cache_key not in _LLM_CACHE:
        _LLM_CACHE[cache_key] = HuggingFacePipeline.from_model_id(
            model_id=model_id,
            task="text2text-generation",
            model_kwargs={"temperature": 3e-1, "max_length": max_length},
        )
    return _LLM_CACHE[cache_key]


def get_response(db, query, k=10):
    """Answer `query` from the transcript chunks stored in `db`.

    The `k` chunks most similar to the question are retrieved and passed to a
    ``RetrievalQA`` chain of type "stuff", built on the model named by the
    module-level ``model_name``. No custom prompt is supplied, so the chain's
    default prompt is used.

    Note: the runs recorded in this notebook (k=10, 1000-character chunks)
    emit a tokenizer warning that the prompt is longer than the model's
    512-token maximum. Pass a smaller `k` to shorten the context.

    Args:
        db: Vector store returned by ``creating_db``.
        query: The user's question.
        k: Number of chunks to retrieve. It is used both for the returned
            `docs` and for the chain's retriever.

    Returns:
        A tuple ``(res, docs)``: `res` is the chain output (the notebook reads
        its 'result' and 'source_documents' entries) and `docs` is the list of
        chunks returned by the similarity search.
    """
    docs = db.similarity_search(query, k=k)

    llm = _load_llm(model_name, chunk_size)

    # Honour the caller's `k`; it was previously hard-coded to 10 here.
    retriever = db.as_retriever(search_type='similarity', search_kwargs={"k": k})
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )

    res = qa(query)
    return res, docs









**Chatting with a YouTube video through interactive prompts**

In [None]:
#calling the functions:

# Fallbacks used when the user just presses Enter at a prompt.
# (Previously the typed URL was always overwritten by the hard-coded one.)
DEFAULT_VIDEO_URL = 'https://www.youtube.com/watch?v=mHWYqJpRoHI'
DEFAULT_QUERY = 'Which topics are they talking in this youtube video'

#User prompt URL
video_url = input('Please enter the url: ').strip() or DEFAULT_VIDEO_URL

db = creating_db(video_url)

#User prompt question
query = input('Please enter your question :: ').strip() or DEFAULT_QUERY
response, docs = get_response(db, query, k=10)
print("Answer from the Flan-T5-Large model :: ")
print(response['result'])

Please enter the url: https://www.youtube.com/watch?v=-7HKIrGWPxo
Please enter your question :: Which topics are they talking in this youtube video


Token indices sequence length is longer than the specified maximum sequence length for this model (2118 > 512). Running this sequence through the model will result in indexing errors


Answer from the Flan-T5-Large model :: 
inventory management


**Check the number of document chunks created after splitting**

In [None]:
    # Check how many chunks the splitter produces for a given video.

    from langchain.document_loaders import YoutubeLoader
    # Other video tried: 'https://www.youtube.com/watch?v=mHWYqJpRoHI'
    video_url='https://www.youtube.com/watch?v=kLzU6wa0YYU'
    loader= YoutubeLoader.from_youtube_url(video_url)
    transcript= loader.load()

    # Split the transcript into chunks small enough to be passed to the model.
    text_splitter= RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

    # List of chunks produced by the splitter.
    docs= text_splitter.split_documents(transcript)
    print("Number of documents created post splitting::")
    print(len(docs))
    # Preview only the first chunk instead of dumping the whole list.
    print(docs[:1])

[Document(page_content="[Music] foreign [Music] management science and I would like everyone to know that management science is the most prestigious journal in operations management so congratulations oh thank you very much bhagwan what is the title of your paper uh it's titled a model for integrated assortment and inventory planning and it's with a co-author Victor Martin is the albinist at the EAC Business School so Sumit tell me a little bit about how you came to this problem why did you decide to work on this so it's very interesting we were looking at a data for different retail stores and what we see is that the assortments that are made available in these stores are not the same right we have large retail stores where we see where they get the full assortment a smaller retail stores they do not get the full assortment they just get an assortment of say the more popular product right and so then we were wondering you know why is this what is driving this yeah and the obvious answ

# **Questions asked to the model**

In [None]:
# Ask for the overall topic, then show the first retrieved transcript chunk
# followed by the model's answer.
query = "Which topic are they talking in video"
response, docs = get_response(db, query=query, k=10)
print(
    "Content of the Youtube transcript::",
    response["source_documents"][0].page_content,
    sep="\n",
)
print(response["result"])

Token indices sequence length is longer than the specified maximum sequence length for this model (2115 > 512). Running this sequence through the model will result in indexing errors


Content of the Youtube transcript::
here we are pushing the frontier a little bit and look at the case where you have an assortment of products right so so which is actually pretty important because usually if I'm buying let's say yogurt I'm not just interested in one I could buy Denon or I could buy something else so you're saying you could offer me different types of products which are substitutes of each other absolutely so that's the innovation in your paper now you're studying not just once product but many such products right so how do we first of all understand what products to stock about products to stock so that is the assortment question and then on top of that how much of each of these products to stock right which is inventory question right so now it's a complicated problem because uh on top of having different types of different assortment of products which are sort of substitutes the margins you get on each one of these products as a seller might be different absolutely

In [None]:
# Ask who takes part in the conversation and print the model's answer.
query = "Who all are discussing in this youtube video"
response, docs = get_response(db, query=query, k=10)
print(response["result"])


Token indices sequence length is longer than the specified maximum sequence length for this model (2117 > 512). Running this sequence through the model will result in indexing errors


Sumit


In [None]:
# Ask which journal is mentioned and print the model's answer.
query = "Which Journal are they talking about in this youtube video"
response, docs = get_response(db, query=query, k=10)
print(response["result"])

Token indices sequence length is longer than the specified maximum sequence length for this model (2121 > 512). Running this sequence through the model will result in indexing errors


Management Science


In [None]:
# Ask for a summary; the answer is kept in `summary` for the export cell.
query = "Summarize things being discussed in this youtube video"
response, docs = get_response(db, query=query, k=10)
summary = str(response["result"])
print(response["result"])

Token indices sequence length is longer than the specified maximum sequence length for this model (2125 > 512). Running this sequence through the model will result in indexing errors


Sumit's paper titled a model for integrated assortment and inventory planning explains how to solve the inventory and assortment problems.


In [None]:
# Ask what is novel about the work discussed and print the model's answer.
query = "What is the novelty of things being discussed in this youtube video"
response, docs = get_response(db, query=query, k=10)
print(response["result"])

Token indices sequence length is longer than the specified maximum sequence length for this model (2110 > 512). Running this sequence through the model will result in indexing errors


you're studying not just once product but many such products


In [None]:
# Ask which problems are discussed; the answer is kept in `problem` for the
# export cell.
query = "What problems are  being discussed in this youtube video"
response, docs = get_response(db, query=query, k=10)
problem = response["result"]
print(problem)

Token indices sequence length is longer than the specified maximum sequence length for this model (2118 > 512). Running this sequence through the model will result in indexing errors


inventory management


In [None]:
# Export the first retrieved transcript chunk, the summary and the problem
# statement to a Word document on Google Drive.
# Relies on `response`, `summary` and `problem` set by the question cells.
from pathlib import Path

import docx
from docx.shared import Inches
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

# Write through an absolute path instead of `%cd`-ing into a relative folder:
# the relative `%cd` failed after the forced remount ("CWD no longer exists").
output_dir = Path('/content/drive/MyDrive/source_documents')
output_dir.mkdir(parents=True, exist_ok=True)

# Create an instance of a word document
doc = docx.Document()

# Add a Title to the document
doc.add_heading('Transcript with Summary', 0)

# Transcript section. Only the first source chunk of the most recent
# `response` is written, not the whole transcript.
doc.add_heading('Transcript:', 3)
transcript = str(response['source_documents'][0].page_content)
para = doc.add_paragraph(transcript)
# Fixed line spacing of 0.5 inches for the transcript paragraph.
para.paragraph_format.line_spacing = Inches(0.5)

# Heading levels must be integers; the previous value 2.5 is replaced by 2.
doc.add_heading('Summary:', 2)
doc.add_paragraph(summary)

doc.add_heading('Problem discussed:', 2)
doc.add_paragraph(problem)

# Now save the document to a location
output_path = output_dir / 'transcript.docx'
doc.save(str(output_path))
print(output_path)

Mounted at /content/drive
[Errno 107] Transport endpoint is not connected: 'MyDrive/source_documents'
/content/drive/My Drive


UsageError: CWD no longer exists - please use %cd to change directory.
