In [30]:
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from dotenv import find_dotenv, load_dotenv
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
import textwrap

In [12]:
# Locate '../KEYS.env' with find_dotenv and load the variables it defines
# into the process environment.
# NOTE(review): presumably this file holds the OpenAI API key needed by the
# OpenAI-backed objects created below -- confirm.
load_dotenv(find_dotenv('../KEYS.env'))

# Embedding model instance; it is reused later in the notebook when the FAISS
# index is built from the transcript chunks.
embeddings = OpenAIEmbeddings()

# URL of the YouTube video whose transcript is analysed in this notebook
video_url = "https://www.youtube.com/watch?v=L_Guz73e6fw"

In [13]:
# --------------------------------------------------------------
# Load the YT video
# --------------------------------------------------------------

# Build a transcript loader bound to the video URL. The transcript itself is
# retrieved later, when loader.load() is called.
loader = YoutubeLoader.from_youtube_url(video_url)

# Display the loader object to confirm that it was created
loader

<langchain.document_loaders.youtube.YoutubeLoader at 0x1db07c0ca60>

In [16]:
# --------------------------------------------------------------
# Creating a Document from YouTube Video Transcripts
# --------------------------------------------------------------

# Load the transcript of the video. The result is indexed like a list of
# documents; the text lives in the `page_content` attribute of each one.
transcript = loader.load()

# Preview the first 200 characters of the first document's text
transcript[0].page_content[:200]

1

In [23]:
# The complete transcript is too long to be sent to the API in a single
# request.

# According to the API documentation, the maximum allowed number of tokens
# is 4096.

# Length of the transcript text. Note that len() counts characters, not
# tokens, so this is only a rough indication of how far over the limit it is.
len(transcript[0].page_content)

129690

In [26]:
# --------------------------------------------------------------
# Splitting the transcript in several chunks
# --------------------------------------------------------------

# Create a text splitter to divide the transcript into smaller chunks.
# chunk_size=1000 and chunk_overlap=100 are measured in characters: each chunk
# holds at most 1000 characters and shares up to 100 with its neighbour.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, 
                                               chunk_overlap=100)

# Split the transcript into smaller documents using the text splitter
docs = text_splitter.split_documents(transcript)

# Length, in characters, of the first chunk (this is NOT the number of chunks;
# use len(docs) for that). It should not exceed chunk_size.
len(docs[0].page_content)

959

#### Convertendo os fragmentos recentemente criados em vetores


Nesta etapa, convertemos os fragmentos recentemente criados em vetores, que são representações numéricas do próprio texto.

Para isso, utilizaremos a biblioteca denominada Facebook AI Similarity Search (`FAISS`), desenvolvida para buscar documentos multimídia que apresentam semelhanças entre si (busca por similaridade). Ou seja, realizaremos uma busca por similaridade para encontrar os trechos mais parecidos com a solicitação feita pelo usuário via prompt.

---

#### Converting the Recently Created Splits into Vectors

In this step, we convert the recently generated splits into vectors, which are numerical representations of the text itself.

To accomplish this, we will employ the library known as Facebook AI Similarity Search (FAISS), developed to search for multimedia documents that exhibit similarities among themselves (similarity search). In other words, we will conduct a similarity search to locate the segments most akin to the user's inquiry via the prompt.

![faiss](../img/faiss.JPG)

In [33]:
# Build a FAISS vector store from the transcript chunks, using `embeddings`
# to turn each chunk into a vector.
# NOTE(review): presumably this sends every chunk to the embeddings API, so
# re-running the cell incurs API calls -- confirm.
db = FAISS.from_documents(docs, embeddings)

# Display the vector store object to confirm that it was created
db

<langchain.vectorstores.faiss.FAISS at 0x1db0762fd90>

In [51]:
# --------------------------------------------------------------
# Combining Everything into a Function 
# --------------------------------------------------------------

def create_db_from_youtube_video_url(video_url, embeddings=None,
                                     chunk_size=1000, chunk_overlap=100):
    """
    Creates a document database from a YouTube video URL.

    The transcript is loaded, split into overlapping chunks, embedded and
    stored in a FAISS index.

    Args:
        video_url (str): The URL of the YouTube video.
        embeddings: Embedding model handed to ``FAISS.from_documents``. When
            None (the default) a fresh ``OpenAIEmbeddings()`` is created, so
            the function does not rely on a notebook-global variable.
        chunk_size (int): Maximum size of each chunk, as understood by
            ``RecursiveCharacterTextSplitter``. Defaults to 1000.
        chunk_overlap (int): Overlap between consecutive chunks. Defaults
            to 100.

    Returns:
        db: A document database created from the video transcript.

    Raises:
        ValueError: If the video yields no transcript chunks to index.
    """
    # Create a loader for the YouTube video transcript and fetch it
    loader = YoutubeLoader.from_youtube_url(video_url)
    transcript = loader.load()

    # Split the transcript into smaller, overlapping documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(transcript)

    # Fail early with a clear message instead of letting the index builder
    # choke on an empty list of documents.
    if not docs:
        raise ValueError(
            f"No transcript content could be loaded for video URL: {video_url!r}"
        )

    # Only build a default embedding model when the caller did not supply one
    if embeddings is None:
        embeddings = OpenAIEmbeddings()

    # Create a document database using FAISS and embeddings
    return FAISS.from_documents(docs, embeddings)

In [42]:
# --------------------------------------------------------------
# Retrieve a response using the provided query from documents
# --------------------------------------------------------------

# The question that is asked about the video
query = "What are they saying about Microsoft?"

# Search the database for the chunks most similar to the query.
# Note: this rebinds `docs`, which previously held ALL transcript chunks from
# the splitting step; from here on it holds only the retrieved matches.
docs = db.similarity_search(query, 
                            k=4) # The number of similar documents to retrieve

# Display the first of the retrieved documents
docs[0]

Document(page_content="amazing folks I've ever met. - It takes a lot of time. Like, I spend, I mean, I think a lot of people claim to spend a third of their time hiring. I, for real, truly do. I still approve every\nsingle hire at OpenAI. And I think there's, you know,\nwe're working on a problem that is like very cool and that\ngreat people wanna work on. We have great people and some\npeople wanna be around them. But, even with that, I think\nthere's just no shortcut for putting a ton of effort into this. - So, even when you have the\ngood people, it's hard work. - I think so. - Microsoft announced the\nnew multi-year multi-billion dollar reported to be 10\nbillion investment into OpenAI. Can you describe the\nthinking that went into this? What are the pros, what are the cons of working with a company like Microsoft? - It's not all perfect or\neasy but, on the whole, they have been an amazing partner to us. Satya and Kevin McHale\nare super aligned with us, super flexible, have gone"

In [41]:
# Concatenate the text of every retrieved document, separated by single spaces
docs_page_content = " ".join(doc.page_content for doc in docs)

# Show the combined context string that will be handed to the model
docs_page_content

"amazing folks I've ever met. - It takes a lot of time. Like, I spend, I mean, I think a lot of people claim to spend a third of their time hiring. I, for real, truly do. I still approve every\nsingle hire at OpenAI. And I think there's, you know,\nwe're working on a problem that is like very cool and that\ngreat people wanna work on. We have great people and some\npeople wanna be around them. But, even with that, I think\nthere's just no shortcut for putting a ton of effort into this. - So, even when you have the\ngood people, it's hard work. - I think so. - Microsoft announced the\nnew multi-year multi-billion dollar reported to be 10\nbillion investment into OpenAI. Can you describe the\nthinking that went into this? What are the pros, what are the cons of working with a company like Microsoft? - It's not all perfect or\neasy but, on the whole, they have been an amazing partner to us. Satya and Kevin McHale\nare super aligned with us, super flexible, have gone and correct calls. And

In [44]:
# Chat model used to generate the answer: gpt-3.5-turbo with temperature 0.2.
# NOTE(review): a low temperature is presumably chosen to keep answers close
# to the transcript -- confirm intent.
chat = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)

# System message template. The {docs} placeholder is filled with the retrieved
# transcript text when the chain is run; the rest instructs the model to answer
# only from the transcript and to say "I don't know" when it cannot.
template = """
        You are a helpful assistant that can answer questions about YouTube videos 
        based on the video's transcript: {docs}
        
        Only use factual information from the transcript to answer the question.
        
        If you feel like you don't have enough information to answer the question, say "I don't know".
        
        Your answers should be verbose and detailed.
        """

In [46]:
# Template for the user's turn; {question} is filled in when the chain runs
human_template = "Answer the following question: {question}"

# Wrap the system and human templates in their message-prompt classes
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

# Assemble the chat prompt: the system message first, then the human message
chat_prompt = ChatPromptTemplate.from_messages([
    system_message_prompt,
    human_message_prompt,
])

In [49]:
# Create an LLMChain that sends the chat prompt to the chat model
chain = LLMChain(llm=chat, prompt=chat_prompt)

# Run the chain: the retrieved transcript text fills {docs} and the query
# fills {question}
response = chain.run(question=query, docs=docs_page_content)

# Flatten the answer onto a single line. Every run of whitespace (including
# line breaks) becomes one space, so the words on either side of a line break
# stay separated instead of being fused together.
response = " ".join(response.split())

response

'In the video, the speaker is discussing their partnership with Microsoft and their experience working with the company. They mention that Microsoft has been an amazing partner to them and that the CEO, Satya Nadella, and Kevin McHale have been super aligned with their goals. They describe Microsoft as flexible and willing to go above and beyond to support their needs. The speaker also praises Satya Nadella for being a visionary leader who can effectively manage and make correct decisions. They mention that Microsoft understood the unique control provisions and the importance of AGI (Artificial General Intelligence) specialness that their project requires, which other companies may not have understood. Overall, the speaker has a positive view of Microsoft and their partnership, highlighting their alignment, flexibility, and effectiveness as a large-scale company.'

In [50]:
# --------------------------------------------------------------
# Combining Everything into a Function 
# --------------------------------------------------------------

def get_response_from_query(db, query, k=4, model_name="gpt-3.5-turbo", temperature=0.2):
    """
    Retrieve a response using the given query from a document database.

    The k documents most similar to the query are fetched from the database,
    their text is joined into one context string, and a chat model is asked to
    answer the query using only that context.

    Args:
        db (FAISS): A document database to search for responses.
        query (str): The query to search for in the database.
        k (int): The number of similar documents to retrieve.
        model_name (str): Name of the chat model to use. Defaults to
            "gpt-3.5-turbo".
        temperature (float): Sampling temperature passed to the chat model.
            Defaults to 0.2.

    Returns:
        str: The response generated based on the query, flattened onto a
            single line with whitespace runs collapsed to one space.
        List[Document]: The list of similar documents from the database.
    """

    # Search for similar documents in the database based on the query
    docs = db.similarity_search(query, k=k)

    # Combine the page content of similar documents into a single string
    docs_page_content = " ".join(d.page_content for d in docs)

    # Create an instance of ChatOpenAI with settings for generating a response
    chat = ChatOpenAI(model_name=model_name, temperature=temperature)

    # Define a template for the system message prompt; {docs} receives the
    # retrieved transcript text
    template = """
        You are a helpful assistant that can answer questions about YouTube videos 
        based on the video's transcript: {docs}
        
        Only use factual information from the transcript to answer the question.
        
        If you feel like you don't have enough information to answer the question, say "I don't know".
        
        Your answers should be verbose and detailed.
        """

    # Create a system message prompt template from the defined template
    system_message_prompt = SystemMessagePromptTemplate.from_template(template)

    # Define a template for the human question prompt
    human_template = "Answer the following question: {question}"
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    # Create a chat prompt template from system and human message prompts
    chat_prompt = ChatPromptTemplate.from_messages(
        [system_message_prompt, human_message_prompt]
    )

    # Create an LLMChain instance for the conversation with the model
    chain = LLMChain(llm=chat, prompt=chat_prompt)

    # Run the chain to generate a response based on the query and documents
    response = chain.run(question=query, docs=docs_page_content)

    # Flatten the response onto one line. Whitespace runs (including line
    # breaks) become a single space so that the words on either side of a
    # line break are not glued together.
    response = " ".join(response.split())

    return response, docs


In [57]:
# --------------------------------------------------------------
# Example usage
# --------------------------------------------------------------
# Inputs: the video to index and the question to ask about it
video_url = "https://www.youtube.com/watch?v=L_Guz73e6fw"
query = "What are they saying about Microsoft?"

# Build the vector database from the video's transcript
db = create_db_from_youtube_video_url(video_url)

# Ask the question; the call also returns the documents retrieved for it
response, docs = get_response_from_query(db, query)

# Wrap the answer at 100 characters per line and print it
wrapped_response = textwrap.fill(response, width=100)
print(wrapped_response)


In the video transcript, the speaker mentions that Microsoft has been an amazing partner to OpenAI.
They highlight the alignment and flexibility between the two companies, with Satya Nadella (the CEO
of Microsoft) and Kevin McHale being mentioned as being super aligned with OpenAI. The speaker also
praises Satya Nadella for being a super effective hands-on executive and manager. They mention that
Microsoft has gone above and beyond to support OpenAI and have been willing to make the necessary
adjustments and investments to make their partnership successful. The speaker also mentions that
Microsoft is a large-scale, for-profit company and acknowledges that there may be pressure to make a
lot of money. However, they also mention that Microsoft understood the unique control provisions
that OpenAI needed for the development of AGI (Artificial General Intelligence), which sets them
apart from other companies at that scale. Overall, the speaker has a positive view of Microsoft and
their part