In [6]:
import openai
import os
from dotenv import load_dotenv

from langchain.chat_models import ChatOpenAI
# from langchain.callbacks.base import CallbackManager
# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory

# from transcript_search import search_transcript
import pinecone

# Load variables from a local .env file (if any) before reading them below.
load_dotenv()

# Credentials and index settings; each is None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX = os.getenv("PINECONE_INDEX")
PINECONE_ENV = os.getenv("PINECONE_ENV")

# Configure the OpenAI and Pinecone clients once for the whole notebook.
openai.api_key = OPENAI_API_KEY
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV,  # next to api key in console
)

# Shared embedding client; later cells should reuse it so queries are embedded
# with the same model configuration everywhere.
embed = OpenAIEmbeddings(
    model='text-embedding-ada-002',
    openai_api_key=OPENAI_API_KEY
)

In [15]:
from youtube_transcript_api import YouTubeTranscriptApi
import urllib
from youtube_transcript_api.formatters import TextFormatter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def get_video_from_url(video_url):
    """Fetch the transcript of the YouTube video that ``video_url`` points to.

    The video id is taken from the ``v`` query parameter when present
    (``https://www.youtube.com/watch?v=<id>``). Otherwise it is read from the
    path for short links (``https://youtu.be/<id>``) and for
    ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>`` URLs.

    Args:
        video_url: URL of the video as a string.

    Returns:
        Whatever ``YouTubeTranscriptApi.get_transcript`` returns for the id.

    Raises:
        KeyError: if no video id can be found in ``video_url``.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import parse_qs, urlparse

    parsed_url = urlparse(video_url)
    query_params = parse_qs(parsed_url.query)

    if "v" in query_params:
        video_id = query_params["v"][0]
    else:
        host = (parsed_url.hostname or "").lower()
        path_parts = [part for part in parsed_url.path.split("/") if part]

        if host in ("youtu.be", "www.youtu.be") and path_parts:
            # Short link: the id is the first path segment.
            video_id = path_parts[0]
        elif len(path_parts) >= 2 and path_parts[0] in ("embed", "shorts", "live", "v"):
            # Player-style URLs carry the id right after the prefix segment.
            video_id = path_parts[1]
        else:
            # KeyError keeps the exception type of the plain ["v"] lookup.
            raise KeyError(f"no YouTube video id found in URL: {video_url!r}")

    return YouTubeTranscriptApi.get_transcript(video_id)

def preprocess_transcript(transcript):
    """Flatten a transcript into one line of text.

    The transcript is rendered with ``TextFormatter.format_transcript`` and
    every newline in the rendered text is replaced by a single space.

    Args:
        transcript: transcript object accepted by ``TextFormatter``.

    Returns:
        The rendered transcript as a string without newline characters.
    """
    rendered = TextFormatter().format_transcript(transcript)
    # Splitting on "\n" and re-joining with " " swaps each newline for a space.
    return " ".join(rendered.split("\n"))


# Chunk by an arbitrary character budget; a potential improvement is using
# spaCy or NLTK as the splitter.
def chunk_by_text(text, chunk_size = 500, chunk_overlap = 20):
    """Split ``text`` into chunks and wrap each chunk in a ``Document``.

    Args:
        text: the string to split.
        chunk_size: passed to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        A list with one ``Document(page_content=chunk)`` per chunk produced by
        the splitter, in the order the splitter returned them.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    return [
        Document(page_content=chunk)
        for chunk in text_splitter.split_text(text)
    ]

# Example run: fetch one video's transcript, flatten it to a single string,
# and split it into Document chunks.
url = "https://www.youtube.com/watch?v=jSP-gSEyVeI"
ts = get_video_from_url(url)  # result of YouTubeTranscriptApi.get_transcript
formatted_transcript = preprocess_transcript(ts)  # newlines replaced by spaces
docs = chunk_by_text(formatted_transcript)  # default chunk_size=500, chunk_overlap=20
# Bare last expression so the notebook displays the first chunk.
docs[0]

Document(page_content="large language models are incredibly powerful as we've seen but they lack some of the abilities that even the dumbest computer programs can handle with ease logic calculations and search are just a few examples of where large language models fail and really dumb computer programs um can actually perform very well we've been using computers to solve incredibly complex calculations for a very long time yet if we ask gbt4 to tell us the answer to what is 4.1 multiplied by 7.9 it actually fails", metadata={})

In [20]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain


# Chat model that writes the answer from the retrieved context.
chat = ChatOpenAI()
# Reuse the `embed` client configured in the setup cell rather than building a
# second OpenAIEmbeddings with implicit defaults, so queries are embedded with
# one explicit model configuration.
docsearch = Pinecone.from_existing_index(PINECONE_INDEX, embed)
# docsearch = FAISS.from_documents(docs, embed)

# "stuff" chain type: retrieved documents are placed into a single prompt.
qa = RetrievalQA.from_chain_type(chat, chain_type="stuff", retriever=docsearch.as_retriever())
query = "How do you intialize an agent in langchain"
qa.run(query)

# docsearch.similarity_search("How do I create an agent in langchain")


'To initialize an agent in Lang chain, you need to have three key components: a large language model or multiple large language models, a tool that you will be interacting with, and an agent to control the interaction. You can use agents with several other tools, and you can create your own agent. To initialize the language model, you first need to initialize your OpenAI LM, and then you can define the agent by giving it a name, a description of when it should be used, and the function it should run. Different types of agents can be used in Lang chain depending on the task you want to accomplish.'