In [20]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

# Folder (relative to the notebook's working directory) that holds the PDFs.
directory_path = "data/"

# Read every PDF in the folder into a list of Document objects.
loader = PyPDFDirectoryLoader(directory_path)
docs = loader.load()



In [21]:
# Sanity check: how many Document objects did the loader return?
len(docs)

64

In [22]:
# Spot-check one loaded document: first 200 characters of text, then its metadata.
sample_doc = docs[40]
print(sample_doc.page_content[:200])
print(sample_doc.metadata)

[11] Richard S Sutton and Andrew G Barto. Reinforcement learning: An introduction . MIT press,
2018.
[12] Christopher Berner, Greg Brockman, Brooke Chan, Vicki Cheung, Przemysław D˛ ebiak, Christy
Den
{'source': 'data/vpt.pdf', 'page': 10}


In [23]:
import getpass
import os

# Prompt for the key only when the environment does not already hold one, so
# re-running this cell (or running with the variable exported) does not ask again.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# langchain_openai.ChatOpenAI replaces the deprecated
# langchain_community.chat_models.ChatOpenAI; this file already depends on
# langchain_openai for OpenAIEmbeddings.
from langchain_openai import ChatOpenAI

# The variable name `openai` is kept because a later cell references it.
openai = ChatOpenAI(model="gpt-4o-mini")

In [25]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut the loaded documents into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed each chunk with OpenAI and index the vectors in a Chroma store.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

# Retriever configured for similarity search with k=3 results per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [26]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the model; {context} is the template slot that receives
# the retrieved text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

# Two-message chat template: the system instructions, then the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Combine the documents-into-prompt chain with the retriever.
question_answer_chain = create_stuff_documents_chain(openai, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

question = "Tell me about how the VPT model is trained?"
results = rag_chain.invoke({"input": question})

results

{'input': 'Tell me about how the VPT model is trained?',
 'context': [Document(metadata={'page': 3, 'source': 'data/vpt.pdf'}, page_content='Collecting “Clean” Data Training the VPT Foundation Model\nvia Behavioral Cloning\nTraining the Inverse Dynamics Model (IDM)~270k hours\nunlabeled\nvideo~70k hours\nunlabeled\nvideo\n~2k hours\nvideo\nlabeled with\nactionsFilter for “clean”\nvideo segmentsSearch for relevant\nMinecraft videos\nvia keywords\nContractors\ncollect data Label videos\nwith IDM ~70k hours\nvideo\nIDM-labeled\nwith actions\nTrain non-causal IDM\nTrain causal\nVPT Foundation Model\nadspace\nwadspace\nwFigure 2: Video Pretraining (VPT) Method Overview.\n3 Methods\nInverse Dynamics Models (IDM) VPT, illustrated in Figure 2, requires we ﬁrst collect a small\namount of labeled contractor data with which to train an inverse dynamics model pIDM(at|o1...T),\nwhich seeks to minimize the negative log-likelihood of an action at timestep tgiven a trajectory of T\nobservations ot:t∈[

In [None]:
import warnings
import logging
from pydantic import PydanticDeprecatedSince20
# Silence pydantic's PydanticDeprecatedSince20 warnings. The filter is installed
# before the langchain imports below so it is in effect while those modules load.
warnings.filterwarnings("ignore", category=PydanticDeprecatedSince20)
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_core.output_parsers import StrOutputParser
# ChatPromptTemplate is used further down in this cell; import it here so the
# cell does not depend on an earlier cell having imported it.
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
# langchain_openai.ChatOpenAI replaces the deprecated
# langchain_community.chat_models.ChatOpenAI.
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import getpass
import os

# Ask for the OpenAI key only when the environment does not already hold one,
# so re-running this cell does not prompt again.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by one blank
        line (``"\\n\\n"``). An empty iterable yields an empty string.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)


# Load the PDFs found under the data directory.
directory_path = "data/"
loader = PyPDFDirectoryLoader(directory_path)
docs = loader.load()


# Document chunking, embedding creation, and vector-store construction.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
# NOTE(review): no collection name or persist directory is given here. If an
# earlier cell already built a Chroma store in this kernel, these chunks may end
# up in the same default collection as the earlier ones — TODO confirm.
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
# Retriever configured for similarity search with k=3 results per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)


# Chat-model handle for gpt-4o-mini.
llm_gpt4o_mini = ChatOpenAI(model_name="gpt-4o-mini")

# Prompt template. The system message carries the answering instructions and a
# {context} slot; the human message carries the question via {input}.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)




# Build the RAG pipeline. The question string given to the chain is sent to the
# retriever (whose hits format_docs joins into one text) to fill {context}, and
# is passed through unchanged to fill {input}.
rag_chain = (
    {"context": retriever | format_docs, "input": RunnablePassthrough()}
    | prompt
    # Use the model created in this cell rather than `openai`, which only exists
    # if an earlier cell has been run.
    | llm_gpt4o_mini
    | StrOutputParser()
)

# Stream the answer and print each chunk as soon as it arrives.
question = "What is steve and its relationship to the VPT model?"
for chunk in rag_chain.stream(question):
    print(chunk, end="", flush=True)


 ········
