In [7]:
from pathlib import Path

from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains.question_answering import load_qa_chain

In [8]:
# Ollama model name; the same value is passed to both the LLM and the embeddings.
MODEL = 'llama3.2'
# Relative path of the PDF that is loaded, split, and indexed.
PDF_LINK = "pdf/mapreduce-osdi04.pdf"

In [9]:

# LLM handed to the question-answering chain.
llm = Ollama(model=MODEL)
# Embedding function handed to Chroma; it is built from the same model name as the LLM.
embeddings = OllamaEmbeddings(model=MODEL)


In [10]:
# PDF loader for PDF_LINK; extract_images=False is passed explicitly.
loader = PyPDFLoader(PDF_LINK, extract_images=False)
# NOTE(review): load_and_split() presumably already applies a default text
# splitter -- confirm whether the explicit split in the next cell is still needed.
pages = loader.load_and_split()
# Display the loaded Document objects (each carries 'source' and 'page' metadata).
pages

[Document(metadata={'source': 'pdf/mapreduce-osdi04.pdf', 'page': 0}, page_content="MapReduce: Simpli\x02ed Data Processing onLargeClusters\nJeffreyDean andSanjay Ghema wat\njeff@google.com, sanjay@google.com\nGoogle,Inc.\nAbstract\nMapReduce isaprogramming model andanassoci-\nated implementation forprocessing andgenerating large\ndata sets. Users specify amap function thatprocesses a\nkey/valuepairtogenerate asetofintermediate key/value\npairs, andareduce function thatmergesallintermediate\nvalues associated with thesame intermediate key.Many\nrealworld tasks areexpressible inthismodel, asshown\ninthepaper .\nPrograms written inthisfunctional style areautomati-\ncally parallelized andexecuted onalargecluster ofcom-\nmodity machines. Therun-time system takescare ofthe\ndetails ofpartitioning theinput data, scheduling thepro-\ngram' sexecution across asetofmachines, handling ma-\nchine failures, andmanaging therequired inter-machine\ncommunication. This allowsprogrammers without any\nex

In [11]:
# Splitter created with its default settings (no arguments are passed).
text_splitter = RecursiveCharacterTextSplitter()
# Split the loaded documents into the chunks that are stored in the vector index.
chunks = text_splitter.split_documents(pages)
# Display the resulting chunks.
chunks

[Document(metadata={'source': 'pdf/mapreduce-osdi04.pdf', 'page': 0}, page_content="MapReduce: Simpli\x02ed Data Processing onLargeClusters\nJeffreyDean andSanjay Ghema wat\njeff@google.com, sanjay@google.com\nGoogle,Inc.\nAbstract\nMapReduce isaprogramming model andanassoci-\nated implementation forprocessing andgenerating large\ndata sets. Users specify amap function thatprocesses a\nkey/valuepairtogenerate asetofintermediate key/value\npairs, andareduce function thatmergesallintermediate\nvalues associated with thesame intermediate key.Many\nrealworld tasks areexpressible inthismodel, asshown\ninthepaper .\nPrograms written inthisfunctional style areautomati-\ncally parallelized andexecuted onalargecluster ofcom-\nmodity machines. Therun-time system takescare ofthe\ndetails ofpartitioning theinput data, scheduling thepro-\ngram' sexecution across asetofmachines, handling ma-\nchine failures, andmanaging therequired inter-machine\ncommunication. This allowsprogrammers without any\nex

In [12]:
# Chroma.from_documents() adds every chunk to the persisted collection each time
# it runs, so re-executing this cell against an existing "text_index" directory
# would store the same chunks again and the retriever could return duplicated
# passages. Build the index only when it does not exist yet; otherwise reopen
# what is already on disk so the cell is safe under Restart & Run All.
# NOTE: delete the "text_index" directory to force a rebuild after changing the
# PDF, the splitter settings, or the embedding model.
if Path("text_index").is_dir():
    db = Chroma(persist_directory="text_index", embedding_function=embeddings)
else:
    db = Chroma.from_documents(chunks, embedding=embeddings, persist_directory="text_index")

In [13]:
# Reopen the vector store persisted in "text_index", using the same embedding function.
db = Chroma(persist_directory="text_index", embedding_function=embeddings)

# Retriever built with k=3 in search_kwargs (presumably the number of documents
# returned per query -- confirm against the Chroma retriever docs).
# The spelling `retriver` is kept because the query cell refers to this name.
retriver = db.as_retriever(search_kwargs={"k":3})

  db = Chroma(persist_directory="text_index", embedding_function=embeddings)


In [14]:
# Question-answering chain of type "stuff" built on the LLM.
# NOTE(review): the recorded output of this cell shows a notice linking to the
# LangChain chain-migration guides; migrating would also change the input and
# output keys used where the chain is invoked, so do both together.
chain = load_qa_chain(llm, chain_type="stuff")

stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(llm, chain_type="stuff")


In [15]:
# Query passed to the retriever and to the chain
# (Portuguese for "Summarize what MapReduce is").
question = "Resuma o que é MapReduce"

In [16]:
# Fetch the documents the retriever returns for the question.
context = retriver.invoke(question)
# Run the QA chain with the retrieved documents and the question.
answer = chain.invoke({"input_documents": context, "question": question})
# Display only the generated text from the chain's result.
answer["output_text"]

'A MapReduce é uma biblioteca de processamento paralelo de grande escala desenvolvida pela Google. Ela permite dividir grandes conjuntos de dados em partes menores, processar cada parte em paraleto em um grande número de máquinas (computadores) e então combinar os resultados para obter a saída final.\n\nA MapReduce é composta por dois principais componentes:\n\n1. **Mapper**: Divide o dados em partes menores e processo cada parte em paraleto em uma ou mais máquinas.\n2. **Reducer**: Combina os resultados dos Mappers em paraleto e obtém a saída final.\n\nA biblioteca MapReduce é muito flexível e pode ser usada para realizar uma variedade de tarefas, como:\n\n* Procesamento de grandes conjuntos de dados\n* Análise de dados em larga escala\n* Pre processamento de dados\n* Agregação de dados\n\nA MapReduce é amplamente utilizada em muitas aplicações, incluindo sistemas de armazenamento de dados, sistemas de recomendção e sistemas de busca.\n\nEm resumo, a MapReduce é uma ferramenta poderos