In [74]:
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_chroma import Chroma
import os
import getpass

In [75]:
# Location of the PDF to index. Set the RAG_PDF_PATH environment variable to
# point at a different file; the fallback is the path originally hardcoded here.
PDF_PATH = os.environ.get("RAG_PDF_PATH", "C:/Users/Emumba/Downloads/RAG PDF/dummyPDF.pdf")

loader = PDFPlumberLoader(PDF_PATH)
data = loader.load()

In [35]:
# How many entries the loader returned.
len(data)

2

In [36]:
# Text of the first loaded entry, to check what was extracted from the PDF.
data[0].page_content

'NATIONAL PARTNERSHIP FOR QUALITY AFTERSCHOOL LEARNING\nwww.sedl.org/afterschool/toolkits\n����������� �������� �������\nTutoring to Enhance Science Skills\nTutoring Two: Learning to Make Data Tables\n..............................................................................................\nSample Data for Data Tables\nUse these data to create data tables following the Guidelines for Making a Data Table and\nChecklist for a Data Table.\nExample 1: Pet Survey (GR 2–3)\nMs. Hubert’s afterschool students took a survey of the 600 students at Morales Elementary\nSchool. Students were asked to select their favorite pet from a list of eight animals. Here\nare the results.\nLizard 25, Dog 250, Cat 115, Bird 50, Guinea pig 30, Hamster 45, Fish 75,\nFerret 10\nExample 2: Electromagnets—Increasing Coils (GR 3–5)\nThe following data were collected using an electromagnet with a 1.5 volt battery, a switch,\na piece of #20 insulated wire, and a nail. Three trials were run. Safety precautions in\

In [76]:
# Split the loaded documents into small chunks that overlap slightly.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=218,
)
splits = text_splitter.split_documents(data)

In [77]:
# Number of chunks the splitter produced.
len(splits)

26

In [39]:
# Inspect the first chunk (its metadata and page_content).
splits[0]

Document(metadata={'source': 'C:/Users/Emumba/Downloads/RAG PDF/dummyPDF.pdf', 'file_path': 'C:/Users/Emumba/Downloads/RAG PDF/dummyPDF.pdf', 'page': 0, 'total_pages': 2, 'ModDate': "D:20060706163857-04'00'", 'CreationDate': 'D:20051108152002Z', 'Producer': 'Adobe PDF Library 5.0', 'Title': 'Sample Data for Data Tables', 'Creator': 'Adobe InDesign 2.0.2', 'Trapped': 'False'}, page_content='NATIONAL PARTNERSHIP FOR QUALITY AFTERSCHOOL LEARNING')

In [78]:
# Never store API tokens in the notebook: read the Hugging Face token from the
# HF_TOKEN environment variable and fall back to a hidden prompt when unset.
# NOTE(review): the token that was previously hardcoded here should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face API token: ")

In [79]:
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_TOKEN,
    model_name="BAAI/bge-base-en-v1.5",
)

# Chroma's default in-memory collection lives as long as the kernel does, so
# re-running this cell (e.g. after changing the chunk size) would append the
# new chunks next to the ones from earlier runs. Drop the default collection
# first so the store only ever holds the current `splits`.
Chroma(embedding_function=embeddings).delete_collection()
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

In [80]:
# Retrieval: similarity search returning six documents per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 6},
    search_type="similarity",
)

sample_query = "What are the tricks for writing essays?"
retrieved_docs = retriever.invoke(sample_query)

len(retrieved_docs)

6

In [43]:
# Inspect the first of the retrieved documents.
retrieved_docs[0]

Document(metadata={'CreationDate': 'D:20051108152002Z', 'Creator': 'Adobe InDesign 2.0.2', 'ModDate': "D:20060706163857-04'00'", 'Producer': 'Adobe PDF Library 5.0', 'Title': 'Sample Data for Data Tables', 'Trapped': 'False', 'file_path': 'C:/Users/Emumba/Downloads/RAG PDF/dummyPDF.pdf', 'page': 0, 'source': 'C:/Users/Emumba/Downloads/RAG PDF/dummyPDF.pdf', 'total_pages': 2}, page_content='Tutoring to Enhance Science Skills')

In [67]:
from langchain_groq import ChatGroq

# Ask for the Groq key only when it is not already in the environment, so
# re-running this cell does not block on another prompt. getpass hides the
# typed value.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Groq API key: ")

llm = ChatGroq(model="llama3-8b-8192")

 ········


In [81]:
from langchain import hub

# Fetch the "rlm/rag-prompt" prompt through the hub.
prompt = hub.pull("rlm/rag-prompt")

# Render it with placeholder values to see the messages it produces.
filler_inputs = {"context": "filler context", "question": "filler question"}
example_messages = prompt.invoke(filler_inputs).to_messages()

example_messages



[HumanMessage(content="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: filler question \nContext: filler context \nAnswer:", additional_kwargs={}, response_metadata={})]

In [26]:
# Print the first example message's text so its line breaks are rendered.
print(example_messages[0].content)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: filler question 
Context: filler context 
Answer:


In [82]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    contents = [entry.page_content for entry in docs]
    return "\n\n".join(contents)


# Assemble the RAG pipeline: the input question is sent to the retriever (whose
# results are flattened by format_docs) and is also passed through unchanged;
# both fill the prompt, which goes to the LLM, whose reply is parsed to a string.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# Stream the answer and echo each piece as soon as it arrives.
question = "What is the main topic of this PDF"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

I don't have enough information to determine the main topic of the PDF. The provided context appears to be a mix of unrelated text, including a URL, copyright information, and a reference to "Guidelines for Making a Data Table".

# Chunk size set to 80

In [48]:
# Stream the answer and echo each piece as soon as it arrives.
question = "What is pet survey example"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

A pet survey example is a checklist for a data table where students are asked to select their favorite pet from a list of eight options. This is an example of a data table used in a grade 2-3 school setting.

In [49]:
# Stream the answer and echo each piece as soon as it arrives.
question = "What are those 8 animals?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

The list of eight animals is: Lizard, Dog, Cat, Bird, Guinea pig, Hamster, Fish.

In [50]:
# Stream the answer and echo each piece as soon as it arrives.
# Only this string is handed to rag_chain.stream; earlier questions and
# answers are not part of the call.
question = "These are 7 animals not 8"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

These animals are Lizard, Dog, Cat, Bird, Guinea pig, Pig, Hamster, and Fish.

In [51]:
# Stream the answer and echo each piece as soon as it arrives.
question = "What are number of coils?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Based on the context, the number of coils is 3, 4, or 5.

In [52]:
# Stream the answer and echo each piece as soon as it arrives.
question = "What are the numbers of paperclips against number of coils 15?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

According to the context, the numbers of paperclips against number of coils 15 do not exist in the provided data.

In [53]:
# Stream the answer and echo each piece as soon as it arrives.
question = "What are the numbers of paperclips on coils 15?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

I don't have the information about the numbers of paperclips on coils 15.

In [55]:
# Stream the answer and echo each piece as soon as it arrives.
question = "What are the number of paperclips?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Based on the provided context, the number of paperclips is 3 for 5 coils, 7 for 10 coils, and 11 for 1 coil, among others.

In [56]:
# Stream the answer and echo each piece as soon as it arrives.
question = "What is the relation between number of paperclips and number of coils?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Based on the context, it appears that the relation between the number of paperclips and the number of coils is not explicitly stated. However, it can be observed that the number of paperclips increases with the number of coils.

In [58]:
# Stream the answer and echo each piece as soon as it arrives.
question = "if number of coils 15, what is the value of number of paperclips?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

I don't have enough information to determine the value of the number of paperclips for 15 coils. The given context only provides data for 5 coils and 10 coils, but not for 15 coils.

In [59]:
# Stream the answer and echo each piece as soon as it arrives.
question = "if number of coils 5, what is the value of number of paperclips?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

According to the given context, if the number of coils is 5, the corresponding values for number of paperclips are 3, 5, or 4.

In [60]:
# Stream the answer and echo each piece as soon as it arrives.
question = "if number of coils 10, what is the value of number of paperclips?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Based on the provided context, if the number of coils is 10, the value of the number of paperclips is 7, 8, or 6.

# After changing chunk size to 100

In [70]:
# Stream the answer and echo each piece as soon as it arrives.
question = "What is pet survey example"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

A pet survey example is a data collection method where students are asked to select their favorite pet from a list of eight animals.

In [71]:
# Stream the answer and echo each piece as soon as it arrives.
question = "What are those 8 animals?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

The list of eight animals is: Lizard, Dog, Cat, Bird, Guinea pig, and possibly three more. The additional animals are not specified in the given context.

In [72]:
# Stream the answer and echo each piece as soon as it arrives.
question = "if number of coils 15, what is the value of number of paperclips?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Based on the context, if the number of coils is 15, the value of number of paperclips is 11, 10, or 12.

In [73]:
# Stream the answer and echo each piece as soon as it arrives.
question = "How many number of coils are there?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

There are 3 sets of coils with varying numbers of paperclips: 5 coils with 3, 5, and 4 paperclips; 10 coils with 7, 8, and 6 paperclips; and 15 coils with 11, 10, and 12 paperclips.

# After changing chunk size to 218

In [83]:
# Stream the answer and echo each piece as soon as it arrives.
question = "What is pet survey example"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

The example of a pet survey is a data table showing the results of a survey taken by Ms. Hubert's afterschool students, where students were asked to select their favorite pet from a list of eight animals. The table displays the number of students who selected each pet, with a total of 600 students surveyed.

In [84]:
# Stream the answer and echo each piece as soon as it arrives.
question = "What are those 8 animals?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

The 8 animals are: Lizard, Dog, Cat, Bird, Guinea pig, Hamster, Fish, and Ferret.

In [85]:
# Stream the answer and echo each piece as soon as it arrives.
question = "if number of coils 15, what is the value of number of paperclips?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

According to the given context, if the number of coils is 15, the value of number of paperclips is 11, 10, or 12.

In [86]:
# Stream the answer and echo each piece as soon as it arrives.
question = "How many number of coils are there?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

According to the given context, there are 5, 10, 15, and 20 number of coils.

In [87]:
# Stream the answer and echo each piece as soon as it arrives.
question = "if number of coils 20, what is the value of number of paperclips?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Based on the provided context, if the number of coils is 20, the value of the number of paperclips is 15, 13, or 14.

In [88]:
# Stream the answer and echo each piece as soon as it arrives.
question = "what car speed 407.447?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

The car speed is 407.447 miles per hour.

In [89]:
# Stream the answer and echo each piece as soon as it arrives.
question = "what was the name of driver of car speed 407.447?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

The driver of the car that achieved a speed of 407.447 mph was Craig Breedlove.

In [90]:
# Stream the answer and echo each piece as soon as it arrives.
question = "what was the name of Engine of car speed 407.447?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

The Engine that achieved a car speed of 407.447 mph was the GE J47.

In [91]:
# Stream the answer and echo each piece as soon as it arrives.
question = "what was the name of Engine of car speed 763.035?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

I don't know the answer. The provided context only lists speeds achieved by various drivers in different cars, but it doesn't mention a car with an engine speed of 763.035.

In [92]:
# Stream the answer and echo each piece as soon as it arrives.
question = "list down all speeds?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

I don't have a comprehensive list of all speeds as the provided context only contains a few examples of land speed records. However, based on the given context, here are the speeds mentioned:

* 39.2 miles per hour

In [93]:
# Stream the answer and echo each piece as soon as it arrives.
question = "list down all speeds(mph)?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Based on the retrieved context, the speeds listed in miles per hour (mph) are:

39.2

In [94]:
# Stream the answer and echo each piece as soon as it arrives.
question = "list down all Automobile Land Speed Records?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Based on the provided context, the list of Automobile Land Speed Records is not explicitly stated. However, the source mentions that L'Automobile (FIA) recorded the following land speed records, but the actual records are not listed.

In [95]:
# Stream the answer and echo each piece as soon as it arrives.
question = "How many cars are using GE J79 Engine?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Based on the provided context, I can see that the "Sonic 1" is using the GE J79 engine, and there are two entries for Sonic 1: one on November 2, 1965, and another on November 15, 1965. Therefore, at least two cars are using the GE J79 engine.

In [98]:
 chunk in rag_chain.stream("count all cars that are using GE J79 Engine?"):
    print(chunk, end="", flush=True)

SyntaxError: invalid syntax (1813650552.py, line 1)

In [97]:
# Stream the answer and echo each piece as soon as it arrives.
question = "How many cars are using GE J79 Engine?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Based on the context, the cars using the GE J79 engine are the Spirit of America and the Green Monster.

In [99]:
# Stream the answer and echo each piece as soon as it arrives.
question = "Count how many cars are using GE J79 Engine?"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

According to the context, three cars are using the GE J79 Engine: Sonic 1 (2 times) and the Green Monster.

In [100]:
# Stream the answer and echo each piece as soon as it arrives.
question = "list down all speeds mentioned by lsrinfo.asp"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

According to the context, the speeds mentioned are:

* 394.196 mph (Floyd Crosby, Blue Flame, W-11, August 19, 1948)
* 650.092 mph (Richard Noble, Thrust2, Rolls-Royce RB.161, October 4, 1983)
* 763.03 mph (Andy Green, Thrust SSC, Rolls-Royce Spey, October 15, 1997)

Note that these speeds are listed in the table provided in the context.

In [101]:
# Stream the answer and echo each piece as soon as it arrives.
question = "list down all speeds drivers cars Engine and dates mentioned by lsrinfo.asp"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Here are the speeds, drivers, cars, engines, and dates mentioned on the LSRinfo.asp webpage:

• 407.447 mph - Craig Breedlove - Spirit of America - GE J47 - 8/5/63
• 413.199 mph - Tom Green - Wingfoot Express - WE J46 - 10/2/64
• 434.22 mph - Art Arfons - Green Monster - GE J79 - 10/5/64

In [102]:
# Stream the answer and echo each piece as soon as it arrives.
# Only this string is handed to rag_chain.stream; earlier questions and
# answers are not part of the call.
question = "more"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

I don't know.

In [103]:
# Stream the answer and echo each piece as soon as it arrives.
question = "list down all speeds drivers cars Engine and dates mentioned by lsrinfo.asp"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

Here is the list of speeds, drivers, cars, engines, and dates mentioned:

* 407.447 mph - Craig Breedlove - Spirit of America - GE J47 - 8/5/63
* 413.199 mph - Tom Green - Wingfoot Express - WE J46 - 10/2/64
* 434.22 mph - Art Arfons - Green Monster - GE J79 - 10/5/64

In [None]:
# Stream the answer and echo each piece as soon as it arrives.
question = "list down all speeds drivers cars Engine and dates mentioned by lsrinfo.asp till 526.277"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)