## Ingesting PDF

In [1]:
# !pip install langchain

In [2]:
from langchain.document_loaders import PyPDFLoader

pdf_path = "phy_book_ch2.pdf"
if pdf_path:
    loader = PyPDFLoader(file_path=pdf_path)
    data = loader.load()
else:
    print("PDF not found")

## Data Inspection

In [3]:
# Preview first page
print(data[16].page_content)
# data

42  Physics 
 Distance – Time table  
Time, t (min)  Distance s km  
0 0 
12 6 
24 12 
36 18 
48 24 
60 30 
                     Table 2.3      Figure 2.5 
For above mentioned motion in the table a distance-time graph shown in fig 2.5. 
Suppose from the graph we have to find the traveled distance in 32 minute by the auto-rickshaw; we have to mark a point (M) to indicate time, 32 minute on X-axis. Then we have to draw a line parallel to Y-axis from that point on the graph. Let the line at point P. Now draw a perpendicular on Y-axis from P. This perpendicular meets at point N on Y-axis. Therefore, ON is the distance traveled in 32 minutes. The graph shows that the auto-rickshaw travels 16km in this time. Therefore, from graph we find any traveled 
distance S = PM for any time t = OM. 
∴ Velocity = distance
time
    = PM
OM    = ON
OM , Here,  PM
OM is the slope of the OP. 
  
           
 
(B) In case of Non-uniform velocity : Fig 2.6 represents a distance-time graph of a 
body moving wi

In [4]:
# Elements in the pdf
from pdfminer.high_level import extract_pages

element_types = set()

for page_layout in extract_pages(pdf_path):
    for element in page_layout:
        element_types.add(type(element).__name__)

print(list(element_types))

['LTFigure', 'LTLine', 'LTRect', 'LTTextBoxHorizontal', 'LTCurve', 'LTTextLineHorizontal']


## Remove Extra Whitespace

In [5]:
import re
from langchain.schema import Document

# Remove extra whitespace
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    return text

cleaned_data = []
for doc in data:
    cleaned_data.append(Document(page_content=clean_text(doc.page_content), metadata=doc.metadata))


In [6]:
print(cleaned_data[16].page_content)

42 Physics Distance – Time table Time, t (min) Distance s km 0 0 12 6 24 12 36 18 48 24 60 30 Table 2.3 Figure 2.5 For above mentioned motion in the table a distance-time graph shown in fig 2.5. Suppose from the graph we have to find the traveled distance in 32 minute by the auto-rickshaw; we have to mark a point (M) to indicate time, 32 minute on X-axis. Then we have to draw a line parallel to Y-axis from that point on the graph. Let the line at point P. Now draw a perpendicular on Y-axis from P. This perpendicular meets at point N on Y-axis. Therefore, ON is the distance traveled in 32 minutes. The graph shows that the auto-rickshaw travels 16km in this time. Therefore, from graph we find any traveled distance S = PM for any time t = OM. ∴ Velocity = distance time = PM OM = ON OM , Here, PM OM is the slope of the OP. (B) In case of Non-uniform velocity : Fig 2.6 represents a distance-time graph of a body moving with non-uniform velocity. In this case the body does not move over equal

## Handle Math Expressions

In [7]:
import re
from langchain.schema import Document

# Patterns to detect mathematical expressions
patterns = [
    r'∴.*',        # Lines starting with '∴' (therefore)
    r'=[^=]*',     # Equal signs
    r'\b[^\s]+/[^\s]+\b',  # Fractions like 'distance/time'
]

def extract_math_expressions(text):
    math_expressions = []
    for pattern in patterns:
        matches = re.findall(pattern, text)
        math_expressions.extend(matches)
    return math_expressions

In [8]:
# test on a sample
sample_text = """
The equation of motion is s = 5 * t^2 + 3 * t + 2, and velocity is given by v = ds / dt.
∴ Velocity = distance / time = PM / OM = ON / OM.
"""

expr = extract_math_expressions(sample_text)
for i in expr: print(i)

∴ Velocity = distance / time = PM / OM = ON / OM.
= 5 * t^2 + 3 * t + 2, and velocity is given by v 
= ds / dt.
∴ Velocity 
= distance / time 
= PM / OM 
= ON / OM.



In [9]:
# Extract matching patterns from pdf
for doc in cleaned_data:
    math_exprs = extract_math_expressions(doc.page_content)
    
    if isinstance(math_exprs, list):
        math_exprs = "; ".join(math_exprs)  # Join the list elements into a single string
    
    # if not isinstance(math_exprs, (str,int,float,bool)): math_exprs = str(math_exprs)
    
    doc.metadata['math_expressions'] = math_exprs

In [10]:
cleaned_data[16].metadata['math_expressions']

'∴ Velocity = distance time = PM OM = ON OM , Here, PM OM is the slope of the OP. (B) In case of Non-uniform velocity : Fig 2.6 represents a distance-time graph of a body moving with non-uniform velocity. In this case the body does not move over equal distance in equal intervals of time, so the graph will not be a straight line. It will be a curved line. Since, the body is not moving with uniform velocity, its velocity will not be the same at all instants during its motion. Suppose, the velocity of the body at a particular instant indicated by the point P in the curved line, is to be determined. To determine the velocity at the point P, we will have to consider a very small right angled triangle ABC. Its hypotenuse AB is so small Do by yourself : Take a graph paper. Draw the distance-time graph on the graph paper using any convenient unit mentioned in the table above. Find the distance traveled and velocity of the auto-rickshaw in 32 minutes from the graph. What will be the distance tr

In [11]:
type(cleaned_data[16].metadata['math_expressions'])

str

In [12]:
# for i in cleaned_data[16].metadata['math_expressions']: print(i)

## Process Tables

In [13]:
# extract tables
import pdfplumber
tables = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        table = page.extract_table()
        if table:
            tables.append(table)

In [14]:
tables[0]

[['Scalar Quantity', None, None, 'Vector quantity', None, None],
 ['Distance', 'd', '40m', 'Displacement', 's', '40m east direction'],
 ['Speed', 'v', '30ms-1', 'Velocity', 'v', '30 ms-1 north direction'],
 ['Time', 't', '15s', 'Force', 'F', '100N upward direction'],
 ['Energy', 'E', '2000j', 'Acceleration', 'a', '98ms-2 downward direction']]

In [15]:
len(tables)

6

In [16]:
# Convert tables to text
import pandas as pd

table_dfs = [pd.DataFrame(table[1:], columns=table[0]) for table in tables]
table_texts = [df.to_string(index=False) for df in table_dfs]

In [17]:
table_dfs[0]

Unnamed: 0,Scalar Quantity,None,None.1,Vector quantity,None.2,None.3
0,Distance,d,40m,Displacement,s,40m east direction
1,Speed,v,30ms-1,Velocity,v,30 ms-1 north direction
2,Time,t,15s,Force,F,100N upward direction
3,Energy,E,2000j,Acceleration,a,98ms-2 downward direction


In [18]:
table_texts[0]

'Scalar Quantity None   None Vector quantity None                      None\n       Distance    d    40m    Displacement    s        40m east direction\n          Speed    v 30ms-1        Velocity    v   30 ms-1 north direction\n           Time    t    15s           Force    F     100N upward direction\n         Energy    E  2000j    Acceleration    a 98ms-2 downward direction'

In [19]:
# Convert table texts to Document objects with metadata
table_data = [Document(page_content=clean_text(text), metadata={'source': 'table'}) for text in table_texts]

# Combine cleaned data and table documents
combined_data = cleaned_data + table_data

In [20]:
# new size of data
print("size of cleaned_data:\t",len(cleaned_data))
print("size of table_data:\t",len(table_data))
print("size of combined_data:\t",len(combined_data))


size of cleaned_data:	 22
size of table_data:	 6
size of combined_data:	 28


In [21]:
combined_data[0]

Document(metadata={'source': 'phy_book_ch2.pdf', 'page': 0, 'math_expressions': ''}, page_content='26 Physics Chapter Two MOTION [The object, that we see around us either are stationery or in motion. What do we actually understand by the words ``rest’’ and ``motion’’. We need different quantities regarding motion to express the characteristics of motion of a moving object. In this chapter we will discuss different quantities regarding motion, their dimensions, units, the relations among them etc.] By the end of this chapter we will be able to - 1. Explain the rest and motion 2. Find out the difference among different types of motion. 3. Explain the scalar and vector quantities 4. Analyze the relation among the quantities regarding motion 5. Explain the motion of freely falling bodies 6. Analyze the relations among the quantities regarding motion with the help of graph 7. Realize the effect of motion in our life ')

In [22]:
table_data[-5:]

[Document(metadata={'source': 'table'}, page_content='Serial No Time (s) Velocity (kmh-1) Velocity (ms-1) 1. 0 0 0 2. 8 14.4 4 3. 16 28.8 8 4. 24 43.2 12 5. 32 57.6 16 6. 40 72.0 20'),
 Document(metadata={'source': 'table'}, page_content='Time, t (min) Distance s km 0 0 12 6 24\\n36 12\\n18 48 24 60 30'),
 Document(metadata={'source': 'table'}, page_content=' Do by yourself : The velocity of a car after every five seconds is given in the table below. None None None None None None None None None None None Table : 2.4 None None None None None None None None None None None None Time (s) Velocity (kmh-1) Velocity (ms-1) None None 0 0 0 None None 5 9 2.5 None None 10 18 5.0 None None 15 27 7.5 None None 20 36 10.0 None None 25 45 12.5 None None 30 54 15.0 None None Take a graph paper. Draw the velocity-time graph on the graph paper using any convenient None None None None None None None None None None None None unit mentioned in the table above. Find the velocity and acceleration of the car

In [23]:
combined_data[-5:]

[Document(metadata={'source': 'table'}, page_content='Serial No Time (s) Velocity (kmh-1) Velocity (ms-1) 1. 0 0 0 2. 8 14.4 4 3. 16 28.8 8 4. 24 43.2 12 5. 32 57.6 16 6. 40 72.0 20'),
 Document(metadata={'source': 'table'}, page_content='Time, t (min) Distance s km 0 0 12 6 24\\n36 12\\n18 48 24 60 30'),
 Document(metadata={'source': 'table'}, page_content=' Do by yourself : The velocity of a car after every five seconds is given in the table below. None None None None None None None None None None None Table : 2.4 None None None None None None None None None None None None Time (s) Velocity (kmh-1) Velocity (ms-1) None None 0 0 0 None None 5 9 2.5 None None 10 18 5.0 None None 15 27 7.5 None None 20 36 10.0 None None 25 45 12.5 None None 30 54 15.0 None None Take a graph paper. Draw the velocity-time graph on the graph paper using any convenient None None None None None None None None None None None None unit mentioned in the table above. Find the velocity and acceleration of the car

## Split and Chunking

In [24]:
# !pip install langchain_text_splitters

In [24]:
# Split and chunk
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000, 
    chunk_overlap = 100,
    length_function = len,
)
chunked_document = text_splitter.split_documents(combined_data)

In [25]:
chunked_document[0]

Document(metadata={'source': 'phy_book_ch2.pdf', 'page': 0, 'math_expressions': ''}, page_content='26 Physics Chapter Two MOTION [The object, that we see around us either are stationery or in motion. What do we actually understand by the words ``rest’’ and ``motion’’. We need different quantities regarding motion to express the characteristics of motion of a moving object. In this chapter we will discuss different quantities regarding motion, their dimensions, units, the relations among them etc.] By the end of this chapter we will be able to - 1. Explain the rest and motion 2. Find out the difference among different types of motion. 3. Explain the scalar and vector quantities 4. Analyze the relation among the quantities regarding motion 5. Explain the motion of freely falling bodies 6. Analyze the relations among the quantities regarding motion with the help of graph 7. Realize the effect of motion in our life')

In [26]:
print(chunked_document[0].page_content)

26 Physics Chapter Two MOTION [The object, that we see around us either are stationery or in motion. What do we actually understand by the words ``rest’’ and ``motion’’. We need different quantities regarding motion to express the characteristics of motion of a moving object. In this chapter we will discuss different quantities regarding motion, their dimensions, units, the relations among them etc.] By the end of this chapter we will be able to - 1. Explain the rest and motion 2. Find out the difference among different types of motion. 3. Explain the scalar and vector quantities 4. Analyze the relation among the quantities regarding motion 5. Explain the motion of freely falling bodies 6. Analyze the relations among the quantities regarding motion with the help of graph 7. Realize the effect of motion in our life


## Vector Embedding

In [28]:
# %pip install langchain_community
# %pip install -qU langchain-huggingface

In [27]:
# # Embed the chunked document using OllamaEmbeddings (nomic-embed-text)

# from langchain_community.embeddings import OllamaEmbeddings
# embedding = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

# # embedded_texts = embedding.embed_documents(chunked_document[x].page_content for x in chunked_document[len(chunked_document)-1])


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  return _bootstrap._gcd_import(name[level:], package, level)


In [31]:
# # Embed the chunked document using SentenceTransformer (all-MiniLM-L6-v2)

# # from sentence_transformers import SentenceTransformer
# # model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

from langchain_huggingface import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange





In [39]:
# # Example usage
# sentence = "This is a test sentence."
# embedding = model.encode(sentence)

# print("Sentence embedding:", embedding)

## Store in ChromaDB

### Filter Metadata before storing in ChromaDB

In [28]:
def filter_complex_metadata(metadata):
    """
    Filter out complex metadata values. Only keep str, int, float, or bool values.
    """
    filtered_metadata = {}
    for key, value in metadata.items():
        if isinstance(value, (str, int, float, bool)):
            filtered_metadata[key] = value
        else:
            filtered_metadata[key] = str(value)  # Convert complex types to string
    return filtered_metadata

# Apply metadata filtering
filtered_metadatas = [filter_complex_metadata(doc.metadata) for doc in chunked_document]


In [32]:
# Create the Chroma vector store
from langchain_chroma.vectorstores import Chroma

vector_db = Chroma.from_documents(
    documents=chunked_document,
    embedding=embedding,
    collection_name="local-rag",
    persist_directory="./db/db_minilm"
)

print("Embedded Documents stored in ChromaDB successfully!")

Embedded Documents stored in ChromaDB successfully!
