## Ingesting PDF

In [263]:
# !pip install langchain

In [264]:
from langchain.document_loaders import PyPDFLoader

pdf_path = "phy_book_ch2.pdf"
if pdf_path:
    loader = PyPDFLoader(file_path=pdf_path)
    data = loader.load()
else:
    print("PDF not found")

In [265]:
# Preview first page
print(data[16].page_content)
# data

42  Physics 
 Distance – Time table  
Time, t (min)  Distance s km  
0 0 
12 6 
24 12 
36 18 
48 24 
60 30 
                     Table 2.3      Figure 2.5 
For above mentioned motion in the table a distance-time graph shown in fig 2.5. 
Suppose from the graph we have to find the traveled distance in 32 minute by the auto-rickshaw; we have to mark a point (M) to indicate time, 32 minute on X-axis. Then we have to draw a line parallel to Y-axis from that point on the graph. Let the line at point P. Now draw a perpendicular on Y-axis from P. This perpendicular meets at point N on Y-axis. Therefore, ON is the distance traveled in 32 minutes. The graph shows that the auto-rickshaw travels 16km in this time. Therefore, from graph we find any traveled 
distance S = PM for any time t = OM. 
∴ Velocity = distance
time
    = PM
OM    = ON
OM , Here,  PM
OM is the slope of the OP. 
  
           
 
(B) In case of Non-uniform velocity : Fig 2.6 represents a distance-time graph of a 
body moving wi

## Clean Text Data

In [266]:
import re
from langchain.schema import Document

# Remove extra whitespace
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    return text

cleaned_data = []
for doc in data:
    cleaned_data.append(Document(page_content=clean_text(doc.page_content), metadata=doc.metadata))


In [267]:
print(cleaned_data[16].page_content)

42 Physics Distance – Time table Time, t (min) Distance s km 0 0 12 6 24 12 36 18 48 24 60 30 Table 2.3 Figure 2.5 For above mentioned motion in the table a distance-time graph shown in fig 2.5. Suppose from the graph we have to find the traveled distance in 32 minute by the auto-rickshaw; we have to mark a point (M) to indicate time, 32 minute on X-axis. Then we have to draw a line parallel to Y-axis from that point on the graph. Let the line at point P. Now draw a perpendicular on Y-axis from P. This perpendicular meets at point N on Y-axis. Therefore, ON is the distance traveled in 32 minutes. The graph shows that the auto-rickshaw travels 16km in this time. Therefore, from graph we find any traveled distance S = PM for any time t = OM. ∴ Velocity = distance time = PM OM = ON OM , Here, PM OM is the slope of the OP. (B) In case of Non-uniform velocity : Fig 2.6 represents a distance-time graph of a body moving with non-uniform velocity. In this case the body does not move over equal

## Handle Math Expressions

In [268]:
import re
from langchain.schema import Document

# Patterns to detect mathematical expressions
patterns = [
    r'∴.*',        # Lines starting with '∴' (therefore)
    r'=[^=]*',     # Equal signs
    r'\b[^\s]+/[^\s]+\b',  # Fractions like 'distance/time'
]

def extract_math_expressions(text):
    math_expressions = []
    for pattern in patterns:
        matches = re.findall(pattern, text)
        math_expressions.extend(matches)
    return math_expressions


for doc in cleaned_data:
    math_exprs = extract_math_expressions(doc.page_content)
    if isinstance(math_exprs, list):
        math_exprs = "; ".join(math_exprs)  # Join the list elements into a single string
    
    # if not isinstance(math_exprs, (str,int,float,bool)): math_exprs = str(math_exprs)
    
    doc.metadata['math_expressions'] = math_exprs

In [269]:
cleaned_data[16].metadata['math_expressions']

'∴ Velocity = distance time = PM OM = ON OM , Here, PM OM is the slope of the OP. (B) In case of Non-uniform velocity : Fig 2.6 represents a distance-time graph of a body moving with non-uniform velocity. In this case the body does not move over equal distance in equal intervals of time, so the graph will not be a straight line. It will be a curved line. Since, the body is not moving with uniform velocity, its velocity will not be the same at all instants during its motion. Suppose, the velocity of the body at a particular instant indicated by the point P in the curved line, is to be determined. To determine the velocity at the point P, we will have to consider a very small right angled triangle ABC. Its hypotenuse AB is so small Do by yourself : Take a graph paper. Draw the distance-time graph on the graph paper using any convenient unit mentioned in the table above. Find the distance traveled and velocity of the auto-rickshaw in 32 minutes from the graph. What will be the distance tr

In [270]:
type(cleaned_data[16].metadata['math_expressions'])

str

## Process Tables

In [271]:
# extract tables
import pdfplumber
tables = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        table = page.extract_table()
        if table:
            tables.append(table)

In [272]:
# tables[0]

In [273]:
# len(tables)

In [274]:
# Convert tables to text
import pandas as pd

table_dfs = [pd.DataFrame(table[1:], columns=table[0]) for table in tables]
table_texts = [df.to_string(index=False) for df in table_dfs]

In [275]:
# table_dfs[0]

In [276]:
# table_texts[0]

In [277]:
# Convert table texts to Document objects with metadata
table_documents = [Document(page_content=text, metadata={'source': 'table'}) for text in table_texts]

# Combine cleaned data and table documents
combined_data = cleaned_data + table_documents

In [278]:
# combined_data[-5:]

In [279]:
# table_documents[-5:]

## Split and Chunking

In [280]:
# !pip install langchain_text_splitters

In [281]:
# Split and chunk
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
chunks = text_splitter.split_documents(combined_data)

In [282]:
chunks[0]

Document(metadata={'source': 'phy_book_ch2.pdf', 'page': 0, 'math_expressions': ''}, page_content='26 Physics Chapter Two MOTION [The object, that we see around us either are stationery or in motion. What do we actually understand by the words ``rest’’ and ``motion’’. We need different quantities regarding motion to express the characteristics of motion of a moving object. In this chapter we will discuss different quantities regarding motion, their dimensions, units, the relations among them etc.] By the end of this chapter we will be able to - 1. Explain the rest and motion 2. Find out the difference among different types of motion. 3. Explain the scalar and vector quantities 4. Analyze the relation among the quantities regarding motion 5. Explain the motion of freely falling bodies 6. Analyze the relations among the quantities regarding motion with the help of graph 7. Realize the effect of motion in our life')

In [283]:
# Extract the texts and metadata from the chunks
page_contents = [chunk.page_content for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]

In [284]:
page_contents[0]

'26 Physics Chapter Two MOTION [The object, that we see around us either are stationery or in motion. What do we actually understand by the words ``rest’’ and ``motion’’. We need different quantities regarding motion to express the characteristics of motion of a moving object. In this chapter we will discuss different quantities regarding motion, their dimensions, units, the relations among them etc.] By the end of this chapter we will be able to - 1. Explain the rest and motion 2. Find out the difference among different types of motion. 3. Explain the scalar and vector quantities 4. Analyze the relation among the quantities regarding motion 5. Explain the motion of freely falling bodies 6. Analyze the relations among the quantities regarding motion with the help of graph 7. Realize the effect of motion in our life'

## Vector Embedding

In [285]:
# !pip install langchain_community

In [286]:
# Embed the document texts using Ollama
from langchain_community.embeddings import OllamaEmbeddings

embedding = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)
embedded_texts = embedding.embed_documents(page_contents)

OllamaEmbeddings: 100%|██████████| 61/61 [02:13<00:00,  2.19s/it]


## Store in ChromaDB

In [287]:
# Create Document objects from texts and metadata
from langchain.schema import Document

chunked_documents  = [Document(page_content=content, metadata=meta) for content, meta in zip(page_contents, metadatas)]

In [288]:
chunked_documents[0]

Document(metadata={'source': 'phy_book_ch2.pdf', 'page': 0, 'math_expressions': ''}, page_content='26 Physics Chapter Two MOTION [The object, that we see around us either are stationery or in motion. What do we actually understand by the words ``rest’’ and ``motion’’. We need different quantities regarding motion to express the characteristics of motion of a moving object. In this chapter we will discuss different quantities regarding motion, their dimensions, units, the relations among them etc.] By the end of this chapter we will be able to - 1. Explain the rest and motion 2. Find out the difference among different types of motion. 3. Explain the scalar and vector quantities 4. Analyze the relation among the quantities regarding motion 5. Explain the motion of freely falling bodies 6. Analyze the relations among the quantities regarding motion with the help of graph 7. Realize the effect of motion in our life')

In [289]:
type(str(chunked_documents[0].metadata))

str

### Filter Metadata before storing in ChromaDB

In [290]:
def filter_complex_metadata(metadata):
    """
    Filter out complex metadata values. Only keep str, int, float, or bool values.
    """
    filtered_metadata = {}
    for key, value in metadata.items():
        if isinstance(value, (str, int, float, bool)):
            filtered_metadata[key] = value
        else:
            filtered_metadata[key] = str(value)  # Convert complex types to string
    return filtered_metadata

# Apply metadata filtering
filtered_metadatas = [filter_complex_metadata(doc.metadata) for doc in chunked_documents]


In [291]:
# Create the Chroma vector store
from langchain_chroma.vectorstores import Chroma

vector_db = Chroma.from_documents(
    documents=chunked_documents,
    embedding=embedding,
    collection_name="local-rag",
    persist_directory="./db"
)

print("Embedded Documents stored in ChromaDB successfully!")

OllamaEmbeddings: 100%|██████████| 61/61 [02:13<00:00,  2.18s/it]

Embedded Documents stored in ChromaDB successfully!



