## Ingesting PDF

In [1]:
# !pip install langchain

In [2]:
from langchain.document_loaders import PyPDFLoader

from pathlib import Path

# Location of the chapter PDF, relative to the notebook's working directory.
pdf_path = "phy_book_ch2.pdf"

# A non-empty string is always truthy, so the check has to look at the
# filesystem. Fail here with a clear message rather than continuing with
# `data` undefined.
if not Path(pdf_path).is_file():
    raise FileNotFoundError(f"PDF not found: {pdf_path}")

loader = PyPDFLoader(file_path=pdf_path)
data = loader.load()  # list of Document objects produced by the loader

## Data Inspection

In [3]:
# Preview the raw extracted text of the document at index 16 (not the first page)
print(data[16].page_content)
# data

42  Physics 
 Distance – Time table  
Time, t (min)  Distance s km  
0 0 
12 6 
24 12 
36 18 
48 24 
60 30 
                     Table 2.3      Figure 2.5 
For above mentioned motion in the table a distance-time graph shown in fig 2.5. 
Suppose from the graph we have to find the traveled distance in 32 minute by the auto-rickshaw; we have to mark a point (M) to indicate time, 32 minute on X-axis. Then we have to draw a line parallel to Y-axis from that point on the graph. Let the line at point P. Now draw a perpendicular on Y-axis from P. This perpendicular meets at point N on Y-axis. Therefore, ON is the distance traveled in 32 minutes. The graph shows that the auto-rickshaw travels 16km in this time. Therefore, from graph we find any traveled 
distance S = PM for any time t = OM. 
∴ Velocity = distance
time
    = PM
OM    = ON
OM , Here,  PM
OM is the slope of the OP. 
  
           
 
(B) In case of Non-uniform velocity : Fig 2.6 represents a distance-time graph of a 
body moving wi

In [4]:
# Elements in the pdf
from pdfminer.high_level import extract_pages

# Gather the distinct class names of the layout elements pdfminer yields
# for every page of the PDF.
element_types = {
    type(element).__name__
    for page_layout in extract_pages(pdf_path)
    for element in page_layout
}

print(list(element_types))

['LTCurve', 'LTRect', 'LTFigure', 'LTLine', 'LTTextBoxHorizontal', 'LTTextLineHorizontal']


## Remove Extra Whitespace

In [5]:
import re
from langchain.schema import Document

# Remove extra whitespace
def clean_text(text):
    """Normalise whitespace in ``text``.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space, and leading/trailing whitespace is removed so the result
    never starts or ends with a stray space.

    Parameters
    ----------
    text : str
        Raw text, e.g. the extracted content of one PDF page.

    Returns
    -------
    str
        The normalised text.
    """
    return re.sub(r'\s+', ' ', text).strip()

# Rebuild each loaded document with whitespace-normalised text; the
# original metadata is passed through unchanged.
cleaned_data = [
    Document(page_content=clean_text(doc.page_content), metadata=doc.metadata)
    for doc in data
]


In [6]:
# Same document (index 16) after whitespace normalisation
print(cleaned_data[16].page_content)

42 Physics Distance – Time table Time, t (min) Distance s km 0 0 12 6 24 12 36 18 48 24 60 30 Table 2.3 Figure 2.5 For above mentioned motion in the table a distance-time graph shown in fig 2.5. Suppose from the graph we have to find the traveled distance in 32 minute by the auto-rickshaw; we have to mark a point (M) to indicate time, 32 minute on X-axis. Then we have to draw a line parallel to Y-axis from that point on the graph. Let the line at point P. Now draw a perpendicular on Y-axis from P. This perpendicular meets at point N on Y-axis. Therefore, ON is the distance traveled in 32 minutes. The graph shows that the auto-rickshaw travels 16km in this time. Therefore, from graph we find any traveled distance S = PM for any time t = OM. ∴ Velocity = distance time = PM OM = ON OM , Here, PM OM is the slope of the OP. (B) In case of Non-uniform velocity : Fig 2.6 represents a distance-time graph of a body moving with non-uniform velocity. In this case the body does not move over equal

## Handle Math Expressions

In [7]:
import re
from langchain.schema import Document

# patterns = [
#     r'[A-Za-z]+\s*=\s*[^\n]*[A-Za-z0-9+\-*/^()]+',  # Complete equations on a single line (e.g., s = 5 * t^2 + 3 * t + 2)
#     r'[A-Za-z]+\s*=\s*[\dA-Za-z+\-*/^()]+\s*',      # Simple equations
#     r'∴\s*[^=]*',                                   # Starting with '∴' (therefore)
# ]

# Regular expressions used to spot math-like fragments in page text.
# All groups are non-capturing, (?:...): with capturing groups re.findall
# returns the group contents (e.g. '.2' or '') instead of the matched
# expression, which silently discarded every hit of the decimal-aware patterns.
# NOTE(review): in the "function calls" pattern the trailing \b after \) only
# matches when a word character directly follows the ')', so "sin(x) " is not
# matched -- confirm whether that is intended.
patterns = [
    r'\b\d+(?:\.\d+)?\s*[-+*/^]\s*\d+(?:\.\d+)?\b',  # Basic arithmetic operations (e.g., 3 + 5, 4.2 * 7)
    r'[A-Za-z]+\s*=\s*[\dA-Za-z+\-*/^()]+',           # Simple equations (e.g., x = 2 + 3)
    r'\b\d+\s*[+\-*/^()]+\s*\d+',                     # Algebraic expressions (e.g., 2 * (3 + 4))
    r'\b[A-Za-z]+\s*\d*[_^]\d+\b',                    # Variables with subscripts or superscripts (e.g., x_2, y^2)
    r'\b\w+\s*\(.*?\)\b',                             # Function calls (e.g., sin(x), log(2))
    r'\b\w+\s*=\s*\w+\s*[-+*/]\s*\w+',                # Equations with variable operations (e.g., y = x + z)
    r'\d+(?:\.\d+)?\s*[-+*/^()]+\d+',                 # Numeric expressions (e.g., 5^2, (3.14 * 2) + 4)
    r'\b\w+\s*\(\s*\w+\s*\)\s*=\s*[\w\d+\-*/^()]+',   # Function definitions (e.g., f(x) = x^2 + 2x)
    r'[\dA-Za-z]+\s*=\s*[\dA-Za-z+\-*/^()]+',         # General form of equations (e.g., E = mc^2)
]

def expr_cleanup(math_expressions):
    """Reduce raw regex hits to a list of distinct, maximal math expressions.

    Steps:
      1. drop exact duplicates,
      2. order candidates from longest to shortest,
      3. discard any candidate that is contained in a longer candidate
         (a partial match of the same expression),
      4. discard candidates without any of the symbols ``= + - * / ^``
         (plain text picked up by a loose pattern).

    Parameters
    ----------
    math_expressions : iterable of str
        Raw matches, possibly with duplicates and overlapping fragments.

    Returns
    -------
    list of str
        Surviving expressions, longest first; expressions of equal length
        keep the order in which they first appeared in the input.
    """
    # dict.fromkeys removes duplicates while preserving first-seen order.
    # A set would make the order of equal-length results depend on string
    # hashing, which varies between interpreter runs.
    candidates = sorted(dict.fromkeys(math_expressions), key=len, reverse=True)

    final_expressions = []
    for i, expr in enumerate(candidates):
        # Must contain at least one mathematical symbol.
        if not re.search(r'[=+\-*/^]', expr):
            continue
        # candidates[:i] are all at least as long as expr; being a substring
        # of one of them marks expr as a partial match.
        if any(expr in larger_expr for larger_expr in candidates[:i]):
            continue
        final_expressions.append(expr)
    return final_expressions

def extract_math_expressions(text):
    """Find math-like expressions in ``text``.

    Every regex in the module-level ``patterns`` list is applied to the
    text and the hits are passed through ``expr_cleanup``.

    Parameters
    ----------
    text : str
        Text to scan (e.g. the content of one page).

    Returns
    -------
    list of str
        Whatever ``expr_cleanup`` returns for the collected matches.
    """
    # Use finditer + group(0) so the full matched text is collected even
    # when a pattern contains capturing groups. re.findall would return only
    # the group contents in that case, and joining those yields fragments
    # such as '' or '.2' instead of the expression.
    math_expressions = [
        match.group(0)
        for pattern in patterns
        for match in re.finditer(pattern, text)
    ]
    return expr_cleanup(math_expressions)

In [8]:
# Smoke-test the extractor on a short hand-written passage
sample_text = """
The equation of motion is s = 5 * t^2 + 3 * t + 2, and velocity is given by v = ds / dt.
∴ Velocity = distance / time = PM / OM = ON / OM.
"""

expr = extract_math_expressions(sample_text)
# One expression per line; nothing is printed when no expression was found.
if expr:
    print("\n".join(expr))

Velocity = distance / time
OM = ON / OM
v = ds / dt
s = 5 * t
time = PM
2 + 3
t^2


In [9]:
# Store the expressions found in each document on that document's metadata
for doc in cleaned_data:
    found = extract_math_expressions(doc.page_content)

    # A list result is flattened into one "; "-separated string; any other
    # type is stored as returned.
    math_exprs = "; ".join(found) if isinstance(found, list) else found

    doc.metadata['math_expressions'] = math_exprs

In [10]:
# Math-expression string stored on the document at index 16
cleaned_data[16].metadata['math_expressions']

'Velocity = distance; time = PM; OM = ON; S = PM; t = OM'

In [11]:
# Check the type of the stored metadata value
type(cleaned_data[16].metadata['math_expressions'])

str

In [12]:
# for i in cleaned_data[16].metadata['math_expressions']: print(i)

## Process Tables

In [13]:
# extract tables
import pdfplumber
# Collect every table found in the PDF, page by page.
tables = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # extract_table() returns a single table per page, so additional
        # tables on the same page would be lost. extract_tables() returns
        # all of them. Empty results are skipped.
        tables.extend(table for table in page.extract_tables() if table)

In [14]:
# Inspect the first extracted table: a list of rows, each a list of cell values
tables[0]

[['Scalar Quantity', None, None, 'Vector quantity', None, None],
 ['Distance', 'd', '40m', 'Displacement', 's', '40m east direction'],
 ['Speed', 'v', '30ms-1', 'Velocity', 'v', '30 ms-1 north direction'],
 ['Time', 't', '15s', 'Force', 'F', '100N upward direction'],
 ['Energy', 'E', '2000j', 'Acceleration', 'a', '98ms-2 downward direction']]

In [15]:
# How many tables were collected from the PDF
len(tables)

6

In [16]:
# Convert tables to DataFrames
import pandas as pd

# Build one DataFrame per table: the first row supplies the column labels,
# the remaining rows are the data.
table_dfs = []
for table in tables:
    header, body = table[0], table[1:]
    table_dfs.append(pd.DataFrame(body, columns=header))
# table_texts = [df.to_string(index=False) for df in table_dfs]

In [17]:
# table_texts[0]

In [18]:
# Inspect the first table as a DataFrame (column labels come from its first row)
table_dfs[0]

Unnamed: 0,Scalar Quantity,None,None.1,Vector quantity,None.2,None.3
0,Distance,d,40m,Displacement,s,40m east direction
1,Speed,v,30ms-1,Velocity,v,30 ms-1 north direction
2,Time,t,15s,Force,F,100N upward direction
3,Energy,E,2000j,Acceleration,a,98ms-2 downward direction


In [19]:
# Function to clean up columns: remove empty columns and make names unique
def clean_dataframe(df, keep_unnamed=False):
    """Handle unnamed columns, then make the column labels unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column labels may contain null values (None/NaN), empty
        strings, or duplicates.
    keep_unnamed : bool, default False
        False keeps the original behaviour: every column whose label is null
        or '' is dropped, together with the values in it.
        True keeps those columns and labels each one with the nearest named
        column to its left (``column_<position>`` if there is none); the
        resulting duplicates are then suffixed like any other duplicate.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame; ``df`` itself is left unmodified. Duplicate labels
        become ``label, label_1, label_2, ...`` in column order.

    Notes
    -----
    NOTE(review): the first table in this notebook has None header cells above
    populated columns (see the DataFrame preview), so the default setting
    discards real data there -- consider calling with ``keep_unnamed=True``.
    """
    labels = list(df.columns)
    is_unnamed = [bool(pd.isna(label)) or label == '' for label in labels]

    if keep_unnamed:
        # Give each unnamed column the label of the closest named column on
        # its left, as happens when a header cell spans several columns.
        filled, last_named = [], None
        for position, (label, unnamed) in enumerate(zip(labels, is_unnamed)):
            if unnamed:
                filled.append(last_named if last_named is not None else f"column_{position}")
            else:
                last_named = label
                filled.append(label)
        df = df.copy()
        df.columns = filled
    else:
        # Positional selection keeps duplicate labels intact.
        df = df.iloc[:, [position for position, unnamed in enumerate(is_unnamed) if not unnamed]]

    # Report duplicates before renaming them.
    if not df.columns.is_unique:
        print(f"Duplicate columns found in DataFrame:\n{df.columns[df.columns.duplicated()]}")

    # Labels that occur more than once, in order of their first repetition.
    names = list(df.columns)
    seen, repeated = set(), []
    for name in names:
        if name in seen and name not in repeated:
            repeated.append(name)
        seen.add(name)

    # First occurrence keeps its label; later ones get _1, _2, ...
    # The f-string also works for non-string labels (e.g. integers).
    for dup in repeated:
        positions = [position for position, name in enumerate(names) if name == dup]
        for count, position in enumerate(positions):
            if count:
                names[position] = f"{dup}_{count}"
    df.columns = names

    return df

In [20]:
# Function to ensure unique column names
def make_columns_unique(df):
    """Rename duplicate column labels of ``df`` in place and return ``df``.

    For each label that occurs more than once, the first occurrence keeps its
    label and the following ones become ``label_1``, ``label_2``, ... in
    column order. A notice listing the duplicates is printed when any exist.
    """
    current = df.columns
    if not current.is_unique:
        print(f"Duplicate columns found in DataFrame:\n{current[current.duplicated()]}")

    names = list(current)

    # Labels seen more than once, ordered by where they first repeat.
    seen, repeated = set(), []
    for name in names:
        if name in seen and name not in repeated:
            repeated.append(name)
        seen.add(name)

    for dup in repeated:
        # Positions are looked up against the current (partly renamed) list.
        positions = [position for position, name in enumerate(names) if name == dup]
        renamed = [dup + '_' + str(count) if count != 0 else dup for count in range(len(positions))]
        for position, new_name in zip(positions, renamed):
            names[position] = new_name

    df.columns = names
    return df

In [21]:
# Serialise every cleaned table to a JSON string (one object per row)
table_jsons = []
for df in table_dfs:
    df_cleaned = clean_dataframe(df)
    print(f"Unique columns for cleaned DataFrame:\n{df_cleaned.columns}")

    # Tables left empty by the cleaning step are not serialised.
    if df_cleaned.empty:
        continue
    table_jsons.append(df_cleaned.to_json(orient='records'))

Unique columns for cleaned DataFrame:
Index(['Scalar Quantity', 'Vector quantity'], dtype='object')
Unique columns for cleaned DataFrame:
Index(['Serial No', 'Time (s)', 'Velocity (kmh-1)', 'Velocity (ms-1)'], dtype='object')
Unique columns for cleaned DataFrame:
Index(['Time, t (min)', 'Distance s km'], dtype='object')
Unique columns for cleaned DataFrame:
Index(['Do by yourself : The velocity of a car after every five seconds is given in the table below.'], dtype='object')
Unique columns for cleaned DataFrame:
Index(['Reading', 'Traveled distance i.e. length of\nthe plank', 'Time (s)',
       'Average speed'],
      dtype='object')
Unique columns for cleaned DataFrame:
Index(['Reading', 'Traveled distance (m)', 'Time (s)',
       'Average speed = distance /\ntime'],
      dtype='object')


In [22]:
# Print each table's JSON string on its own line
for i in table_jsons: print(i)

[{"Scalar Quantity":"Distance","Vector quantity":"Displacement"},{"Scalar Quantity":"Speed","Vector quantity":"Velocity"},{"Scalar Quantity":"Time","Vector quantity":"Force"},{"Scalar Quantity":"Energy","Vector quantity":"Acceleration"}]
[{"Serial No":"1.","Time (s)":"0","Velocity (kmh-1)":"0","Velocity (ms-1)":"0"},{"Serial No":"2.","Time (s)":"8","Velocity (kmh-1)":"14.4","Velocity (ms-1)":"4"},{"Serial No":"3.","Time (s)":"16","Velocity (kmh-1)":"28.8","Velocity (ms-1)":"8"},{"Serial No":"4.","Time (s)":"24","Velocity (kmh-1)":"43.2","Velocity (ms-1)":"12"},{"Serial No":"5.","Time (s)":"32","Velocity (kmh-1)":"57.6","Velocity (ms-1)":"16"},{"Serial No":"6.","Time (s)":"40","Velocity (kmh-1)":"72.0","Velocity (ms-1)":"20"}]
[{"Time, t (min)":"0","Distance s km":"0"},{"Time, t (min)":"12","Distance s km":"6"},{"Time, t (min)":"24\n36","Distance s km":"12\n18"},{"Time, t (min)":"48","Distance s km":"24"},{"Time, t (min)":"60","Distance s km":"30"}]
[{"Do by yourself : The velocity of a

In [23]:
# Number of tables that were serialised to JSON
len(table_jsons)

6

In [24]:
# Create documents for the tables

# table_data = [Document(page_content=text, metadata={'source': 'table'}) for text in table_jsons]
# Wrap each table's JSON string in a Document tagged as a table.
# NOTE(review): 'page' is the table's position in table_jsons, not a PDF
# page number -- confirm this is what downstream consumers expect.
table_data = []
for i, text in enumerate(table_jsons):
    table_data.append(
        Document(page_content=text, metadata={'source': 'table', 'page': i, 'type': "table"})
    )

# table_data = [
#     {"page_content": text, "metadata": {"type": "table", "source": "table", "page": i}} 
#     for i, text in enumerate(table_jsons)
# ]

In [25]:
# Preview the first five table documents
table_data[:5]

[Document(metadata={'source': 'table', 'page': 0, 'type': 'table'}, page_content='[{"Scalar Quantity":"Distance","Vector quantity":"Displacement"},{"Scalar Quantity":"Speed","Vector quantity":"Velocity"},{"Scalar Quantity":"Time","Vector quantity":"Force"},{"Scalar Quantity":"Energy","Vector quantity":"Acceleration"}]'),
 Document(metadata={'source': 'table', 'page': 1, 'type': 'table'}, page_content='[{"Serial No":"1.","Time (s)":"0","Velocity (kmh-1)":"0","Velocity (ms-1)":"0"},{"Serial No":"2.","Time (s)":"8","Velocity (kmh-1)":"14.4","Velocity (ms-1)":"4"},{"Serial No":"3.","Time (s)":"16","Velocity (kmh-1)":"28.8","Velocity (ms-1)":"8"},{"Serial No":"4.","Time (s)":"24","Velocity (kmh-1)":"43.2","Velocity (ms-1)":"12"},{"Serial No":"5.","Time (s)":"32","Velocity (kmh-1)":"57.6","Velocity (ms-1)":"16"},{"Serial No":"6.","Time (s)":"40","Velocity (kmh-1)":"72.0","Velocity (ms-1)":"20"}]'),
 Document(metadata={'source': 'table', 'page': 2, 'type': 'table'}, page_content='[{"Time, t (

In [26]:
# Combine the page-text documents with the table documents; the table
# documents are appended after the page-text documents
combined_data = cleaned_data + table_data

In [27]:
# Report how many documents each collection holds; the combined size should
# equal the sum of the other two
print("size of cleaned_data:\t",len(cleaned_data))
print("size of table_data:\t",len(table_data))
print("size of combined_data:\t",len(combined_data))


size of cleaned_data:	 22
size of table_data:	 6
size of combined_data:	 28


In [28]:
# First combined document (a page-text document, since those come first)
combined_data[0]

Document(metadata={'source': 'phy_book_ch2.pdf', 'page': 0, 'math_expressions': ''}, page_content='26 Physics Chapter Two MOTION [The object, that we see around us either are stationery or in motion. What do we actually understand by the words ``rest’’ and ``motion’’. We need different quantities regarding motion to express the characteristics of motion of a moving object. In this chapter we will discuss different quantities regarding motion, their dimensions, units, the relations among them etc.] By the end of this chapter we will be able to - 1. Explain the rest and motion 2. Find out the difference among different types of motion. 3. Explain the scalar and vector quantities 4. Analyze the relation among the quantities regarding motion 5. Explain the motion of freely falling bodies 6. Analyze the relations among the quantities regarding motion with the help of graph 7. Realize the effect of motion in our life ')

In [29]:
# Last five table documents
table_data[-5:]

[Document(metadata={'source': 'table', 'page': 1, 'type': 'table'}, page_content='[{"Serial No":"1.","Time (s)":"0","Velocity (kmh-1)":"0","Velocity (ms-1)":"0"},{"Serial No":"2.","Time (s)":"8","Velocity (kmh-1)":"14.4","Velocity (ms-1)":"4"},{"Serial No":"3.","Time (s)":"16","Velocity (kmh-1)":"28.8","Velocity (ms-1)":"8"},{"Serial No":"4.","Time (s)":"24","Velocity (kmh-1)":"43.2","Velocity (ms-1)":"12"},{"Serial No":"5.","Time (s)":"32","Velocity (kmh-1)":"57.6","Velocity (ms-1)":"16"},{"Serial No":"6.","Time (s)":"40","Velocity (kmh-1)":"72.0","Velocity (ms-1)":"20"}]'),
 Document(metadata={'source': 'table', 'page': 2, 'type': 'table'}, page_content='[{"Time, t (min)":"0","Distance s km":"0"},{"Time, t (min)":"12","Distance s km":"6"},{"Time, t (min)":"24\\n36","Distance s km":"12\\n18"},{"Time, t (min)":"48","Distance s km":"24"},{"Time, t (min)":"60","Distance s km":"30"}]'),
 Document(metadata={'source': 'table', 'page': 3, 'type': 'table'}, page_content='[{"Do by yourself : T

In [30]:
# Last five combined documents (the table documents were appended last)
combined_data[-5:]

[Document(metadata={'source': 'table', 'page': 1, 'type': 'table'}, page_content='[{"Serial No":"1.","Time (s)":"0","Velocity (kmh-1)":"0","Velocity (ms-1)":"0"},{"Serial No":"2.","Time (s)":"8","Velocity (kmh-1)":"14.4","Velocity (ms-1)":"4"},{"Serial No":"3.","Time (s)":"16","Velocity (kmh-1)":"28.8","Velocity (ms-1)":"8"},{"Serial No":"4.","Time (s)":"24","Velocity (kmh-1)":"43.2","Velocity (ms-1)":"12"},{"Serial No":"5.","Time (s)":"32","Velocity (kmh-1)":"57.6","Velocity (ms-1)":"16"},{"Serial No":"6.","Time (s)":"40","Velocity (kmh-1)":"72.0","Velocity (ms-1)":"20"}]'),
 Document(metadata={'source': 'table', 'page': 2, 'type': 'table'}, page_content='[{"Time, t (min)":"0","Distance s km":"0"},{"Time, t (min)":"12","Distance s km":"6"},{"Time, t (min)":"24\\n36","Distance s km":"12\\n18"},{"Time, t (min)":"48","Distance s km":"24"},{"Time, t (min)":"60","Distance s km":"30"}]'),
 Document(metadata={'source': 'table', 'page': 3, 'type': 'table'}, page_content='[{"Do by yourself : T

In [31]:
# First five combined documents
combined_data[:5]

[Document(metadata={'source': 'phy_book_ch2.pdf', 'page': 0, 'math_expressions': ''}, page_content='26 Physics Chapter Two MOTION [The object, that we see around us either are stationery or in motion. What do we actually understand by the words ``rest’’ and ``motion’’. We need different quantities regarding motion to express the characteristics of motion of a moving object. In this chapter we will discuss different quantities regarding motion, their dimensions, units, the relations among them etc.] By the end of this chapter we will be able to - 1. Explain the rest and motion 2. Find out the difference among different types of motion. 3. Explain the scalar and vector quantities 4. Analyze the relation among the quantities regarding motion 5. Explain the motion of freely falling bodies 6. Analyze the relations among the quantities regarding motion with the help of graph 7. Realize the effect of motion in our life '),
 Document(metadata={'source': 'phy_book_ch2.pdf', 'page': 1, 'math_exp

## Split and Chunking

In [32]:
# !pip install langchain_text_splitters

In [33]:
# Split and chunk
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure the splitter: target chunk size 7500 and overlap 100, both
# measured with len (i.e. in characters), then split every combined document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 7500, 
    chunk_overlap = 100,
    length_function = len,
)
chunked_document = text_splitter.split_documents(combined_data)

In [34]:
# Compare document count before and after splitting; equal numbers mean no
# document was split into more than one chunk
print(len(combined_data))
print(len(chunked_document))

28
28


In [35]:
# Preview the first five chunks
chunked_document[:5]

[Document(metadata={'source': 'phy_book_ch2.pdf', 'page': 0, 'math_expressions': ''}, page_content='26 Physics Chapter Two MOTION [The object, that we see around us either are stationery or in motion. What do we actually understand by the words ``rest’’ and ``motion’’. We need different quantities regarding motion to express the characteristics of motion of a moving object. In this chapter we will discuss different quantities regarding motion, their dimensions, units, the relations among them etc.] By the end of this chapter we will be able to - 1. Explain the rest and motion 2. Find out the difference among different types of motion. 3. Explain the scalar and vector quantities 4. Analyze the relation among the quantities regarding motion 5. Explain the motion of freely falling bodies 6. Analyze the relations among the quantities regarding motion with the help of graph 7. Realize the effect of motion in our life'),
 Document(metadata={'source': 'phy_book_ch2.pdf', 'page': 1, 'math_expr

In [36]:
# len(chunked_document[12].metadata['math_expressions'])
# len(chunked_document[30].page_content)

In [37]:
# Check the container type returned by split_documents
type(chunked_document)

list

## Filter Metadata before storing in ChromaDB

In [38]:
# def filter_complex_metadata(metadata):
#     """
#     Filter out complex metadata values. Only keep str, int, float, or bool values.
#     """
#     filtered_metadata = {}
#     for key, value in metadata.items():
#         if isinstance(value, (str, int, float, bool)):
#             filtered_metadata[key] = value
#         else:
#             filtered_metadata[key] = str(value)  # Convert complex types to string
#     return filtered_metadata

# # Apply metadata filtering
# filtered_metadatas = [filter_complex_metadata(doc.metadata) for doc in chunked_document]

## Vector Embedding

In [45]:
# Create the embedding model used for the chunked documents:
# OllamaEmbeddings with the "nomic-embed-text" model
from langchain_ollama import OllamaEmbeddings
embedding_model = OllamaEmbeddings(model="nomic-embed-text")

In [41]:
# # # Embed the chunked document using SentenceTransformer (all-MiniLM-L6-v2)
# from langchain_huggingface import HuggingFaceEmbeddings
# embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange





In [43]:
# Example usage: embed one sentence and show the resulting vector
sentence = "This is a test sentence."
# embed_documents takes a list of texts. A bare string is not a list of texts
# and, depending on the embedding backend, may be treated as a sequence of
# single characters, so wrap the sentence in a list.
embedding = embedding_model.embed_documents([sentence])
print("Sentence embedding:", embedding)

Sentence embedding: [[-0.058957457542419434, 0.05387329310178757, 0.008316757157444954, 0.020680494606494904, 0.002200970659032464, -0.03443445265293121, 0.08473175764083862, 0.10776820778846741, 0.05895735323429108, -0.01663864217698574, 0.06211816892027855, -0.07404918968677521, -0.03030318021774292, 0.038075823336839676, -0.004251330159604549, -0.007986344397068024, 0.0031453315168619156, -0.07220275700092316, -0.015011931769549847, -0.07136203348636627, -0.08817398548126221, 0.04520198330283165, 0.009071133099496365, 0.041956428438425064, -0.0012929183430969715, -0.014840221963822842, -0.039426445960998535, 0.007077444344758987, -0.036413632333278656, -0.04289919510483742, -0.04982231557369232, 0.009418360888957977, -0.02269933931529522, 0.010238062590360641, -0.004453901667147875, -0.07801614701747894, -0.07069573551416397, -0.005806398577988148, 0.038758136332035065, 0.012042265385389328, -0.05325376242399216, -0.09637930244207382, -0.02809724025428295, -0.00988378468900919, 0.06

## Store in ChromaDB

In [46]:
# Create the Chroma vector store
from langchain_chroma.vectorstores import Chroma
# Embed the chunks and persist them in a Chroma collection.
# Position-based ids make the cell safe to re-run: documents with an existing
# id are overwritten, whereas the default random ids would add a second copy
# of every chunk to the persisted collection on each run.
try:
    vector_db = Chroma.from_documents(
        documents=chunked_document,
        embedding=embedding_model,
        ids=[f"chunk-{position}" for position in range(len(chunked_document))],
        collection_name="local-rag",
        persist_directory="./db/db_nomic"
    )
    print("Embedded Documents stored in ChromaDB successfully!")
except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise so the failure (and its traceback) is not mistaken for success
    # and `vector_db` is not silently left undefined.
    raise


Embedded Documents stored in ChromaDB successfully!
