## Ingesting PDF

In [1]:
# !pip install langchain

In [1]:
import os

from langchain.document_loaders import PyPDFLoader

pdf_path = "phy_book.pdf"

# Check that the file really exists on disk. Testing the truthiness of the
# path string (as before) is always True for a non-empty literal, so a missing
# file was never reported and later cells failed on an undefined `data`.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"PDF not found: {pdf_path}")

# One Document per page, as returned by the loader.
loader = PyPDFLoader(file_path=pdf_path)
data = loader.load()

## Data Inspection

In [2]:
# Preview the extracted text of one sample page (list index 16, not the first page)
print(data[16].page_content)
# data

12  Physics 
has the dimension of mass (1) dimension of length (1) dimension of time (-2). (The 
equation to express the dimension of physical quantity is called the dimensional 
equation). Third bracket [ ] is used to i ndicate dimensions in any quantity. As for 
example, the dimensional equation of force is [ F ] = [ MLT-2 ]  
Except these above mentioned three physical quantities of length, mass and time others 
dimension of physical quantities are :  
The dimension of temperature as θ (Capital alphabet of Greek letter θ), the dimension of 
electric current as I, the dimension of luminous intensity as J and the dimension of 
amount of substance as N.  
We can verify the validity of any equation or formula by analyzing dimension. For 
example, the following equation may be considered:  
                  S = ut + 1
2  at2 
We know that addition, subtraction or equiva lence are possible for any same kind of 
quantities. Hence every term of an equation must indicate the same kind of qu

In [3]:
# Collect the distinct layout-element class names that pdfminer reports for the PDF
from pdfminer.high_level import extract_pages

element_types = {
    type(element).__name__
    for page_layout in extract_pages(pdf_path)
    for element in page_layout
}

print(list(element_types))

['LTRect', 'LTTextLineHorizontal', 'LTTextBoxHorizontal', 'LTFigure', 'LTCurve', 'LTLine']


## Remove Extra Whitespace

In [4]:
import re
from langchain.schema import Document

def clean_text(text):
    """Return ``text`` with every run of whitespace collapsed to one space.

    Newlines and tabs count as whitespace. Leading/trailing whitespace is
    collapsed to a single space as well, not stripped.
    """
    return re.compile(r'\s+').sub(' ', text)

# Rebuild each loaded page as a new Document whose text has been
# whitespace-normalised, passing the page's metadata along.
cleaned_data = [
    Document(page_content=clean_text(page.page_content), metadata=page.metadata)
    for page in data
]


In [5]:
print(cleaned_data[16].page_content)

12 Physics has the dimension of mass (1) dimension of length (1) dimension of time (-2). (The equation to express the dimension of physical quantity is called the dimensional equation). Third bracket [ ] is used to i ndicate dimensions in any quantity. As for example, the dimensional equation of force is [ F ] = [ MLT-2 ] Except these above mentioned three physical quantities of length, mass and time others dimension of physical quantities are : The dimension of temperature as θ (Capital alphabet of Greek letter θ), the dimension of electric current as I, the dimension of luminous intensity as J and the dimension of amount of substance as N. We can verify the validity of any equation or formula by analyzing dimension. For example, the following equation may be considered: S = ut + 1 2 at2 We know that addition, subtraction or equiva lence are possible for any same kind of quantities. Hence every term of an equation must indicate the same kind of quantity, that is, the dimension of ever

## Handle Math Expressions

In [6]:
import re
from langchain.schema import Document

# patterns = [
#     r'[A-Za-z]+\s*=\s*[^\n]*[A-Za-z0-9+\-*/^()]+',  # Complete equations on a single line (e.g., s = 5 * t^2 + 3 * t + 2)
#     r'[A-Za-z]+\s*=\s*[\dA-Za-z+\-*/^()]+\s*',      # Simple equations
#     r'∴\s*[^=]*',                                   # Starting with '∴' (therefore)
# ]

# Regular expressions used to spot math-like fragments in page text.
# Several of them overlap on purpose; expr_cleanup() removes the shorter
# fragments that are contained in a longer match.
patterns = [
    r'\b\d+(\.\d+)?\s*[-+*/^]\s*\d+(\.\d+)?\b',  # Basic arithmetic operations (e.g., 3 + 5, 4.2 * 7)
    r'[A-Za-z]+\s*=\s*[\dA-Za-z+\-*/^()]+',       # Simple equations (e.g., x = 2 + 3)
    r'\b\d+\s*[+\-*/^()]+\s*\d+',                 # Algebraic expressions (e.g., 2 * (3 + 4))
    r'\b[A-Za-z]+\s*\d*[_^]\d+\b',                # Variables with subscripts or superscripts (e.g., x_2, y^2)
    r'\b\w+\s*\(.*?\)\b',                         # Function calls (e.g., sin(x), log(2))
    r'\b\w+\s*=\s*\w+\s*[-+*/]\s*\w+',            # Equations with variable operations (e.g., y = x + z)
    r'\d+(\.\d+)?\s*[-+*/^()]+\d+',               # Numeric expressions (e.g., 5^2, (3.14 * 2) + 4)
    r'\b\w+\s*\(\s*\w+\s*\)\s*=\s*[\w\d+\-*/^()]+',# Function definitions (e.g., f(x) = x^2 + 2x)
    r'[\dA-Za-z]+\s*=\s*[\dA-Za-z+\-*/^()]+',      # General form of equations (e.g., E = mc^2)
]


def expr_cleanup(math_expressions):
    """De-duplicate candidate expressions and drop partial / non-math ones.

    Args:
        math_expressions: iterable of candidate expression strings.

    Returns:
        A list of expressions, longest first (ties in alphabetical order so
        the result is reproducible between runs). A candidate is dropped when
        it is a substring of any longer candidate, or when it contains none
        of the symbols ``= + - * / ^``.
    """
    # Longest first so that every possible "container" precedes its fragments.
    # The secondary key makes the order of equal-length items deterministic
    # (plain set iteration order varies with string hash randomisation).
    candidates = sorted(set(math_expressions), key=lambda e: (-len(e), e))

    final_expressions = []
    for i, expr in enumerate(candidates):
        # Skip fragments already covered by a longer candidate.
        if any(expr in larger_expr for larger_expr in candidates[:i]):
            continue
        # Keep only text that actually carries a mathematical symbol.
        if re.search(r'[=+\-*/^]', expr):
            final_expressions.append(expr)
    return final_expressions


def extract_math_expressions(text):
    """Return the math-like expressions found in ``text``.

    Every regex in ``patterns`` is applied to the text and the combined
    matches are filtered through ``expr_cleanup``.

    Args:
        text: the string to scan.

    Returns:
        List of expression strings as produced by ``expr_cleanup``.
    """
    math_expressions = []
    for pattern in patterns:
        # Use the whole match (group 0). re.findall() returns only the
        # capture groups for patterns that contain them, which turned e.g.
        # "4.2 * 7.5" into ".2.5" and plain integer matches into "".
        math_expressions.extend(m.group(0) for m in re.finditer(pattern, text))
    return expr_cleanup(math_expressions)

In [7]:
# Smoke-test the extractor on a small hand-written snippet
sample_text = """
The equation of motion is s = 5 * t^2 + 3 * t + 2, and velocity is given by v = ds / dt.
∴ Velocity = distance / time = PM / OM = ON / OM.
"""

expr = extract_math_expressions(sample_text)
# One expression per output line
for i in expr:
    print(i)

Velocity = distance / time
OM = ON / OM
v = ds / dt
s = 5 * t
time = PM
2 + 3
t^2


In [8]:
# Annotate every cleaned page with the math expressions found in its text.
# The value is stored as one "; "-separated string rather than a list.
for doc in cleaned_data:
    math_exprs = extract_math_expressions(doc.page_content)
    math_exprs = "; ".join(math_exprs) if isinstance(math_exprs, list) else math_exprs
    doc.metadata['math_expressions'] = math_exprs

In [9]:
cleaned_data[16].metadata['math_expressions']

'S = ut + 1; at2 = LT-2; ut = LT-1; T2 = LT-2; T = LT-1'

In [10]:
type(cleaned_data[16].metadata['math_expressions'])

str

In [11]:
# for i in cleaned_data[16].metadata['math_expressions']: print(i)

## Process Tables

In [12]:
# Pull one table per page with pdfplumber, skipping pages that yield nothing
import pdfplumber

with pdfplumber.open(pdf_path) as pdf:
    # extract_table() is called once per page; falsy results are dropped.
    per_page = (page.extract_table() for page in pdf.pages)
    tables = [table for table in per_page if table]

In [13]:
tables[0]

[['Chapter', 'Subject', 'Page No'],
 ['One', 'PHYSICAL QUANTITIES AND MEASUREMENT', '1-25'],
 ['Two', 'MOTION', '26-47'],
 ['Three', 'FORCE', '48-67'],
 ['Four', 'WORK, POWER AND ENERGY', '68-86'],
 ['Five', 'PRESSURE AND STATES OF MATTER', '87-99'],
 ['Six', 'EFFECT OF HEAT ON SUBSTANCES', '100-113'],
 ['Seven', 'WAVES AND SOUND', '114-125'],
 ['Eight', 'REFLECTION OF LIGHT', '126-141'],
 ['Nine', 'REFRACTION OF LIGHT', '142-159'],
 ['Ten', 'STATICAL ELECTRICITY', '160-174'],
 ['Eleven', 'CURRENT ELECTRICITY', '175-197'],
 ['Twelve', 'MAGNETIC EFFECT OF CURRENT', '198-208'],
 ['Thirteen', 'MODERN PHYSICS AND ELECTRONICS', '209-226'],
 ['Fourteen', 'PHYSICS TO SAVE LIFE', '227-240'],
 ['', None, None]]

In [14]:
len(tables)

68

In [15]:
# Turn each extracted table into a DataFrame: first row -> column labels,
# remaining rows -> data.
import pandas as pd

table_dfs = []
for table in tables:
    header, rows = table[0], table[1:]
    table_dfs.append(pd.DataFrame(rows, columns=header))
# table_texts = [df.to_string(index=False) for df in table_dfs]

In [16]:
# table_texts[0]

In [17]:
table_dfs[0]

Unnamed: 0,Chapter,Subject,Page No
0,One,PHYSICAL QUANTITIES AND MEASUREMENT,1-25
1,Two,MOTION,26-47
2,Three,FORCE,48-67
3,Four,"WORK, POWER AND ENERGY",68-86
4,Five,PRESSURE AND STATES OF MATTER,87-99
5,Six,EFFECT OF HEAT ON SUBSTANCES,100-113
6,Seven,WAVES AND SOUND,114-125
7,Eight,REFLECTION OF LIGHT,126-141
8,Nine,REFRACTION OF LIGHT,142-159
9,Ten,STATICAL ELECTRICITY,160-174


In [18]:
def clean_dataframe(df):
    """Drop unnamed columns and make the remaining column labels unique.

    Args:
        df: DataFrame whose column labels may contain ``None``/NaN, empty
            strings, or duplicates.

    Returns:
        A new DataFrame (the input is not modified) without the null- or
        ''-labelled columns. The first occurrence of a duplicated label is
        kept unchanged; later occurrences get ``_1``, ``_2``, ... appended.
    """
    # Keep only columns with a real (non-null, non-empty) label.
    keep = df.columns.notnull() & (df.columns != '')
    df = df.loc[:, keep].copy()

    # Report duplicates before renaming them.
    if not df.columns.is_unique:
        print(f"Duplicate columns found in DataFrame:\n{df.columns[df.columns.duplicated()]}")

    # Build the new labels as a plain list. f-string formatting also works
    # for non-string labels, where `label + '_' + str(i)` raised TypeError.
    seen = {}
    unique_cols = []
    for col in df.columns:
        count = seen.get(col, 0)
        unique_cols.append(col if count == 0 else f"{col}_{count}")
        seen[col] = count + 1
    df.columns = unique_cols

    return df

In [19]:
def make_columns_unique(df):
    """Rename duplicated column labels of ``df`` in place and return it.

    The first occurrence of a label is kept unchanged; later occurrences get
    ``_1``, ``_2``, ... appended. Duplicates are printed before renaming.

    Args:
        df: DataFrame to fix; its ``columns`` attribute is reassigned.

    Returns:
        The same DataFrame object, now with unique column labels.
    """
    if not df.columns.is_unique:
        print(f"Duplicate columns found in DataFrame:\n{df.columns[df.columns.duplicated()]}")

    # Build the new labels as a plain list. f-string formatting also works
    # for non-string labels (e.g. ints or None), where
    # `label + '_' + str(i)` raised TypeError.
    seen = {}
    unique_cols = []
    for col in df.columns:
        count = seen.get(col, 0)
        unique_cols.append(col if count == 0 else f"{col}_{count}")
        seen[col] = count + 1
    df.columns = unique_cols
    return df

In [20]:
# Serialise every cleaned table to a JSON string (one record per row)
table_jsons = []
for df in table_dfs:
    df_cleaned = clean_dataframe(df)
    print(f"Unique columns for cleaned DataFrame:\n{df_cleaned.columns}")
    # Tables that end up empty after cleaning are left out entirely.
    if df_cleaned.empty:
        continue
    table_jsons.append(df_cleaned.to_json(orient='records'))

Unique columns for cleaned DataFrame:
Index(['Chapter', 'Subject', 'Page No'], dtype='object')
Unique columns for cleaned DataFrame:
Index([], dtype='object')
Unique columns for cleaned DataFrame:
Index(['SL No', 'Name of Physical Quantities', 'Symbol of\nquantities',
       'SI Unit', 'Symbol\nfor unit'],
      dtype='object')
Unique columns for cleaned DataFrame:
Index(['desi', '10-1', 'd', '1 deci ohm = 1 dΩ = 10-1Ω'], dtype='object')
Unique columns for cleaned DataFrame:
Index(['Rectangular\nbody’s', 'Number of\nobservation',
       'main\nscale\nreading\nM (cm)', 'Vernier\nsuper\nimposition\nV',
       'Vernier\nconstant\nVC (cm)', 'Reading\nM + V x\nVC\n(cm)',
       'Mean\nreading\n(cm)'],
      dtype='object')
Unique columns for cleaned DataFrame:
Index(['Number\nof\nobservati\non', 'Linear\nscale\nreadin\ng L\n(mm)',
       'Number\nof\ndivisions\nin the\ncircular\nscale C',
       'Least\ncount LC\n(mm)', 'Diameter\nd = L + C x LC\n(mm)',
       'Average\ndiameter\nd = d ±e\n

In [21]:
for i in table_jsons: print(i)

[{"Chapter":"One","Subject":"PHYSICAL QUANTITIES AND MEASUREMENT","Page No":"1-25"},{"Chapter":"Two","Subject":"MOTION","Page No":"26-47"},{"Chapter":"Three","Subject":"FORCE","Page No":"48-67"},{"Chapter":"Four","Subject":"WORK, POWER AND ENERGY","Page No":"68-86"},{"Chapter":"Five","Subject":"PRESSURE AND STATES OF MATTER","Page No":"87-99"},{"Chapter":"Six","Subject":"EFFECT OF HEAT ON SUBSTANCES","Page No":"100-113"},{"Chapter":"Seven","Subject":"WAVES AND SOUND","Page No":"114-125"},{"Chapter":"Eight","Subject":"REFLECTION OF LIGHT","Page No":"126-141"},{"Chapter":"Nine","Subject":"REFRACTION OF LIGHT","Page No":"142-159"},{"Chapter":"Ten","Subject":"STATICAL ELECTRICITY","Page No":"160-174"},{"Chapter":"Eleven","Subject":"CURRENT ELECTRICITY","Page No":"175-197"},{"Chapter":"Twelve","Subject":"MAGNETIC EFFECT OF CURRENT","Page No":"198-208"},{"Chapter":"Thirteen","Subject":"MODERN PHYSICS AND ELECTRONICS","Page No":"209-226"},{"Chapter":"Fourteen","Subject":"PHYSICS TO SAVE LIFE"

In [22]:
len(table_jsons)

43

In [23]:
# Wrap each table's JSON string in a Document.
# NOTE: 'page' is the table's position within table_jsons (from enumerate),
# not a page number of the PDF.
table_data = []
for i, text in enumerate(table_jsons):
    table_meta = {'source': 'table', 'page': i, 'type': "table"}
    table_data.append(Document(page_content=text, metadata=table_meta))

# table_data = [
#     {"page_content": text, "metadata": {"type": "table", "source": "table", "page": i}} 
#     for i, text in enumerate(table_jsons)
# ]

In [24]:
table_data[:5]

[Document(metadata={'source': 'table', 'page': 0, 'type': 'table'}, page_content='[{"Chapter":"One","Subject":"PHYSICAL QUANTITIES AND MEASUREMENT","Page No":"1-25"},{"Chapter":"Two","Subject":"MOTION","Page No":"26-47"},{"Chapter":"Three","Subject":"FORCE","Page No":"48-67"},{"Chapter":"Four","Subject":"WORK, POWER AND ENERGY","Page No":"68-86"},{"Chapter":"Five","Subject":"PRESSURE AND STATES OF MATTER","Page No":"87-99"},{"Chapter":"Six","Subject":"EFFECT OF HEAT ON SUBSTANCES","Page No":"100-113"},{"Chapter":"Seven","Subject":"WAVES AND SOUND","Page No":"114-125"},{"Chapter":"Eight","Subject":"REFLECTION OF LIGHT","Page No":"126-141"},{"Chapter":"Nine","Subject":"REFRACTION OF LIGHT","Page No":"142-159"},{"Chapter":"Ten","Subject":"STATICAL ELECTRICITY","Page No":"160-174"},{"Chapter":"Eleven","Subject":"CURRENT ELECTRICITY","Page No":"175-197"},{"Chapter":"Twelve","Subject":"MAGNETIC EFFECT OF CURRENT","Page No":"198-208"},{"Chapter":"Thirteen","Subject":"MODERN PHYSICS AND ELECTR

In [25]:
# combine table data with pdf text contents
combined_data = cleaned_data + table_data

In [26]:
# new size of data
print("size of cleaned_data:\t",len(cleaned_data))
print("size of table_data:\t",len(table_data))
print("size of combined_data:\t",len(combined_data))


size of cleaned_data:	 246
size of table_data:	 43
size of combined_data:	 289


In [27]:
combined_data[0]

Document(metadata={'source': 'phy_book.pdf', 'page': 0, 'math_expressions': ''}, page_content='')

In [28]:
table_data[-5:]

[Document(metadata={'source': 'table', 'page': 38, 'type': 'table'}, page_content='[{"Experiment:Measure the potential difference between the two terminals of a dry cell. This is":"the electromotive force. Now connect this cell to the bulb and again measure the potential"},{"Experiment:Measure the potential difference between the two terminals of a dry cell. This is":"difference between the two terminals of the cell."}]'),
 Document(metadata={'source': 'table', 'page': 39, 'type': 'table'}, page_content='[{"Material":"Silver","Resistivity (\\u2126 m)":"1.6\\u00d710-8"},{"Material":"Copper","Resistivity (\\u2126 m)":"1.7\\u00d710-8"},{"Material":"Tungsten","Resistivity (\\u2126 m)":"5.5\\u00d710-8"},{"Material":"Nichrome","Resistivity (\\u2126 m)":"100\\u00d710-8"}]'),
 Document(metadata={'source': 'table', 'page': 40, 'type': 'table'}, page_content='[{"Do yourself: If there is an electrical connection in the house you are dwelling in, then prepare":"a list of the electric devices those

In [29]:
combined_data[-5:]

[Document(metadata={'source': 'table', 'page': 38, 'type': 'table'}, page_content='[{"Experiment:Measure the potential difference between the two terminals of a dry cell. This is":"the electromotive force. Now connect this cell to the bulb and again measure the potential"},{"Experiment:Measure the potential difference between the two terminals of a dry cell. This is":"difference between the two terminals of the cell."}]'),
 Document(metadata={'source': 'table', 'page': 39, 'type': 'table'}, page_content='[{"Material":"Silver","Resistivity (\\u2126 m)":"1.6\\u00d710-8"},{"Material":"Copper","Resistivity (\\u2126 m)":"1.7\\u00d710-8"},{"Material":"Tungsten","Resistivity (\\u2126 m)":"5.5\\u00d710-8"},{"Material":"Nichrome","Resistivity (\\u2126 m)":"100\\u00d710-8"}]'),
 Document(metadata={'source': 'table', 'page': 40, 'type': 'table'}, page_content='[{"Do yourself: If there is an electrical connection in the house you are dwelling in, then prepare":"a list of the electric devices those

In [30]:
combined_data[:5]

[Document(metadata={'source': 'phy_book.pdf', 'page': 0, 'math_expressions': ''}, page_content=''),
 Document(metadata={'source': 'phy_book.pdf', 'page': 1, 'math_expressions': ''}, page_content=' Prescribed by National Curriculum and Textbook Board as a Textbook for Classes Nine- Ten from the academic Year 2013 Physics [ Classes-IX-X ] Written by Dr. Shahjahan Tapan Dr. Rana Chowdhury Dr. Ekram Ali Sheikh Dr. Rama Bijoy Sarker Edited by Dr. Ali Asgar Translated by Dr. Rama Bijoy Sarker Subed Chandra Paul Md. Khairul Alam National Curriculum and Textbook Board, Dhaka'),
 Document(metadata={'source': 'phy_book.pdf', 'page': 2, 'math_expressions': '69-70'}, page_content='Published by National Curriculum and Textbook Board 69-70, Motijheel Commercial Area, Dhaka [All rights reserved by the Publisher] Trial Edition First Publication : December 2012 Coordinated by Md. Mukhlesur Rahman Computer Compose Laser Scan Ltd. Cover Sudarshan Bachar Sujaul Abedeen Illustrator Md. Hasanul Kabir Sohag 

## Split and Chunking

In [31]:
# !pip install langchain_text_splitters

In [32]:
# Split and chunk
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Lengths are measured with len(), i.e. in characters: chunk size 500 with an
# overlap of 50 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50,
    length_function = len,
)
# combined_data holds both the page-text Documents and the table Documents.
# NOTE(review): the table Documents contain JSON strings and go through the same
# character-based splitter, so long tables are presumably cut mid-record — confirm
# that this is acceptable for retrieval.
chunked_document = text_splitter.split_documents(combined_data)

In [33]:
chunked_document[0]

Document(metadata={'source': 'phy_book.pdf', 'page': 1, 'math_expressions': ''}, page_content='Prescribed by National Curriculum and Textbook Board as a Textbook for Classes Nine- Ten from the academic Year 2013 Physics [ Classes-IX-X ] Written by Dr. Shahjahan Tapan Dr. Rana Chowdhury Dr. Ekram Ali Sheikh Dr. Rama Bijoy Sarker Edited by Dr. Ali Asgar Translated by Dr. Rama Bijoy Sarker Subed Chandra Paul Md. Khairul Alam National Curriculum and Textbook Board, Dhaka')

In [34]:
print(chunked_document[0].page_content)

Prescribed by National Curriculum and Textbook Board as a Textbook for Classes Nine- Ten from the academic Year 2013 Physics [ Classes-IX-X ] Written by Dr. Shahjahan Tapan Dr. Rana Chowdhury Dr. Ekram Ali Sheikh Dr. Rama Bijoy Sarker Edited by Dr. Ali Asgar Translated by Dr. Rama Bijoy Sarker Subed Chandra Paul Md. Khairul Alam National Curriculum and Textbook Board, Dhaka


## Filter Metadata before storing in ChromaDB

In [None]:
# def filter_complex_metadata(metadata):
#     """
#     Filter out complex metadata values. Only keep str, int, float, or bool values.
#     """
#     filtered_metadata = {}
#     for key, value in metadata.items():
#         if isinstance(value, (str, int, float, bool)):
#             filtered_metadata[key] = value
#         else:
#             filtered_metadata[key] = str(value)  # Convert complex types to string
#     return filtered_metadata

# # Apply metadata filtering
# filtered_metadatas = [filter_complex_metadata(doc.metadata) for doc in chunked_document]


## Vector Embedding

In [35]:
# Embed the chunked document using OllamaEmbeddings (nomic-embed-text)
from langchain_ollama import OllamaEmbeddings
embedding_model = OllamaEmbeddings(model="nomic-embed-text")

In [35]:
# # Embed the chunked document using OllamaEmbeddings (nomic-embed-text)

# from langchain_community.embeddings import OllamaEmbeddings
# embedding = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

# # embedded_texts = embedding.embed_documents(chunked_document[x].page_content for x in chunked_document[len(chunked_document)-1])

  embedding = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)


In [None]:
# # # Embed the chunked document using SentenceTransformer (all-MiniLM-L6-v2)
# from langchain_huggingface import HuggingFaceEmbeddings
# embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# # embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [39]:
# # Example usage
# sentence = "This is a test sentence."
# embedding = model.encode(sentence)

# print("Sentence embedding:", embedding)

## Store in ChromaDB

In [None]:
# Create the Chroma vector store
from langchain_chroma.vectorstores import Chroma

try:
    # Embed every chunk with embedding_model and store the result in the
    # "local-rag" collection under ./db/db_nomic.
    vector_db = Chroma.from_documents(
        documents=chunked_document,
        embedding=embedding_model,
        collection_name="local-rag",
        persist_directory="./db/db_nomic"
    )
    print("Embedded Documents stored in ChromaDB successfully!")
except Exception as e:
    # Format the exception into the message. Concatenating a str with the
    # exception object (`"..." + e`) raises TypeError inside the handler and
    # hides the original error.
    print(f"Error Occured:\n{e}")
