In [1]:
import re
from pathlib import Path

import pandas as pd
import pdfplumber
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import OllamaEmbeddings


In [2]:
pdf_path = "phy_book_ch2.pdf"  # Path to your PDF file

# Fail fast with a clear error when the file is missing. A truthiness test on
# a non-empty string literal is always True, and merely printing a message
# would leave `data` undefined for every later cell.
if not Path(pdf_path).is_file():
    raise FileNotFoundError(f"PDF not found: {pdf_path}")

# Load the PDF into a list of documents (`data`) used by the following cells.
loader = PyPDFLoader(file_path=pdf_path)
data = loader.load()

In [3]:
# Sanity check: show the extracted text of the first loaded page.
first_page = data[0]
print(first_page.page_content)

26  Physics 
Chapter Two  
MOTION 
 
 
[The object, that we see around us either are stationery or in motion. What do we 
actually understand by the words ``rest’’ and ``motion’’. We need different quantities regarding motion to express the characteristics of motion of a moving object. In this chapter we will discuss different quantities regarding motion, their dimensions, units, the 
relations among them etc.] 
By the end of this chapter we will be able to -  
1. Explain the rest and motion  
2. Find out the difference among different types of motion.  
3. Explain the scalar and vector quantities  
4. Analyze the relation among the quantities regarding motion  5. Explain the motion of freely falling bodies  
6. Analyze the relations among the quantities regarding motion with the help of graph 
7. Realize the effect of motion in our life    


In [4]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines and tabs count as whitespace, so line breaks are removed.
    Leading/trailing whitespace is reduced to one space, not stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

# Whitespace-normalised copy of every loaded page; each page's metadata
# object is passed through unchanged.
cleaned_data = [
    Document(
        page_content=clean_text(page.page_content),
        metadata=page.metadata,
    )
    for page in data
]


In [5]:
# Regular expressions used to pull candidate maths fragments out of page text.
patterns = [
    r"∴.*",                # a '∴' (therefore) and everything after it, up to the next line break
    r"=[^=]*",             # an '=' plus the text that follows it, up to the next '='
    r"\b[^\s]+/[^\s]+\b",  # slash-joined tokens such as 'distance/time'
]

In [6]:
def extract_math_expressions(text):
    """Return every match of the module-level ``patterns`` found in ``text``.

    Matches are grouped by pattern, in the order the patterns are listed,
    and within a pattern in the order they occur in ``text``.

    Args:
        text: The string to search.

    Returns:
        A list of matched substrings (empty when nothing matches).
    """
    return [
        match
        for pattern in patterns
        for match in re.findall(pattern, text)
    ]

# Attach the maths fragments found on each page to that page's metadata.
#
# Extraction reads the ORIGINAL page text line by line, not the cleaned text.
# clean_text has already turned every newline into a space, so on the cleaned
# text '.' never meets a line break and a pattern such as '∴.*' would capture
# everything from the first '∴' to the end of the page.
# `data` and `cleaned_data` are parallel lists (one cleaned page per raw page).
for raw_doc, doc in zip(data, cleaned_data):
    math_exprs = []
    for line in raw_doc.page_content.splitlines():
        # Normalise whitespace inside the line before matching.
        math_exprs.extend(extract_math_expressions(clean_text(line)))
    # Stored as a single "; "-separated string (empty when nothing matched).
    doc.metadata['math_expressions'] = "; ".join(math_exprs)

In [7]:
# Collect every table in the PDF as a list of rows (each row a list of cells).
tables = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # extract_table() returns only the largest table on a page, silently
        # dropping any others; extract_tables() returns all of them.
        # Empty results are skipped.
        tables.extend(table for table in page.extract_tables() if table)

In [8]:
# Turn each extracted table into a DataFrame (first row = column headers),
# then render it as plain text without the index column.
table_dfs = []
for rows in tables:
    header = rows[0]
    body = rows[1:]
    table_dfs.append(pd.DataFrame(body, columns=header))

table_texts = []
for frame in table_dfs:
    table_texts.append(frame.to_string(index=False))

In [9]:
# Wrap each rendered table in a Document tagged with source 'table'.
table_documents = []
for table_text in table_texts:
    table_documents.append(
        Document(page_content=table_text, metadata={'source': 'table'})
    )

In [10]:
# Page documents first, followed by the table documents.
combined_data = [*cleaned_data, *table_documents]

In [11]:
# Split every document into chunks of at most 1000 characters, with a
# 100-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(combined_data)

In [12]:
def filter_complex_metadata(metadata):
    """Return a copy of ``metadata`` whose values are all simple scalars.

    Values that are already ``str``, ``int``, ``float`` or ``bool`` are kept
    as they are; every other value (lists, dicts, ``None``, ...) is replaced
    by its ``str()`` representation. The input mapping is not modified.

    Args:
        metadata: Mapping of metadata keys to arbitrary values.

    Returns:
        A new dict with the same keys and scalar-only values.
    """
    simple_types = (str, int, float, bool)
    return {
        key: value if isinstance(value, simple_types) else str(value)
        for key, value in metadata.items()
    }

In [13]:
# Scalar-only metadata for every chunk, in chunk order.
filtered_metadatas = []
for chunk in chunks:
    filtered_metadatas.append(filter_complex_metadata(chunk.metadata))

In [14]:
# Text of every chunk, in chunk order.
page_contents = [chunk.page_content for chunk in chunks]
# Pass each chunk's metadata through filter_complex_metadata so the documents
# rebuilt from `metadatas` carry scalar-only values. Taking chunk.metadata
# directly would bypass that filter entirely.
metadatas = [filter_complex_metadata(chunk.metadata) for chunk in chunks]

In [15]:
# Sentence-transformers model used to embed the chunks.
# HuggingFaceEmbeddings comes from the notebook's import cell
# (langchain_community.embeddings), not the deprecated langchain.embeddings path.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange


In [16]:
# Embed every chunk's text with the configured embedding model.
# NOTE(review): `embedded_texts` is not read by any cell visible here — confirm
# whether it is still needed.
embedded_texts = embedding.embed_documents(page_contents)

# Rebuild one Document per chunk from its text and its metadata.
chunked_documents = []
for content, meta in zip(page_contents, metadatas):
    chunked_documents.append(Document(page_content=content, metadata=meta))

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [18]:
# Build the "local-rag" Chroma collection from the chunk documents, using
# `embedding` to embed them and "./db_HF" as the persistence directory.
vector_db = Chroma.from_documents(
    collection_name="local-rag",
    persist_directory="./db_HF",  # Path to save the ChromaDB data
    embedding=embedding,
    documents=chunked_documents,
)

print("Embedded Documents stored in ChromaDB successfully!")

Embedded Documents stored in ChromaDB successfully!
