In [41]:
import os
import requests
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter  
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np

In [22]:
# Run python-dotenv's load_dotenv (imported at the top) before the pipeline starts.
# NOTE(review): presumably this loads a local .env file into the process
# environment; none of the cells visible here read an environment variable,
# so confirm it is still needed.
load_dotenv()

def load_pdf(file_path):
    """Load the PDF files found under a directory.

    Args:
        file_path: Directory path, handed unchanged to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(file_path).load()`` returns. In this
        notebook's recorded output that is a list of ``Document`` objects
        carrying ``source`` and ``page`` metadata.
    """
    return PyPDFDirectoryLoader(file_path).load()

In [None]:
def text_split(extracted_data, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_data: Iterable of documents, passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum size of each chunk, forwarded to the splitter.
            Defaults to 1000, the value this notebook previously hard-coded.
        chunk_overlap: Amount of overlap between consecutive chunks,
            forwarded to the splitter. Defaults to 200, the previously
            hard-coded value.

    Returns:
        The chunks produced by ``split_documents``.
    """
    # Sizes are parameters so other chunkings can be tried without editing
    # this function; calling text_split(docs) behaves exactly as before.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [45]:

# Directory handed to load_pdf; the path is relative, so it resolves against
# the kernel's current working directory.
pdf_path = "./pdf"
# load_pdf returns whatever PyPDFDirectoryLoader(pdf_path).load() produces.
extracted_data = load_pdf(pdf_path)
# NOTE(review): this prints the whole list, page text included; printing
# len(extracted_data) or a short slice would keep the cell output readable.
print(extracted_data)


[Document(metadata={'source': 'pdf\\ipc (1).pdf', 'page': 0}, page_content='1 \n \nTHE INDIAN PENAL CODE \n___________ \nARRANGEMENT OF SECTIONS  \n__________ \nCHAPTER I  \nINTRODUCTION  \nPREAMBLE \nSECTIONS \n1. Title and extent of operation of the Code.  \n2. Punishment of offences committed within India.  \n3. Punishment of offences committed beyond, but which by law may be tried within, India. \n4. Extension of Code to extra-territorial offences. \n5. Certain laws not to be affected by this Act. \nCHAPTER II \nGENERAL EXPLANATIONS \n6. Definitions in the Code to be understood subject to exceptions.  \n7. Sense of expression once explained.  \n8. Gender. \n9. Number.  \n10. “Man”.  “Woman”.  \n11. “Person”. \n12. “Public”.  \n13. [Omitted .]. \n14. “Servant of Government”.  \n15. [Repealed. ]. \n16. [Repealed .] . \n17. “Government”.  \n18. “India”.  \n19. “Judge”.  \n20. “Court of Justice”.  \n21. “Public servant”.  \n22. “Moveable property”.  \n23. “Wrongful gain”. \n“Wrongful l

In [31]:
# Split the loaded documents into chunks with text_split (defined in an earlier cell).
text_chunks = text_split(extracted_data)
# NOTE(review): this prints every chunk in full; consider printing
# len(text_chunks) or a slice instead.
print(text_chunks)

[Document(metadata={'source': 'pdf\\ipc (1).pdf', 'page': 0}, page_content='1 \n \nTHE INDIAN PENAL CODE \n___________ \nARRANGEMENT OF SECTIONS  \n__________ \nCHAPTER I  \nINTRODUCTION  \nPREAMBLE \nSECTIONS \n1. Title and extent of operation of the Code.  \n2. Punishment of offences committed within India.  \n3. Punishment of offences committed beyond, but which by law may be tried within, India. \n4. Extension of Code to extra-territorial offences. \n5. Certain laws not to be affected by this Act. \nCHAPTER II \nGENERAL EXPLANATIONS \n6. Definitions in the Code to be understood subject to exceptions.  \n7. Sense of expression once explained.  \n8. Gender. \n9. Number.  \n10. “Man”.  “Woman”.  \n11. “Person”. \n12. “Public”.  \n13. [Omitted .]. \n14. “Servant of Government”.  \n15. [Repealed. ]. \n16. [Repealed .] . \n17. “Government”.  \n18. “India”.  \n19. “Judge”.  \n20. “Court of Justice”.  \n21. “Public servant”.  \n22. “Moveable property”.  \n23. “Wrongful gain”. \n“Wrongful l

In [35]:
def extract_page_contents(extracted_data):
    """Collect the ``page_content`` attribute of every document.

    Args:
        extracted_data: Iterable of objects exposing ``page_content``.

    Returns:
        A new list holding each item's ``page_content``, in input order.
    """
    return [document.page_content for document in extracted_data]

In [37]:
# Pull the page_content text out of every chunk (the split chunks, not the
# unsplit documents).
page_contents = extract_page_contents(text_chunks)
# Show only the first ten entries.
print(page_contents[0:10])

['1 \n \nTHE INDIAN PENAL CODE \n___________ \nARRANGEMENT OF SECTIONS  \n__________ \nCHAPTER I  \nINTRODUCTION  \nPREAMBLE \nSECTIONS \n1. Title and extent of operation of the Code.  \n2. Punishment of offences committed within India.  \n3. Punishment of offences committed beyond, but which by law may be tried within, India. \n4. Extension of Code to extra-territorial offences. \n5. Certain laws not to be affected by this Act. \nCHAPTER II \nGENERAL EXPLANATIONS \n6. Definitions in the Code to be understood subject to exceptions.  \n7. Sense of expression once explained.  \n8. Gender. \n9. Number.  \n10. “Man”.  “Woman”.  \n11. “Person”. \n12. “Public”.  \n13. [Omitted .]. \n14. “Servant of Government”.  \n15. [Repealed. ]. \n16. [Repealed .] . \n17. “Government”.  \n18. “India”.  \n19. “Judge”.  \n20. “Court of Justice”.  \n21. “Public servant”.  \n22. “Moveable property”.  \n23. “Wrongful gain”. \n“Wrongful loss”. \nGaining wrongfully/ Losing wrongfully. \n24. “Dishonestly”.  \n25.

In [38]:
# Build the HuggingFaceEmbeddings wrapper for the "all-MiniLM-L6-v2" model.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Embed only the first chunk's text; indexing [0] raises IndexError when
# page_contents is empty.
# NOTE(review): embed_query is applied to document text here; embed_documents
# may be the intended call for chunks — confirm.
query_result = embeddings.embed_query(page_contents[0])

In [47]:
# Wrap the embedding in a NumPy array so it prints in NumPy's array format.
print(np.array(query_result))

[-1.69091672e-02  3.99970040e-02 -8.05532411e-02 -5.19633405e-02
 -3.13737057e-02  4.32028174e-02  6.57894313e-02 -4.65594046e-02
 -6.71432540e-02  7.97754303e-02  8.73899236e-02 -4.86644059e-02
  5.54768927e-02 -2.50696689e-02  8.28458071e-02 -1.74930617e-02
  6.95036375e-04  3.00501212e-02 -1.85118150e-02  1.72576611e-03
  6.64575994e-02  4.57484014e-02  4.15925495e-03 -4.25921753e-02
 -4.21908759e-02 -2.09387615e-02  1.82068236e-02 -4.57287505e-02
 -4.17092666e-02 -2.99207546e-04  2.47211289e-02  7.15500787e-02
  2.22695768e-02  1.03008665e-01 -4.11650352e-02  1.96075719e-02
 -4.28252807e-03  1.91219780e-03  6.70652017e-02 -1.02600917e-01
 -1.52939660e-02  1.41686611e-02  1.96191762e-02  6.61972258e-03
  3.22775654e-02 -4.43687066e-02 -5.13053499e-02 -2.14378666e-02
 -1.08533658e-01 -1.76066741e-01 -2.76466329e-02  3.05110626e-02
 -2.63785180e-02  8.25079083e-02 -3.39034647e-02 -1.76097304e-01
  2.07185708e-02 -2.92124506e-02  9.36952978e-03 -1.90902259e-02
  5.67107275e-02  1.02363