In [1]:
import os
os.chdir("../")

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
def load_pdf_files(data):
    loader= DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    return documents

In [4]:
extracted_data = load_pdf_files("data")

In [5]:
len(extracted_data)


104

In [6]:
extracted_data

[Document(metadata={'source': 'data\\Bio10th.pdf', 'page': 0}, page_content='Standard   X\nState Council of Educational Research  \nand Training (SCERT), Kerala \n2025\nGovernment of Kerala\nDepartment of General Education\nPrepared by\nPart    1\nBiology'),
 Document(metadata={'source': 'data\\Bio10th.pdf', 'page': 1}, page_content='State Council of Educational  Research and Training (SCERT)\nPoojappura, Thiruvananthapuram 695012, Kerala \nWebsite  :  www.scert.kerala.gov.in \ne-mail  :  scertkerala@gmail.com\nPhone  :  0471 - 2341883, Fax : 0471 - 2341869\nTypesetting  and Layout :  SCERT\nFirst Edition : 2025\nPrinted at : KBPS, Kakkanad, Kochi-30\n© Department of General Education, Government of Kerala\nTHE NATIONAL ANTHEM\nJana-gana-mana adhinayaka, jaya he\nBharatha-bhagya-vidhata\nPunjab-Sindh-Gujarat-Maratha\nDravida-Utkala-Banga\nVindhya-Himachala-Yamuna-Ganga\nUchchala-Jaladhi-taranga\nTava subha name jage,\nTava subha asisa mage,\nGahe tava jaya gatha\nJana-gana-mangala-daya

In [7]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document])-> List[Document]:
    
    minimal_docs: List[Document]=[]
    for doc in docs:
        src= doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src}
            )
        )
    return minimal_docs



In [8]:
minimal_docs = filter_to_minimal_docs(extracted_data)
minimal_docs

[Document(metadata={'source': 'data\\Bio10th.pdf'}, page_content='Standard   X\nState Council of Educational Research  \nand Training (SCERT), Kerala \n2025\nGovernment of Kerala\nDepartment of General Education\nPrepared by\nPart    1\nBiology'),
 Document(metadata={'source': 'data\\Bio10th.pdf'}, page_content='State Council of Educational  Research and Training (SCERT)\nPoojappura, Thiruvananthapuram 695012, Kerala \nWebsite  :  www.scert.kerala.gov.in \ne-mail  :  scertkerala@gmail.com\nPhone  :  0471 - 2341883, Fax : 0471 - 2341869\nTypesetting  and Layout :  SCERT\nFirst Edition : 2025\nPrinted at : KBPS, Kakkanad, Kochi-30\n© Department of General Education, Government of Kerala\nTHE NATIONAL ANTHEM\nJana-gana-mana adhinayaka, jaya he\nBharatha-bhagya-vidhata\nPunjab-Sindh-Gujarat-Maratha\nDravida-Utkala-Banga\nVindhya-Himachala-Yamuna-Ganga\nUchchala-Jaladhi-taranga\nTava subha name jage,\nTava subha asisa mage,\nGahe tava jaya gatha\nJana-gana-mangala-dayaka jaya he\nBharatha-b

In [9]:
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
        length_function=len
    )
    text_chunks = text_splitter.split_documents(minimal_docs)
    return text_chunks

In [10]:
text_chunks = text_split(minimal_docs)


In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embedding():
    model_name = "BAAI/bge-small-en-v1.5"
    embeddings= HuggingFaceEmbeddings(
        model_name= model_name,
    )
    return embeddings

embedding = download_embedding()

  embeddings= HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [12]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='BAAI/bge-small-en-v1.5', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [13]:
vector=embedding.embed_query("Hello man")
vector

[-0.06688275188207626,
 -0.030052978545427322,
 0.0503443107008934,
 -0.06627766042947769,
 0.004762753378599882,
 0.00833857525140047,
 0.09488753974437714,
 0.025223558768630028,
 0.031219061464071274,
 -0.022729771211743355,
 -0.01255638524889946,
 -0.06406532227993011,
 0.018645059317350388,
 0.04669657722115517,
 0.05069791153073311,
 0.021455738693475723,
 0.028652504086494446,
 -0.026032906025648117,
 -0.09156377613544464,
 0.0002061998675344512,
 0.006920272950083017,
 0.054111648350954056,
 -0.06923215091228485,
 -0.05236978828907013,
 0.00401915842667222,
 -0.005878330208361149,
 0.03701005131006241,
 0.009353121742606163,
 -0.016724206507205963,
 -0.03490042686462402,
 -0.024659449234604836,
 0.021894190460443497,
 0.053435202687978745,
 -0.0061293477192521095,
 0.018249379470944405,
 -0.07858460396528244,
 0.026672953739762306,
 -0.03153108060359955,
 -0.054897502064704895,
 -0.017307473346590996,
 0.05085067078471184,
 -0.059643443673849106,
 0.032644279301166534,
 0.02321

In [15]:
from dotenv import load_dotenv
import os
load_dotenv()

True