In [1]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from utils import clean
from dotenv import load_dotenv

In [2]:
# Load variables from a .env file into the process environment; the returned
# value is displayed as this cell's output.
# NOTE(review): presumably this provides the API key that ChatGoogleGenerativeAI
# reads from the environment — confirm against the .env file.
load_dotenv()

True

In [3]:
# Folder holding the source PDFs, relative to the notebook's working directory.
# (Named constant instead of `dir`, which shadowed the Python builtin.)
DOCUMENTS_DIR = "../../documents"

# os.listdir returns entries in arbitrary order and includes everything in the
# folder, so keep only *.pdf files (the loader used below is PDF-only) and sort
# them to make the run reproducible.
documents_path = sorted(
    os.path.join(DOCUMENTS_DIR, name)
    for name in os.listdir(DOCUMENTS_DIR)
    if name.lower().endswith(".pdf")
)

# Fail here with a clear message instead of later, when an empty corpus
# reaches the splitter / vector store.
if not documents_path:
    raise FileNotFoundError(f"No PDF files found in {DOCUMENTS_DIR!r}")

In [4]:
# Load each file listed in `documents_path`, run the text of every returned
# document through the project's `clean` helper, and collect them all in one
# flat list.
documents = []
for pdf_path in documents_path:
    loaded = PyPDFLoader(pdf_path).load()

    # Overwrite the text in place so the rest of each document object is kept.
    for item in loaded:
        item.page_content = clean(item.page_content)

    documents += loaded

In [5]:
# Split the loaded documents into smaller overlapping chunks for retrieval
# (chunk_size=400, chunk_overlap=40).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=40,
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Embedding model shared by the rest of the notebook.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Embed the text of every chunk; `embeddings` follows the order of `chunks`.
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embedding_model.embed_documents(chunk_texts)

In [7]:
# Build the FAISS index from the vectors already held in `embeddings` instead
# of FAISS.from_documents, which would run the embedding model over every chunk
# a second time. `embedding_model` is still passed so the store can embed
# incoming queries.

# Guard against stale kernel state: the vectors must match the current chunks
# one-to-one, otherwise zip() below would silently drop or mispair entries.
assert len(embeddings) == len(chunks), (
    "embeddings and chunks are out of sync; re-run the embedding cell"
)

vectorstore = FAISS.from_embeddings(
    text_embeddings=[
        (chunk.page_content, vector) for chunk, vector in zip(chunks, embeddings)
    ],
    embedding=embedding_model,
    metadatas=[chunk.metadata for chunk in chunks],
)

In [8]:
# Retriever over the FAISS index and the Gemini chat model that writes answers.
retriever = vectorstore.as_retriever()
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.2,
)

# Retrieval-augmented QA chain; the response also carries the source documents.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [9]:
# Ask the RAG chain a question about neural networks in NLP.
query = "how use nural network in nlp"
response = rag_pipeline.invoke(query)

# One print call, same two-line output: the header, then the answer text.
print("Answer:", response["result"], sep="\n")

Answer:
Neural networks in NLP can be used to learn features from data by representing words as embeddings, which is especially useful when the task places strong constraints on what makes a good representation for words. They can also be used to train language models.


In [16]:
# Ask the RAG chain who wrote the book.
query = "who author this book"
response = rag_pipeline.invoke(query)

# One print call, same two-line output: the header, then the answer text.
print("Answer:", response["result"], sep="\n")

Answer:
According to the provided text, Frederick Mosteller and David L. Wallace are the authors of "Inference and Disputed Authorship: The Federalist." The book was published by Springer-Verlag in 1984.


In [11]:
# Ask the RAG chain whether the book covers LSTMs and RNNs.
query = "this book explain the lstm and rnn"
response = rag_pipeline.invoke(query)

# One print call, same two-line output: the header, then the answer text.
print("Answer:", response["result"], sep="\n")

Answer:
Yes, the book explains both LSTMs (Long Short-Term Memory networks) and RNNs (Recurrent Neural Networks).


In [12]:
# Ask the RAG chain to compare Naive Bayes with transformers.
query = "who best naive bays or transformer"
response = rag_pipeline.invoke(query)

# One print call, same two-line output: the header, then the answer text.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, it seems that transformers are generally more accurate and commonly used than Naive Bayes classifiers, especially in discriminative systems. However, Naive Bayes classifiers still have a role to play.


In [13]:
# Ask the RAG chain which chapter covers vector semantics.
query = "In this book what the chapter number for  vector semantic"
response = rag_pipeline.invoke(query)

# One print call, same two-line output: the header, then the answer text.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, vector semantics is discussed in chapter 6.2.


In [14]:
# Ask the RAG chain what transformers are used for.
query = "what the transformer use case"
response = rag_pipeline.invoke(query)

# One print call, same two-line output: the header, then the answer text.
print("Answer:", response["result"], sep="\n")

Answer:
The transformer is a neural network structure that uses self-attention or multi-head attention. It helps the model learn how to build contextual representations of a token’s meaning by attending to and integrating information from surrounding tokens, conditioning on the prior context. Language models can be built out of stacks of transformer blocks.


In [15]:
# Ask the RAG chain what Naive Bayes is used for.
query = "what the naive bays use case"
response = rag_pipeline.invoke(query)

# One print call, same two-line output: the header, then the answer text.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, the Naive Bayes model can be used for classification tasks, particularly spam filtering.
