In [1]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from utils import clean
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the Google API key used by
# ChatGoogleGenerativeAI further down — confirm against the .env file.
load_dotenv()

True

In [3]:
# Directory holding the source PDFs, relative to the notebook's working directory.
# Named `documents_dir` (not `dir`) so the builtin `dir()` is not shadowed for
# the rest of the kernel session.
documents_dir = "../../documents"

# Keep only PDF files: every path is handed to PyPDFLoader later, which cannot
# parse stray entries such as .DS_Store or checkpoint folders.
# Sort the result because os.listdir returns entries in arbitrary order, and the
# load order should be the same on every run.
documents_path = sorted(
    f"{documents_dir}/{file_name}"
    for file_name in os.listdir(documents_dir)
    if file_name.lower().endswith(".pdf")
)

# Fail here with a clear message rather than later inside the vector store build.
if not documents_path:
    raise FileNotFoundError(f"No PDF files found in {documents_dir!r}")

In [4]:
# Load every PDF listed in `documents_path` and collect the resulting
# documents into one flat list.
documents = []
for pdf_path in documents_path:
    pdf_docs = PyPDFLoader(pdf_path).load()

    # Run each document's text through the project's `clean` helper (defined in
    # utils) before it is chunked; the document objects are updated in place.
    for pdf_doc in pdf_docs:
        pdf_doc.page_content = clean(pdf_doc.page_content)

    documents += pdf_docs

In [5]:
# Splitter settings: maximum chunk size and the overlap kept between
# neighbouring chunks (measured by the splitter's length function).
CHUNK_SIZE = 200
CHUNK_OVERLAP = 20

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split the cleaned documents into retrieval-sized chunks.
chunks = text_splitter.split_documents(documents)

In [6]:
# Sentence-transformers model used to embed both the chunks and, later, the queries.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Embed the text of every chunk; the result has one vector per chunk,
# in the same order as `chunks`.
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embedding_model.embed_documents(chunk_texts)

In [7]:
# Build the FAISS index from the vectors already held in `embeddings` instead of
# calling FAISS.from_documents, which would run the embedding model over every
# chunk a second time. `embedding_model` is still passed so the store can embed
# incoming queries.
#
# `embeddings` must have been computed from the current `chunks`, in the same
# order; the length check catches the common stale-kernel case where the
# chunking cell was re-run without re-running the embedding cell.
if len(embeddings) != len(chunks):
    raise ValueError(
        f"embeddings ({len(embeddings)}) and chunks ({len(chunks)}) are out of sync; "
        "re-run the embedding cell"
    )

vectorstore = FAISS.from_embeddings(
    # Materialised as a list: the pairs may be iterated more than once.
    text_embeddings=[
        (chunk.page_content, vector) for chunk, vector in zip(chunks, embeddings)
    ],
    embedding=embedding_model,
    metadatas=[chunk.metadata for chunk in chunks],
)

In [8]:
# Retriever over the FAISS index, using the vector store's default search settings.
retriever = vectorstore.as_retriever()

# Gemini chat model that writes the final answer from the retrieved chunks.
GEMINI_MODEL_NAME = "gemini-2.0-flash"
llm = ChatGoogleGenerativeAI(model=GEMINI_MODEL_NAME, temperature=0.2)

# Retrieval-augmented QA chain; it is asked to return the retrieved source
# documents alongside the generated answer.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [9]:
# Ask the RAG chain how neural networks are used in NLP and show the answer text.
query = "how use nural network in nlp"
response = rag_pipeline.invoke(query)

# One print call; the newline separator yields the same two output lines.
print("Answer:", response["result"], sep="\n")

Answer:
Feedforward networks can be applied to NLP tasks. Instead of using hand-built human-engineered features as the input to a classifier, neural networks can be used. Neural networks are especially useful when the task the network is designed for (like sentiment classification, translation, or parsing) places strong constraints on what makes a good representation for words.


In [16]:
# Ask the RAG chain who wrote the book and show the answer text.
query = "who author this book"
response = rag_pipeline.invoke(query)

# One print call; the newline separator yields the same two output lines.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, the book you're referring to is likely related to the work of Mosteller and Wallace (1963) on authorship attribution using Bayesian techniques. However, the text doesn't explicitly state the title of the book or who the author is.


In [11]:
# Ask the RAG chain whether the book covers LSTMs and RNNs and show the answer text.
query = "this book explain the lstm and rnn"
response = rag_pipeline.invoke(query)

# One print call; the newline separator yields the same two output lines.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the context you provided, yes, the book explains both LSTMs and RNNs. It includes information on:

*   The structure of an RNN.
*   How RNNs deal with the sequential nature of language.
*   The application of RNNs to various tasks.
*   The LSTM version of RNNs.
*   The challenges of training RNNs, especially when dealing with information distant from the current point of processing.


In [12]:
# Ask the RAG chain to compare Naive Bayes with transformers and show the answer text.
query = "who best naive bays or transformer"
response = rag_pipeline.invoke(query)

# One print call; the newline separator yields the same two output lines.
print("Answer:", response["result"], sep="\n")

Answer:
The context discusses transformers, but does not mention Naive Bayes. Therefore, I cannot compare the two.


In [13]:
# Ask the RAG chain which chapter covers vector semantics and show the answer text.
query = "In this book what the chapter number for  vector semantic"
response = rag_pipeline.invoke(query)

# One print call; the newline separator yields the same two output lines.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, the chapter number for vector semantics is Chapter 6.2.


In [14]:
# Ask the RAG chain about transformer use cases and show the answer text.
query = "what the transformer use case"
response = rag_pipeline.invoke(query)

# One print call; the newline separator yields the same two output lines.
print("Answer:", response["result"], sep="\n")

Answer:
The provided text discusses various aspects of the transformer architecture, including terminology, prenorm, and sinusoidal position embeddings. It also mentions that the original transformer had an encoder-decoder architecture. However, the text does not explicitly mention the specific use cases of transformers.


In [15]:
# Ask the RAG chain about Naive Bayes use cases and show the answer text.
query = "what the naive bays use case"
response = rag_pipeline.invoke(query)

# One print call; the newline separator yields the same two output lines.
print("Answer:", response["result"], sep="\n")

Answer:
Naive Bayes is a probabilistic classifier that makes a simplifying assumption about the independence of features. It is often used in text classification, where the order of words is ignored and the frequency of each word is used as a feature.
