In [1]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from utils import clean
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Google API key used by the Gemini
# client created later in the notebook -- confirm against the .env contents.
load_dotenv()

True

In [3]:
# Folder holding the source PDFs, relative to the notebook's working directory.
# Named DOCUMENTS_DIR rather than `dir` so the builtin dir() is not shadowed
# for the rest of the kernel session.
DOCUMENTS_DIR = "../../documents"

# os.listdir also returns sub-folders, hidden files and non-PDF files, which
# the PDF loader cannot parse, so keep only *.pdf entries. Sorting makes the
# load order (and therefore the chunk order in the index) the same on every run.
documents_path = sorted(
    os.path.join(DOCUMENTS_DIR, name)
    for name in os.listdir(DOCUMENTS_DIR)
    if name.lower().endswith(".pdf")
)

# Fail here with a clear message instead of later, when an empty corpus
# reaches the vector store.
if not documents_path:
    raise FileNotFoundError(f"No PDF files found in {DOCUMENTS_DIR!r}")

In [4]:
# Collect every page of every PDF into one flat list, normalising the text of
# each page with the project's `clean` helper before it is stored.
documents = []
for pdf_path in documents_path:
    pages = PyPDFLoader(pdf_path).load()

    # Rewrite the page text in place so metadata on each page object is kept.
    for page in pages:
        page.page_content = clean(page.page_content)

    documents += pages

In [5]:
# Split the loaded pages into small overlapping pieces for retrieval:
# at most 300 characters per chunk, with 30 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=30,
)
chunks = text_splitter.split_documents(documents)

In [6]:
# Sentence-transformers model used to embed both the chunks and, later, the queries.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Embed the text of every chunk, in chunk order.
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embedding_model.embed_documents(chunk_texts)

In [7]:
# Build the FAISS index from the vectors already computed in `embeddings`
# instead of calling FAISS.from_documents, which would run the embedding model
# over every chunk a second time. `embeddings` is in the same order as
# `chunks`, so each text is paired with its own vector and metadata.
# `embedding_model` is still passed so the store can embed incoming queries.
vectorstore = FAISS.from_embeddings(
    text_embeddings=list(
        zip((chunk.page_content for chunk in chunks), embeddings)
    ),
    embedding=embedding_model,
    metadatas=[chunk.metadata for chunk in chunks],
)

In [8]:
# Chat model that writes the final answer (low temperature setting of 0.2).
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.2)

# Retriever over the FAISS index, using the vector store's default settings.
retriever = vectorstore.as_retriever()

# Question-answering chain: retrieve chunks, then let the LLM answer from them.
# The retrieved source documents are returned alongside the answer.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [9]:
# Ask how neural networks are applied to NLP (query text kept verbatim).
query = "how use nural network in nlp"
response = rag_pipeline.invoke(query)

# Show only the generated answer text from the response.
print("Answer:", response["result"], sep="\n")

Answer:
Feedforward networks can be applied to NLP tasks such as sentiment analysis and other classification tasks. You can also train word embeddings as part of an NLP task like neural language modeling. Another approach is to add a neural net classifier after the top layer of a pre-trained model and train it on labeled data to perform a downstream task like named entity tagging or natural language inference.


In [10]:
# Ask who wrote the book (query text kept verbatim).
# NOTE(review): the recorded answer names Mosteller and Wallace from an
# authorship-attribution passage, which looks like a retrieval miss rather
# than the book's actual authors -- worth investigating.
query = "who author this book"
response = rag_pipeline.invoke(query)

# Show only the generated answer text from the response.
print("Answer:", response["result"], sep="\n")

Answer:
According to the provided text, Mosteller and Wallace (1963) are the authors of a work on authorship attribution using Bayesian methods, focusing on the Federalist papers.


In [11]:
# Check whether the indexed book covers LSTMs and RNNs (query text kept verbatim).
query = "this book explain the lstm and rnn"
response = rag_pipeline.invoke(query)

# Show only the generated answer text from the response.
print("Answer:", response["result"], sep="\n")

Answer:
Yes, the book explains both LSTMs and RNNs. It describes simple recurrent networks (SRNs), which it refers to as RNNs, and also discusses Long Short-Term Memory networks (LSTMs).


In [12]:
# Ask for a comparison of Naive Bayes and transformers (query text kept verbatim).
query = "who best naive bays or transformer"
response = rag_pipeline.invoke(query)

# Show only the generated answer text from the response.
print("Answer:", response["result"], sep="\n")

Answer:
I am sorry, I cannot answer this question. The provided context does not have information comparing naive bayes to transformers.


In [13]:
# Ask which chapter covers vector semantics (query text kept verbatim).
query = "In this book what the chapter number for  vector semantic"
response = rag_pipeline.invoke(query)

# Show only the generated answer text from the response.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, vector semantics is discussed in chapter 6.


In [14]:
# Ask what transformers are used for (query text kept verbatim).
query = "what the transformer use case"
response = rag_pipeline.invoke(query)

# Show only the generated answer text from the response.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the text, transformers are used for:

*   Encoding input tokens.
*   Predicting the next token in a sequence (language modeling).
*   Fine-tuning for specific tasks, potentially using a pre-trained transformer network.


In [15]:
# Ask what Naive Bayes is used for (query text kept verbatim).
query = "what the naive bays use case"
response = rag_pipeline.invoke(query)

# Show only the generated answer text from the response.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, the Naive Bayes model can be used for classification tasks. It works well on small datasets or short documents and can incorporate various features like dictionaries, URLs, email addresses, and phrases.
