In [1]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from utils import clean
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the API key that ChatGoogleGenerativeAI
# needs further down — confirm the expected variable name.
load_dotenv()

True

In [3]:
# Folder holding the source PDFs, relative to the notebook's working directory.
# (Named DOCUMENTS_DIR rather than `dir` so the builtin `dir()` is not shadowed.)
DOCUMENTS_DIR = "../../documents"

# Keep only PDF files, because every path is later handed to PyPDFLoader and
# stray entries such as hidden OS files would make it fail. Sort the result,
# because os.listdir returns entries in arbitrary order and the load order
# should be reproducible across runs and machines.
documents_path = sorted(
    os.path.join(DOCUMENTS_DIR, file_name)
    for file_name in os.listdir(DOCUMENTS_DIR)
    if file_name.lower().endswith(".pdf")
)

# Fail here with a clear message instead of much later with an obscure error
# when the index is built from an empty corpus.
if not documents_path:
    raise FileNotFoundError(f"No PDF files found in {DOCUMENTS_DIR!r}")

In [4]:
# Load each PDF listed in `documents_path`, run the text of every loaded
# document through the project's `clean` helper (imported from utils, not
# shown here), and gather everything into one flat list.
documents = []
for pdf_path in documents_path:
    pages = PyPDFLoader(pdf_path).load()

    # Overwrite the text in place so metadata attached by the loader is kept.
    for page in pages:
        page.page_content = clean(page.page_content)

    documents += pages

In [None]:
# Chunking parameters, measured in tokens of the splitter's tokenizer.
# NOTE(review): no model_name is passed, so the splitter uses its default
# tokenizer model, which may differ from the embedding model used below —
# confirm this is intended.
TOKENS_PER_CHUNK = 100
CHUNK_OVERLAP = 10

text_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=TOKENS_PER_CHUNK,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split the loaded documents into token-bounded chunks for retrieval.
chunks = text_splitter.split_documents(documents)

In [7]:
# Sentence-embedding model; the same object is later handed to the FAISS store.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed the text of every chunk, in chunk order.
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embedding_model.embed_documents(chunk_texts)

In [8]:
# Build the FAISS index from the vectors already held in `embeddings` rather
# than calling FAISS.from_documents, which would run the embedding model over
# every chunk a second time.
if len(embeddings) != len(chunks):
    # Guards against stale kernel state, e.g. `chunks` re-created without
    # re-running the embedding cell; zip() below would otherwise truncate silently.
    raise ValueError(
        f"Expected one embedding per chunk, got {len(embeddings)} embeddings "
        f"for {len(chunks)} chunks; re-run the embedding cell."
    )

vectorstore = FAISS.from_embeddings(
    # (text, vector) pairs, in chunk order.
    text_embeddings=[
        (chunk.page_content, vector) for chunk, vector in zip(chunks, embeddings)
    ],
    # Still required: the store uses this model to embed incoming queries.
    embedding=embedding_model,
    # Carry over each chunk's metadata so retrieved documents keep it.
    metadatas=[chunk.metadata for chunk in chunks],
)

In [9]:
# Expose the FAISS store as a retriever with its default search settings.
retriever = vectorstore.as_retriever()

# Gemini chat model with a low sampling temperature.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.2,
)

# Retrieval-augmented QA chain; the retrieved source documents are returned
# alongside the generated answer.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [10]:
# Ask the RAG chain how neural networks are used in NLP and print only the
# generated answer (the "result" entry of the returned mapping).
query = "how use nural network in nlp"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Neural networks can be applied to NLP tasks like sentiment analysis using feedforward networks with hand-built features or using more complex architectures like Recurrent Neural Networks (RNNs) with word embeddings as inputs.


In [11]:
# Ask the RAG chain who wrote the book and print only the generated answer
# (the "result" entry of the returned mapping).
query = "who author this book"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, the book "Selected papers of J. R. Firth" was authored by J. R. Firth.


In [12]:
# Ask the RAG chain whether the book covers LSTMs and RNNs and print only the
# generated answer (the "result" entry of the returned mapping).
query = "this book explain the lstm and rnn"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Yes, the book explains both LSTM (Long Short-Term Memory) networks and RNNs (Recurrent Neural Networks). It introduces the RNN, discusses advanced components like stacking multiple layers and using the LSTM version, and explains how the RNN can be applied to various tasks. The book also mentions that LSTMs, rather than RNNs, have become the standard unit for any modern system that makes use of recurrent networks.


In [13]:
# Ask the RAG chain to compare Naive Bayes with Transformers and print only
# the generated answer (the "result" entry of the returned mapping).
query = "who best naive bays or transformer"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, here's a comparison of Naive Bayes and Transformers:

*   **Naive Bayes:**

    *   Can work extremely well on very small datasets or short documents.
    *   Easy to implement and very fast to train (no optimization step).
*   **Transformers:**

    *   Based on multi-head attention.
    *   The text describes a method called LoRA (Low-Rank Adaptation) to fine-tune Transformers more efficiently.

The text also mentions that discriminative systems like logistic regression are often more accurate and more commonly used than generative classifiers like Naive Bayes. However, it also notes that Naive Bayes still has a role.

The provided text does not directly compare Naive Bayes to Transformers.


In [14]:
# Ask the RAG chain which chapter covers vector semantics and print only the
# generated answer (the "result" entry of the returned mapping).
query = "In this book what the chapter number for  vector semantic"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, the chapter number for vector semantics is not explicitly mentioned. However, it is discussed in section 6.7.


In [15]:
# Ask the RAG chain about Transformer use cases and print only the generated
# answer (the "result" entry of the returned mapping).
query = "what the transformer use case"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, transformers are non-recurrent networks that rely on multi-head attention. They consist of transformer blocks, each containing a multi-head attention layer, feedforward networks, and layer normalization steps. The architecture processes input tokens, encodes them, passes them through stacked transformer blocks, and then uses a language model head to predict the next token.


In [16]:
# Ask the RAG chain about Naive Bayes use cases and print only the generated
# answer (the "result" entry of the returned mapping).
query = "what the naive bays use case"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Naive Bayes classifiers are useful in the following cases:

*   When you have very small datasets.
*   When you have short documents.
*   When you need a classifier that is easy to implement and fast to train.
