In [1]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from utils import clean
from dotenv import load_dotenv

In [2]:
# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key used by the Gemini client
# created later in the notebook — confirm against the .env contents.
load_dotenv()

True

In [3]:
# Folder holding the source PDFs, relative to the notebook's working directory.
# (Named `documents_dir` rather than `dir` so the builtin `dir()` is not shadowed.)
documents_dir = "../../documents"

# Keep only PDF files: every path collected here is handed to PyPDFLoader, so
# stray entries (hidden files, sub-folders, other formats) must be excluded.
# Sorting makes the load order reproducible, because os.listdir returns
# entries in arbitrary order.
documents_path = sorted(
    f"{documents_dir}/{name}"
    for name in os.listdir(documents_dir)
    if name.lower().endswith(".pdf")
)

In [4]:
# Load each file with PyPDFLoader, pass the text of every returned document
# through the project's `clean` helper, and collect everything in one flat list.
documents = []
for pdf_path in documents_path:
    pages = PyPDFLoader(pdf_path).load()
    for page in pages:
        # Overwrite the text in place; the rest of the document object is kept as loaded.
        page.page_content = clean(page.page_content)
    documents += pages

In [6]:
# Count tokens with the tokenizer of the model that later embeds the chunks.
# Without `model_name` the splitter falls back to its own default
# sentence-transformers model, so the 200-token budget would be measured with
# a different tokenizer than the one the embedder actually applies.
text_splitter = SentenceTransformersTokenTextSplitter(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    tokens_per_chunk=200,
    chunk_overlap=20,
)

# Split every loaded document into overlapping, token-bounded chunks.
chunks = text_splitter.split_documents(documents)

In [8]:
# Sentence-transformers model used to embed the chunks.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
# No explicit embed_documents() call is made here: the model object itself is
# passed to the vector store when the index is built.

In [9]:
# Fail early with a clear message instead of letting index construction fail
# on empty input (e.g. when the documents folder held no readable PDFs).
if not chunks:
    raise ValueError(
        "No chunks to index: check that the documents folder contains readable PDFs."
    )

# Embed every chunk with `embedding_model` and build the FAISS index from them.
vectorstore = FAISS.from_documents(documents=chunks, embedding=embedding_model)

In [10]:
# Gemini chat model with a low sampling temperature.
# NOTE(review): presumably authenticates via an API key from the environment — confirm.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.2,
)

# Retriever over the FAISS index, using its default search settings.
retriever = vectorstore.as_retriever()

# Question-answering chain: retrieve chunks, then let the LLM answer from them.
# The retrieved source documents are returned alongside the answer.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [11]:
# Ask the RAG chain a question and print only the generated answer text.
query = "how use nural network in nlp"
response = rag_pipeline.invoke(query)

print("Answer:", response["result"], sep="\n")

Answer:
Based on the text, here's how neural networks are used in NLP:

*   **Classification Tasks:** Feedforward networks can be applied to NLP classification tasks like sentiment analysis.
*   **Learning Features from Data:** Instead of using hand-built features, neural networks can learn features from data by representing words as embeddings (like word2vec or GloVe).
*   **Input Representation:**  A simple baseline is to apply a pooling function to the embeddings of all the words in the input.
*   **Sequence Modeling:** Recurrent Neural Networks (RNNs) are used for sequence modeling tasks like part-of-speech tagging.
*   **Language Modeling:** Neural networks are used for language modeling.


In [16]:
# Ask who wrote the book; the answer text is under the "result" key of the response.
query = " author this book"
response = rag_pipeline.invoke(query)

print(f"Answer:\n{response['result']}")

Answer:
The book "Inference in an Authorship Problem" was authored by Mosteller, F. and Wallace, D. L. in 1963. They also authored "Inference and Disputed Authorship: The Federalist" in 1964.


In [17]:
# Check whether the indexed material covers LSTMs and RNNs.
query = "this book explain the lstm and rnn"
response = rag_pipeline.invoke(query)

print("Answer:", response["result"], sep="\n")

Answer:
Yes, this book explains both LSTMs (Long Short-Term Memory networks) and RNNs (Recurrent Neural Networks).


In [18]:
# Comparison question: Naive Bayes versus transformers.
query = "who best naive bays or transformer"
response = rag_pipeline.invoke(query)

print(f"Answer:\n{response['result']}")

Answer:
Naive Bayes is easy to implement and fast to train, and can work extremely well (sometimes even better than logistic regression) on very small datasets or short documents. Transformers have multi-head attention layer, feedforward networks and layer normalization steps.


In [19]:
# Ask which chapter covers vector semantics.
query = "In this book what the chapter number for  vector semantic"
response = rag_pipeline.invoke(query)

print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, vector semantics is discussed in chapter 6.2.


In [20]:
# Ask about use cases for transformers.
query = "what the transformer use case"
response = rag_pipeline.invoke(query)

print(f"Answer:\n{response['result']}")

Answer:
Transformers are non-recurrent networks based on multi-head attention, a kind of self-attention. A multi-head attention computation takes an input vector and maps it to an output by adding in vectors from prior tokens, weighted by how relevant they are for the processing of the current word. Attention can be thought of as a way to build contextual representations of a token’s meaning by attending to and integrating information from surrounding tokens, helping the model learn how tokens relate to each other over large spans.


In [21]:
# Ask about use cases for Naive Bayes.
query = "what the naive bays use case"
response = rag_pipeline.invoke(query)

print("Answer:", response["result"], sep="\n")

Answer:
Naive Bayes is a reasonable approach for the following use cases:

*   Very small datasets
*   Short documents
*   When ease of implementation and speed of training are important (no optimization step)
