In [1]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from utils import clean
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Google API key that
# ChatGoogleGenerativeAI needs further down - confirm the expected variable name.
load_dotenv()

True

In [3]:
# Folder holding the source PDFs, relative to the notebook's working directory.
# Named DOCUMENTS_DIR (not `dir`) so the builtin `dir()` is not shadowed for the
# rest of the kernel session.
DOCUMENTS_DIR = "../../documents"

# Keep only PDF files: os.listdir also returns sub-directories and stray files
# (e.g. .DS_Store) that PyPDFLoader cannot parse. Sorting makes the load order,
# and therefore the chunk order in the index, reproducible, since os.listdir
# returns entries in arbitrary order.
documents_path = sorted(
    f"{DOCUMENTS_DIR}/{name}"
    for name in os.listdir(DOCUMENTS_DIR)
    if name.lower().endswith(".pdf")
)

# Fail here with a clear message instead of much later when the index is built.
if not documents_path:
    raise FileNotFoundError(f"No PDF files found in {DOCUMENTS_DIR!r}")

In [4]:
# Load every PDF page by page, normalise the text of each page with `clean`,
# and gather all pages into one flat list.
documents = []
for pdf_path in documents_path:
    pages = PyPDFLoader(pdf_path).load()
    for page in pages:
        # Cleaned in place so each page keeps the metadata set by the loader.
        page.page_content = clean(page.page_content)
    documents += pages

In [6]:
# Count tokens with the tokenizer of the model that embeds the chunks
# (all-MiniLM-L6-v2) rather than the splitter's default (all-mpnet-base-v2),
# and cap chunks at that model's 256-token input limit. With the previous
# 384-token chunks the embedding model silently truncated each chunk, so
# roughly the last third of every chunk never influenced its vector.
text_splitter = SentenceTransformersTokenTextSplitter(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    tokens_per_chunk=256,
    chunk_overlap=40,
)
chunks = text_splitter.split_documents(documents)

In [7]:
# Sentence-embedding model used to turn chunk text into vectors.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# One vector per chunk, in the same order as `chunks`.
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embedding_model.embed_documents(chunk_texts)

In [8]:
# Build the index from the vectors already held in `embeddings` instead of
# calling FAISS.from_documents, which would embed every chunk a second time.
# `embedding_model` is still passed so the store can embed queries.
if len(embeddings) != len(chunks):
    # Guards against stale kernel state (chunks re-split without re-embedding).
    raise ValueError(
        f"Got {len(embeddings)} embeddings for {len(chunks)} chunks; "
        "re-run the embedding cell before building the index."
    )

vectorstore = FAISS.from_embeddings(
    text_embeddings=list(
        zip((chunk.page_content for chunk in chunks), embeddings)
    ),
    embedding=embedding_model,
    metadatas=[chunk.metadata for chunk in chunks],
)

In [9]:
# Generation side: Gemini chat model at a low sampling temperature.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.2)

# Retrieval side: default retriever over the FAISS index.
retriever = vectorstore.as_retriever()

# Combine both into a question-answering chain that also returns the
# retrieved source documents alongside the answer.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [10]:
# Ask how neural networks are applied in NLP and print the generated answer.
query = "how use nural network in nlp"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Based on the context you provided, here's how neural networks are used in NLP:

*   **Classification Tasks:** Neural networks can be applied to NLP classification tasks like sentiment analysis. Instead of using hand-built features, they learn features from the data by representing words as embeddings (like word2vec or GloVe).
*   **Sequence Modeling:** Neural networks are used for sequence modeling tasks like part-of-speech tagging.
*   **Language Modeling:** Neural networks can be used for language modeling.
*   **Recurrent Neural Networks (RNNs):** RNNs, including Elman networks and LSTMs, are particularly effective for language-related tasks.
*   **Feature Extraction:** Deep learning enables neural networks to learn features from data, using word embeddings as input.
*   **Combination with Embeddings:** Word embeddings (like word2vec or GloVe) are often used as input to neural networks for NLP tasks.
*   **Flexible Architectures:** By combining feedforward networks with vect

In [11]:
# Ask who wrote the book and print the generated answer.
query = "who author this book"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, the book "Machine Learning: A Probabilistic Perspective" was written by Murphy, K. P.


In [12]:
# Ask whether the book covers LSTMs and RNNs and print the generated answer.
query = "this book explain the lstm and rnn"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Yes, the book explains both LSTM (Long Short-Term Memory) networks and RNNs (Recurrent Neural Networks).


In [13]:
# Ask for a comparison of Naive Bayes and Transformers and print the answer.
query = "who best naive bays or transformer"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Naive Bayes and Transformers are suited for different tasks. Naive Bayes is a classification algorithm, while Transformers are a type of neural network architecture often used for sequence-to-sequence tasks. Naive Bayes is easy to implement and fast to train, and can work well on very small datasets or short documents. Transformers are very modular and use multi-head attention, where each head might attend to the context for different purposes.


In [14]:
# Ask which chapter covers vector semantics and print the generated answer.
query = "In this book what the chapter number for  vector semantic"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, vector semantics is discussed in chapter 6.2.


In [15]:
# Ask what Transformers are used for and print the generated answer.
query = "what the transformer use case"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Based on the text, transformers are used for:

*   Building language models.
*   Mapping an input vector to an output vector by adding vectors from prior tokens, weighted by how relevant they are for the processing of the current word.


In [16]:
# Ask what Naive Bayes is used for and print the generated answer.
query = "what the naive bays use case"
response = rag_pipeline.invoke(query)
print("Answer:", response["result"], sep="\n")

Answer:
Naive Bayes is easy to implement and very fast to train, so it’s a reasonable approach to use in some situations. Naive Bayes can work extremely well (sometimes even better than logistic regression) on very small datasets or short documents.
