In [16]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from utils import clean
from dotenv import load_dotenv

In [17]:
# Load environment configuration via python-dotenv before any client is built.
# NOTE(review): presumably this supplies the Google API key that the Gemini
# chat model created later in the notebook needs — confirm against the .env file.
# Left as the last expression so the cell displays the call's return value.
load_dotenv()

True

In [18]:
# Folder that holds the source PDFs, relative to the notebook's working directory.
# Named DOCUMENTS_DIR rather than `dir` so the built-in dir() is not shadowed
# for the rest of the kernel session.
DOCUMENTS_DIR = "../../documents"

# os.listdir returns every entry in the folder in arbitrary order, so:
#   * keep only *.pdf files, because each path is handed to PyPDFLoader later
#     and stray entries (hidden files, sub-folders, other formats) would break it;
#   * sort the result so document order is reproducible across runs and machines.
documents_path = sorted(
    os.path.join(DOCUMENTS_DIR, entry)
    for entry in os.listdir(DOCUMENTS_DIR)
    if entry.lower().endswith(".pdf")
)

In [19]:
# Load each file with PyPDFLoader, pass the text of every loaded document
# through clean(), and pool everything into one flat list.
documents = []
for pdf_path in documents_path:
    pages = PyPDFLoader(pdf_path).load()

    # Overwrite the raw text in place so downstream steps only see cleaned text.
    for page in pages:
        page.page_content = clean(page.page_content)

    documents += pages

In [20]:
# Split the cleaned documents into chunks for embedding. Consecutive chunks
# overlap (chunk_overlap) so text cut at a boundary still appears whole in one.
splitter_options = {"chunk_size": 500, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
chunks = text_splitter.split_documents(documents)

In [21]:
# Embedding model used to turn chunks into vectors. It is not called directly
# here; it is handed to the vector store when the index is built.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [22]:
# Build the FAISS index from the chunks, using the embedding model above.
vectorstore = FAISS.from_documents(
    documents=chunks,
    embedding=embedding_model,
)

In [23]:
# Assemble the question-answering chain from the Gemini chat model and a
# retriever over the FAISS index.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.2)
retriever = vectorstore.as_retriever()

# return_source_documents=True asks the chain to return the retrieved documents
# as well; the cells below only read the "result" entry.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [24]:
# Question 1: how neural networks are applied in NLP.
query = "how use nural network in nlp"
response = rag_pipeline.invoke(query)

# Header and answer on separate lines, exactly as two consecutive prints would.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the context, neural networks are used in NLP for various tasks, including:

*   **Language Modeling:** Neural language models can be trained using feedforward networks. These models offer advantages over traditional n-gram language models.
*   **Sentiment Classification:** Neural networks, with the addition of a hidden layer, can be used for sentiment classification by representing non-linear interactions between features.
*   **Word Embeddings:** Training word embeddings can be done as part of an NLP task, such as neural language modeling.


In [25]:
# Question 2: who wrote the indexed book.
query = "who author this book"
response = rag_pipeline.invoke(query)

# Header and answer on separate lines, exactly as two consecutive prints would.
print("Answer:", response["result"], sep="\n")

Answer:
The book is "Speech and Language Processing" and it is authored by Daniel Jurafsky & James H. Martin.


In [26]:
# Question 3: whether the book covers LSTMs and RNNs.
query = "this book explain the lstm and rnn"
response = rag_pipeline.invoke(query)

# Header and answer on separate lines, exactly as two consecutive prints would.
print("Answer:", response["result"], sep="\n")

Answer:
Yes, the book explains both LSTMs (Long Short-Term Memory networks) and RNNs (Recurrent Neural Networks).


In [27]:
# Question 4: comparison of Naive Bayes and transformers.
query = "who best naive bays or transformer"
response = rag_pipeline.invoke(query)

# Header and answer on separate lines, exactly as two consecutive prints would.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the context, discriminative classifiers like logistic regression are often more accurate and hence more commonly used than generative classifiers like Naive Bayes. The transformer architecture is introduced as a component for language modeling.


In [28]:
# Question 5: which chapter covers vector semantics.
query = "In this book what the chapter number for  vector semantic"
response = rag_pipeline.invoke(query)

# Header and answer on separate lines, exactly as two consecutive prints would.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, the chapter discussing vector semantic models is Chapter 6.


In [29]:
# Question 6: what the transformer architecture is used for.
query = "what the transformer use case"
response = rag_pipeline.invoke(query)

# Header and answer on separate lines, exactly as two consecutive prints would.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the text, the transformer architecture is used for language modeling.


In [30]:
# Question 7: what Naive Bayes is used for.
query = "what the naive bays use case"
response = rag_pipeline.invoke(query)

# Header and answer on separate lines, exactly as two consecutive prints would.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the provided text, Naive Bayes is used for:

*   **Spam detection:** Deciding if an email is spam.
