In [1]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from utils import clean
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment.
# The call's return value is this cell's displayed output.
# NOTE(review): presumably this supplies the API key used by the Gemini chat
# model created further down - confirm which variable name it expects.
load_dotenv()

True

In [3]:
# Folder holding the source documents, relative to the notebook's working
# directory. Named DOCUMENTS_DIR (not `dir`) so the builtin dir() is not shadowed.
DOCUMENTS_DIR = "../../documents"

# Keep only PDF files, because every path is later handed to PyPDFLoader, and
# sort them: os.listdir returns entries in arbitrary order, which would make
# the document/chunk order differ from run to run.
documents_path = sorted(
    f"{DOCUMENTS_DIR}/{name}"
    for name in os.listdir(DOCUMENTS_DIR)
    if name.lower().endswith(".pdf")
)

In [4]:
# Load every PDF listed in `documents_path`, run each loaded document's text
# through `clean`, and gather all of them in one flat list.
documents = []
for pdf_path in documents_path:
    pages = PyPDFLoader(pdf_path).load()

    # Replace the extracted text with its cleaned version, in place.
    for page in pages:
        page.page_content = clean(page.page_content)

    documents += pages

In [5]:
# Split the cleaned documents into chunks (chunk_size=1200, chunk_overlap=120)
# that will be embedded and indexed.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=120,
)
chunks = text_splitter.split_documents(documents)

In [7]:
# Sentence-transformers model used to embed the chunks.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# One embedding vector per chunk, in the same order as `chunks`.
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embedding_model.embed_documents(chunk_texts)

In [8]:
# Build the FAISS index from the vectors already stored in `embeddings`
# instead of calling FAISS.from_documents, which would run the embedding model
# over every chunk a second time.
# Guard against stale kernel state: the vectors must line up one-to-one with
# the current `chunks`.
assert len(embeddings) == len(chunks), (
    "embeddings do not match chunks; re-run the embedding cell"
)
vectorstore = FAISS.from_embeddings(
    text_embeddings=[
        (chunk.page_content, vector) for chunk, vector in zip(chunks, embeddings)
    ],
    # Still needed by the store to embed incoming queries at search time.
    embedding=embedding_model,
    # Carry each chunk's metadata into the index alongside its text.
    metadatas=[chunk.metadata for chunk in chunks],
)

In [9]:
# Expose the vector store as a retriever using its default settings.
retriever = vectorstore.as_retriever()

# Chat model that writes the answers.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.2,
)

# Retrieval + generation chain; the retrieved source documents are returned
# together with the answer.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [10]:
# Ask the pipeline a question and print the answer text.
query = "how use nural network in nlp"
response = rag_pipeline.invoke(query)

# The answer is stored under the "result" key of the response.
print("Answer:", response["result"], sep="\n")

Answer:
Neural networks in NLP can be used in the following ways:

1.  **Sentiment Analysis:** Feedforward networks can be used for sentiment analysis by using hand-built features as input or by learning features from the data using word embeddings (like word2vec or GloVe).
2.  **Representing Input for Classification:** Representing words as embeddings and then applying a pooling function (summing or taking the mean) to the embeddings of all the words in the input text.
3.  **Language Modeling:** Using feedforward networks to look at a fixed-size window of words and making independent predictions along the way.
4.  **Autoregressive Generation:** Using RNN-based neural language models for generating text by priming the generation component with an appropriate context.
5.  **Stacked and Bidirectional RNN Architectures:** Combining feedforward networks with vectors as common inputs and outputs to create complex networks that can be treated as modules and combined in creative ways.


In [16]:
# Ask the pipeline a question and print the answer text.
query = "author this book"
response = rag_pipeline.invoke(query)

# The answer is stored under the "result" key of the response.
print("Answer:", response["result"], sep="\n")

Answer:
I am a large language model, trained by Google. I can provide information to you based on the text you provide, but I am not the author of any book. 

Based on the text you provided, the following authors are mentioned:

*   Mosteller and Wallace (1963)
*   Heckerman et al. (1998)
*   Metsis et al. (2006)
*   Pang et al. (2002)
*   Wang and Manning (2012)
*   Maron (1961)
*   Minsky (1961)
*   Weizenbaum, J. (1976)
*   Zhu, Y. et al. (2015)


In [17]:
# Ask the pipeline a question and print the answer text.
query = "this book explain the lstm and rnn"
response = rag_pipeline.invoke(query)

# The answer is stored under the "result" key of the response.
print("Answer:", response["result"], sep="\n")

Answer:
Yes, the book explains both LSTMs (Long Short-Term Memory networks) and RNNs (Recurrent Neural Networks). It covers the basic structure of RNNs, the issues they face (like vanishing gradients), and how LSTMs address these issues with their context management mechanisms. It also discusses common applications of RNNs in NLP, including sequence labeling, sequence classification, and language modeling.


In [18]:
# Ask the pipeline a question and print the answer text.
query = "who best naive bays or transformer"
response = rag_pipeline.invoke(query)

# The answer is stored under the "result" key of the response.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the text, discriminative classifiers like logistic regression are often more accurate and hence more commonly used than Naive Bayes classifiers. The text does not directly compare Naive Bayes to transformers, but it does mention that transformers are used in language models.


In [19]:
# Ask the pipeline a question and print the answer text.
query = "In this book what the chapter number for  vector semantic"
response = rag_pipeline.invoke(query)

# The answer is stored under the "result" key of the response.
print("Answer:", response["result"], sep="\n")

Answer:
Vector semantics is covered in chapter 6.2.


In [20]:
# Ask the pipeline a question and print the answer text.
query = "what the transformer use case"
response = rag_pipeline.invoke(query)

# The answer is stored under the "result" key of the response.
print("Answer:", response["result"], sep="\n")

Answer:
Based on the text, a transformer's use case is language modeling.


In [21]:
# Ask the pipeline a question and print the answer text.
query = "what the naive bays use case"
response = rag_pipeline.invoke(query)

# The answer is stored under the "result" key of the response.
print("Answer:", response["result"], sep="\n")

Answer:
Naive Bayes classifiers are commonly used in text classification. They are also used in spam filtering.
