# RAG-based Question Answering System
  This Jupyter notebook provides a comprehensive guide to implementing a Retrieval-Augmented Generation (RAG) system for question answering. The notebook walks through the entire process, which includes loading documents, splitting them into manageable chunks, creating embeddings, and setting up a retrieval system to answer questions based on the provided documents.

  

In [1]:
pip install langchain_community langchain_openai unstructured chromadb sentence-transformers langchain_google_genai -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m38.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
pip install nltk --upgrade



## **Importing Libraries**



*   The next step involves importing the required modules to facilitate various functionalities:
*   These imports provide essential tools for document loading, text splitting, vector storage, and integration with language models.





In [3]:
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
import openai
import os
import shutil
import os
import shutil
import nltk
nltk.download('punkt_tab')
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import GoogleGenerativeAI
from langchain_google_genai.chat_models import ChatGoogleGenerativeAI

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## **Document Splitting**


In [4]:
loader = DirectoryLoader('/content/drive/MyDrive/1-10 sample', glob="**/*.txt")
documents = loader.load()

In [5]:
len(documents)

10

## **Document Splitting**



*   Once the documents are loaded, they are split into smaller chunks for more efficient processing:
*   This process divides the documents into manageable chunks, which is crucial for effective retrieval and processing.



In [6]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True
    )
chunks = text_splitter.split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

Split 10 documents into 1147 chunks.


In [7]:
document = chunks[10]
print(document.page_content)
print(document.metadata)

leguminous species. Multiple sequences were identified in several family members, particularly for PsAAP2 (Cluster 3A) and PsAAP7 (Cluster 1), as previously noted. AtAAP7 has yet to be functionally characterised. In contrast to the AAPs, SUTs belong to a smaller gene family. SUTs are considered
{'source': '/content/drive/MyDrive/1-10 sample/doc07.txt', 'start_index': 1987}


## **Generate Embeddings**
Generate embeddings for the document chunks using HuggingFaceEmbeddings.

In [8]:
from langchain.embeddings import HuggingFaceEmbeddings

# Choose an appropriate model from Hugging Face model hub
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
hf = HuggingFaceEmbeddings(
    model_name=embedding_model_name
)

  hf = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
CHROMA_PATH = "chroma/tmp1"
if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

os.makedirs(CHROMA_PATH)

## **Save Chunks to Chroma**
Save the document chunks and their embeddings to a Chroma vector store.

In [11]:
embeddings = HuggingFaceEmbeddings()
db = Chroma.from_documents(
    chunks, embeddings, persist_directory=CHROMA_PATH
)
db.persist()
print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

  embeddings = HuggingFaceEmbeddings()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Saved 1147 chunks to chroma/tmp1.


  db.persist()


## **Create Prompt Template**
Create a prompt template for the QA task.

In [36]:
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

query_text = "Can you briefly explain american and japanese ecomnomies in 3 points?"

## **Perform Similarity Search**
Perform a similarity search on the document chunks using the query text.

In [37]:
results = db.similarity_search_with_relevance_scores(query_text, k=3)
if len(results) == 0 or results[0][1] < 0.6:
    print(f"Unable to find matching results.")

results

Unable to find matching results.


[(Document(metadata={'source': '/content/drive/MyDrive/1-10 sample/doc08.txt', 'start_index': 8820}, page_content='Financial Statistics (IMF). Around the middle of 2008, the differences in the economic situations of both countries appear evident. Since the middle of the 1990s, the Japanese economy has been in recession and deflation; on the other hand, the US economy has expanded stably except for a few years.'),
  0.42066543617168484),
 (Document(metadata={'source': '/content/drive/MyDrive/1-10 sample/doc08.txt', 'start_index': 10436}, page_content='the middle of the 1990s. In the past, US overconsumption has been pointed out; on the contrary, lower Japanese consumption has been pointed out. However, in reality, the two countries have similar characteristics in consumptions. It should be noted that US overconsumption and lower levels of'),
  0.36814022124292267),
 (Document(metadata={'source': '/content/drive/MyDrive/1-10 sample/doc08.txt', 'start_index': 10633}, page_content='charact

In [38]:
context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query_text)
print(prompt)

Human: 
Answer the question based only on the following context:

Financial Statistics (IMF). Around the middle of 2008, the differences in the economic situations of both countries appear evident. Since the middle of the 1990s, the Japanese economy has been in recession and deflation; on the other hand, the US economy has expanded stably except for a few years.

---

the middle of the 1990s. In the past, US overconsumption has been pointed out; on the contrary, lower Japanese consumption has been pointed out. However, in reality, the two countries have similar characteristics in consumptions. It should be noted that US overconsumption and lower levels of

---

characteristics in consumptions. It should be noted that US overconsumption and lower levels of Japanese consumption were found in general. During the sample period, the US economy expanded at a steady rate; however, the Japanese economy has suffered recession and deflation. In Japan, reduced wages

---

Answer the question base

## **Generate Response using Google Generative AI**
Generate a response to the query using Google Generative AI.

In [41]:
from google.colab import userdata
api_gemini_key = userdata.get('api_gemini')

In [42]:
llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=api_gemini_key, temperature=0.2)

response_text = llm.predict(prompt)

In [43]:
response_text

'1. Since the mid-1990s, the Japanese economy has been in recession and deflation, while the US economy has expanded stably except for a few years.\n2. Despite perceptions of overconsumption in the US and lower consumption in Japan, both countries have similar consumption patterns.\n3. During the sample period, the US economy expanded steadily, while the Japanese economy suffered recession and deflation, leading to reduced wages in Japan.'

In [44]:
sources = [doc.metadata.get("source", None) for doc, _score in results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Response: 1. Since the mid-1990s, the Japanese economy has been in recession and deflation, while the US economy has expanded stably except for a few years.
2. Despite perceptions of overconsumption in the US and lower consumption in Japan, both countries have similar consumption patterns.
3. During the sample period, the US economy expanded steadily, while the Japanese economy suffered recession and deflation, leading to reduced wages in Japan.
Sources: ['/content/drive/MyDrive/1-10 sample/doc08.txt', '/content/drive/MyDrive/1-10 sample/doc08.txt', '/content/drive/MyDrive/1-10 sample/doc08.txt']


In [45]:
formatted_response

"Response: 1. Since the mid-1990s, the Japanese economy has been in recession and deflation, while the US economy has expanded stably except for a few years.\n2. Despite perceptions of overconsumption in the US and lower consumption in Japan, both countries have similar consumption patterns.\n3. During the sample period, the US economy expanded steadily, while the Japanese economy suffered recession and deflation, leading to reduced wages in Japan.\nSources: ['/content/drive/MyDrive/1-10 sample/doc08.txt', '/content/drive/MyDrive/1-10 sample/doc08.txt', '/content/drive/MyDrive/1-10 sample/doc08.txt']"