# Retrieval - Augmented Question Answering

In [4]:
%pip install sentence-transformers faiss-cpu transformers


Collecting sentence-transformers
  Using cached sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
Collecting transformers
  Using cached transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Using cached sentence_transformers-4.0.2-py3-none-any.whl (340 kB)
Using cached transformers-4.51.2-py3-none-any.whl (10.4 MB)
Using cached huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl (2.4 MB)
Installing collected packages: huggingface-hub, tokenizers, transformers, sentence-transformers
Successfully installed huggingface-hub-0.30.2 sentence-transformers-4.0.2 tokenizers-0.21.1 transformers-4.51.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline


In [2]:
# Load the dataset
df = pd.read_csv("cleaned_hotel_booking.csv")

columns_to_use = ["hotel","is_canceled", "lead_time", "country","adr","total_nights"]
df = df[columns_to_use]
data_txt = df.apply(lambda row: f"""Hotel: {row.hotel},
    Canceled: {row.is_canceled},
    Lead Time: {row.lead_time},
    ADR: {row.adr},
    Nights: {row.total_nights}""",axis=1).tolist()

In [3]:
# Generating embeddings 

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(data_txt,show_progress_bar=True)
embeddings = np.array(embeddings).astype('float32')

Batches:   0%|          | 0/3731 [00:00<?, ?it/s]

In [13]:
%pip install huggingface_hub[hf_xet]

Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.0.3-cp37-abi3-win_amd64.whl.metadata (498 bytes)
Downloading hf_xet-1.0.3-cp37-abi3-win_amd64.whl (4.1 MB)
   ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
   ----- ---------------------------------- 0.5/4.1 MB 3.4 MB/s eta 0:00:02
   ---------- ----------------------------- 1.0/4.1 MB 2.8 MB/s eta 0:00:02
   ----------------- ---------------------- 1.8/4.1 MB 3.1 MB/s eta 0:00:01
   ------------------------- -------------- 2.6/4.1 MB 3.1 MB/s eta 0:00:01
   --------------------------------- ------ 3.4/4.1 MB 3.2 MB/s eta 0:00:01
   -------------------------------------- - 3.9/4.1 MB 3.3 MB/s eta 0:00:01
   -------------------------------------- - 3.9/4.1 MB 3.3 MB/s eta 0:00:01
   ---------------------------------------- 4.1/4.1 MB 2.7 MB/s eta 0:00:00
Installing collected packages: hf-xet
Successfully installed hf-xet-1.0.3
Note: you may need to restart the kernel to use updated package

In [4]:
# Creating FAISS Index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
faiss.write_index(index, "faiss_index.idx")
pd.Series(data_txt).to_csv("text_chunks.csv", index=False)

In [6]:
#Retrieval and Answering
index = faiss.read_index("faiss_index.idx")
text_chunks = pd.read_csv("text_chunks.csv", header=None)[0].tolist()

query = "what is the average price of a hotel booking in Portugal?"
query_vec = model.encode([query]).astype('float32')
_, indices = index.search(query_vec, k=5)
retrieved = [text_chunks[i] for i in indices[0]]

context = "\n".join(retrieved)
prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer"

In [7]:
#Using Pipeline
qa_model = pipeline(
    "text-generation",
    model="tiiuae/falcon-rw-1b",
    tokenizer="tiiuae/falcon-rw-1b",
    max_new_tokens=50
)

response = qa_model(prompt)[0]['generated_text']
print(response)


config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Context:
Hotel: resort hotel,
    Canceled: 0,
    Lead Time: 163,
    ADR: 215.0,
    Nights: 4
Hotel: resort hotel,
    Canceled: 0,
    Lead Time: 301,
    ADR: 71.25,
    Nights: 13
Hotel: resort hotel,
    Canceled: 1,
    Lead Time: 28,
    ADR: 219.0,
    Nights: 3
Hotel: resort hotel,
    Canceled: 1,
    Lead Time: 149,
    ADR: 199.0,
    Nights: 7
Hotel: resort hotel,
    Canceled: 0,
    Lead Time: 14,
    ADR: 157.82,
    Nights: 2

Question: what is the average price of a hotel booking in Portugal?
Answer:
The average price of a hotel booking in Portugal is €1,842.
Question: what is the average price of a hotel booking in Portugal?
Answer:
The average price of a hotel booking in Portugal is €1,
