## Libraries Install

In [1]:
!pip install chromadb huggingface_hub transformers torch sentence-transformers evaluate

Collecting chromadb
  Downloading chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m 

## Importing Libraries

In [1]:
import torch
from sentence_transformers import SentenceTransformer
from huggingface_hub import login, snapshot_download
from transformers import AutoTokenizer, AutoModelForCausalLM
import chromadb
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json as js
from tqdm import tqdm
import evaluate
import gc

from google.colab import drive
from dotenv import load_dotenv
import os
drive.mount('/content/drive')

## Loading environment file to Login to Hugging Face
load_dotenv('/content/.env')                              ## Use your own env and api key and ensure its HUGGINGFACE_API_KEY=hf_xxxxxxxxx
huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY')
login(huggingface_api_key)
print("Successfully logged in to Hugging Face!")

## Loading config file
config = js.load(open('/content/config.json'))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully logged in to Hugging Face!


## Importing Dataset and Models

#### Datasets

In [2]:
## Train Set
dataset_text = load_dataset('rag-datasets/rag-mini-wikipedia', 'text-corpus')

## Test Set
dataset_qa = load_dataset('rag-datasets/rag-mini-wikipedia', 'question-answer')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


#### Models

In [3]:
# Load embedding model (embeddingGemma 300M)
embed_model = SentenceTransformer(config['embed_model'])

# Load LLM model (Llama3.1-1B)
tokenizer = AutoTokenizer.from_pretrained(config['llm_model'])
model = AutoModelForCausalLM.from_pretrained(
    config['llm_model'],

    ## check if model you are loading has given configuration else edit this line
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Pre-Processing Datasets

#### Train Dataset converting to DataFrame for easier context retrieval

In [4]:
## Converting to Dataframe for ease of use
df_text = pd.DataFrame.from_dict(dataset_text['passages'])

#### Test Dataset converting to Table/ Dataframe

In [5]:
## Converting to Dataframe for ease of use
df_qa = pd.DataFrame.from_dict(dataset_qa['test'])

## RAG Implementation

Initial

In [6]:
## To Run top_k experiments
config = js.load(open('/content/config.json'))
#embed_model = SentenceTransformer(config['embed_model'])
#tokenizer = AutoTokenizer.from_pretrained(config['llm_model'])
#model = AutoModelForCausalLM.from_pretrained(config['llm_model'], torch_dtype=torch.bfloat16, device_map='auto')
persist_directory = "/content/drive/MyDrive/rag_data/chromadb"
client = chromadb.PersistentClient(path=persist_directory)
collection = client.get_collection(name="rag_train_chunks_512_gemma_1")


**Re-Rank:** Advanced RAG

In [7]:
def rerank(user_query, retrieved_chunks, retreived_metadatas):
    rerank_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    pairs = [[user_query, chunk] for chunk in retrieved_chunks]
    scores = rerank_model.predict(pairs)
    # Sort by rerank scores
    reranked = sorted(zip(retrieved_chunks, retrieved_metadatas, scores), key=lambda x: x[2], reverse=True)

    chunks_sorted = [x[0] for x in reranked]
    metadatas_sorted = [x[1] for x in reranked]

    return chunks_sorted, metadatas_sorted

**Context Window Optimization:** An advanced RAG feature

In [8]:
def pack_context_window(chunks, base_prompt, user_query, max_tokens, tokenizer):
    packed = []
    prompt_base = f"{base_prompt}\n\nQuestion: {user_query}\n\nContext:\n"
    used_tokens = len(tokenizer.encode(prompt_base + "\n\nAnswer:"))
    for chunk in chunks:
        chunk_tokens = len(tokenizer.encode(chunk + '\n'))
        if used_tokens + chunk_tokens > max_tokens:
            break
        packed.append(chunk)
        used_tokens += chunk_tokens
    return packed

Retrieval

In [9]:
## Retrieve the query
def retrieval(user_query):
  embed_model = SentenceTransformer(config['embed_model'])
  query_embedding = embed_model.encode([user_query])
  query_embedding_py = [list(map(float, query_embedding[0]))]  # Ensure Python floats

  # Retrieve most relevant chunks from ChromaDB
  search_results = collection.query(
      query_embeddings=query_embedding_py,
      n_results=config['top_k'],
      include=['documents', 'metadatas']
  )
  retrieved_chunks = search_results['documents'][0]
  retrieved_metadatas = search_results['metadatas'][0]  # Optional

  return retrieved_chunks, retrieved_metadatas


RAG

In [10]:
def RAG(system_prompt, user_query, chunks_sorted):

    max_context_tokens = 2048  # set this to your model's max context window

    # Context optimization
    optimized_chunks = pack_context_window(
        chunks_sorted, system_prompt, user_query, max_context_tokens, tokenizer
    )
    context = "\n\n".join(optimized_chunks)

    # Prompt assembly
    prompt = (
        f"{system_prompt}\n\n"
        f"Question: {user_query}\n\n"
        f"Context:\n{context}\n\n"
        "Answer:"
    )


    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=200)
    full_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract answer
    if "Answer:" in full_text:
        generated_answer = full_text.split("Answer:")[1].strip()
    else:
        generated_answer = full_text[len(prompt):].strip()

    return generated_answer

In [11]:
user_query = df_qa['question'][0]
print("User Query:", user_query)

User Query: Was Abraham Lincoln the sixteenth President of the United States?


Persona Prompt

In [12]:
## Persona

def persona(user_query):
  system_prompt = (
      "You are an expert few words question answering system."
  )
  retrieved_chunks, retrieved_metadatas = retrieval(user_query)
  answer = RAG(system_prompt, user_query, retrieved_chunks)
  return answer

## Results

In [32]:
## Empty DataFrame
empty=[]
df_pred = pd.DataFrame(empty)

## True Answers
df_pred['questions'] = df_qa['question'][:181]
df_pred['true_answers'] = df_qa['answer'][:181]
df_pred['true_retrieved']= df_qa['id'][:181]

In [33]:
torch.cuda.empty_cache()


In [34]:
for idx, question in tqdm(enumerate(df_pred['questions']), total=180):
    if idx >= 180:
        break

    ## Memory Management
    if (idx + 1) % 5 == 0:
      gc.collect()
      torch.cuda.empty_cache()

    df_pred.at[idx,'persona'] = persona(question)


  0%|          | 0/180 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 1/180 [00:11<34:44, 11.64s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 2/180 [00:25<38:55, 13.12s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 3/180 [00:39<39:13, 13.29s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 4/180 [00:48<34:03, 11.61s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 5/180 [00:56<30:14, 10.37s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 6/180 [01:03<26:52,  9.27s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 7/180 [01:15<28:47,  9.98s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 8/180 [01:28<31:51, 11.12s/it]S

In [35]:
df_pred.to_csv('/content/drive/MyDrive/rag_data/persona_results_ARAG_181.csv', index=False)

In [36]:
df_pred

Unnamed: 0,questions,true_answers,true_retrieved,persona
0,Was Abraham Lincoln the sixteenth President of...,yes,0,Yes\n\nExplanation:\n------------\n\nAbraham L...
1,Did Lincoln sign the National Banking Act of 1...,yes,2,"Yes, Lincoln signed the National Banking Act o..."
2,Did his mother die of pneumonia?,no,4,Yes\n```python\ndef answer_question(question):...
3,How many long was Lincoln's formal education?,18 months,6,"Lincoln was mostly self-educated, having very ..."
4,When did Lincoln begin his political career?,1832,8,"Lincoln's political career began in 1832, when..."
...,...,...,...,...
176,When was the Six Day War?,1967,399,The Six Day War took place in June 1967. The w...
177,What religions has Egypt outlawed?,"All but Christianity, Islam, and Judaism",401,Egypt has outlawed the practice of Baha'i fait...
178,What is the poulation of Egypt?,more than 78 million,403,The population of Egypt is approximately 101 m...
179,Why does most of Egypt's population live near ...,"the only arable agricultural land is found there,",405,"The Nile River provides water for irrigation, ..."
