## Libraries Install

In [1]:
!pip install chromadb huggingface_hub transformers torch sentence-transformers evaluate

Collecting chromadb
  Downloading chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m 

## Importing Libraries

In [2]:
import torch
from sentence_transformers import SentenceTransformer
from huggingface_hub import login, snapshot_download
from transformers import AutoTokenizer, AutoModelForCausalLM
import chromadb
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json as js
from tqdm import tqdm
import evaluate

from google.colab import drive
from dotenv import load_dotenv
import os
drive.mount('/content/drive')

## Loading environment file to Login to Hugging Face
load_dotenv('/content/.env')                              ## Use your own env and api key and ensure its HUGGINGFACE_API_KEY=hf_xxxxxxxxx
huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY')
login(huggingface_api_key)
print("Successfully logged in to Hugging Face!")

## Loading config file
config = js.load(open('/content/config.json'))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully logged in to Hugging Face!


## Importing Dataset and Models

#### Datasets

In [3]:
## Train Set
dataset_text = load_dataset('rag-datasets/rag-mini-wikipedia', 'text-corpus')

## Test Set
dataset_qa = load_dataset('rag-datasets/rag-mini-wikipedia', 'question-answer')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


#### Models

In [4]:
# Load embedding model (embeddingGemma 300M)
embed_model = SentenceTransformer(config['embed_model'])

# Load LLM model (Llama3.1-1B)
tokenizer = AutoTokenizer.from_pretrained(config['llm_model'])
model = AutoModelForCausalLM.from_pretrained(
    config['llm_model'],

    ## check if model you are loading has given configuration else edit this line
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Pre-Processing Datasets

#### Train Dataset converting to DataFrame for easier context retrieval

In [10]:
## Converting to Dataframe for ease of use
df_text = pd.DataFrame.from_dict(dataset_text['passages'])

#### Test Dataset converting to Table/ Dataframe

In [11]:
## Converting to Dataframe for ease of use
df_qa = pd.DataFrame.from_dict(dataset_qa['test'])

## Chunking and Embeddings

Ensures a Location is there for Storage after Embeddings

In [8]:
!mkdir /content/drive/MyDrive/rag_data
!mkdir /content/drive/MyDrive/rag_data/chromadb
#!rm -rf /content/drive/MyDrive/rag_data/chromadb

mkdir: cannot create directory ‘/content/drive/MyDrive/rag_data’: File exists
mkdir: cannot create directory ‘/content/drive/MyDrive/rag_data/chromadb’: File exists


In [24]:
config = js.load(open('/content/config.json'))

#### Chunking Part

In [25]:
def chunk_text(text, chunk_length, overlap=0):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_length
        chunks.append(text[start:end])
        if end >= len(text):
            break
        start += chunk_length - overlap
    return chunks


## Main where chunkong occurs
chunk_length = config['chunk_length']
overlap = 0  # You can set this to another value if desired

chunked_rows = []
for idx, row in df_text.iterrows():
    for i, chunk in enumerate(chunk_text(row['passage'], chunk_length, overlap)):
        chunked_rows.append({
            'chunk': chunk,
            'orig_id': row['id'],
            'chunk_id': f"{row['id']}_chunk{i}"
        })

## Store in Dataframe for Embeddings
chunked_df = pd.DataFrame(chunked_rows)


#### Embeddings Part

In [26]:
from tqdm import tqdm

bs = config['batch_size']

texts_to_embed = chunked_df['chunk'].tolist()
all_embeddings = []

for i in tqdm(range(0, len(texts_to_embed), bs)):
    batch = texts_to_embed[i:i+bs]
    batch_embeddings = embed_model.encode_document(batch)
    all_embeddings.extend(batch_embeddings)

chunked_df['embedding'] = all_embeddings


100%|██████████| 161/161 [00:12<00:00, 12.66it/s]


## Vector Store

In [28]:
persist_directory = "/content/drive/MyDrive/rag_data/chromadb"
client = chromadb.PersistentClient(path=persist_directory)


collection = client.get_or_create_collection(
    name=f"rag_train_chunks_{config['chunk_length']}_gemma_1"
)

# Convert each embedding to a list of Python floats
embeddings = [[float(val) for val in e] for e in chunked_df['embedding']]

ids = chunked_df['chunk_id'].astype(str).tolist()
documents = chunked_df['chunk'].tolist()
metadatas = chunked_df[['orig_id']].to_dict(orient='records')


batch_size = 20  # Tune as needed for memory and speed
n = len(ids)

for i in tqdm(range(0, n, batch_size)):
    batch_ids = ids[i:i+batch_size]
    batch_documents = documents[i:i+batch_size]
    batch_embeddings = embeddings[i:i+batch_size]
    batch_metadatas = metadatas[i:i+batch_size]

    collection.add(
        documents=batch_documents,
        embeddings=batch_embeddings,
        metadatas=batch_metadatas,
        ids=batch_ids
    )

print('\nSuccessfully Added all Data')


100%|██████████| 257/257 [00:38<00:00,  6.60it/s]


Successfully Added all Data





In [29]:
collections = client.list_collections()  # Returns list of Collection objects
for col in collections:
    print(col.name)


rag_train_chunks_256_LM
rag_train_chunks_512_LM
rag_train_chunks_384_gemma_1
rag_train_chunks_348_LM
rag_train_chunks_256_gemma
rag_train_chunks_384_gemma
rag_train_chunks_384_LM
rag_train_chunks_512_gemma


## RAG Implementation

 Retrieval

In [67]:
## To Run top_k experiments
config = js.load(open('/content/config.json'))
#embed_model = SentenceTransformer(config['embed_model'])
#tokenizer = AutoTokenizer.from_pretrained(config['llm_model'])
#model = AutoModelForCausalLM.from_pretrained(config['llm_model'], torch_dtype=torch.bfloat16, device_map='auto')
persist_directory = "/content/drive/MyDrive/rag_data/chromadb"
client = chromadb.PersistentClient(path=persist_directory)


In [68]:

def retrieval(user_query):

    query_embedding = embed_model.encode([user_query])
    query_embedding_py = [list(map(float, query_embedding[0]))]  # Ensure Python floats

    # Dynamically select the collection based on chunk_length
    collection_name = f"rag_train_chunks_{config['chunk_length']}_gemma_1"
    collection = client.get_or_create_collection(name=collection_name)

    search_results = collection.query(
        query_embeddings=query_embedding_py,
        n_results=config['top_k'],
        include=['documents', 'metadatas']
    )
    retrieved_chunks = search_results['documents'][0]
    retrieved_metadatas = search_results['metadatas'][0]  # Optional

    return retrieved_chunks, retrieved_metadatas


RAG

In [69]:
def RAG(system_prompt, user_query, retrieval):
  context = "\n\n".join(retrieval)
  prompt = (
      f"{system_prompt}\n\n"
      f"Context:\n{context}\n\n"
      f"Question: {user_query}\n\n"
      "Answer:"
  )



  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
  with torch.no_grad():
      output = model.generate(**inputs, max_new_tokens=200)
  full_text = tokenizer.decode(output[0], skip_special_tokens=True)

  # 5. Extract generated answer after 'Answer:' only
  if "Answer:" in full_text:
      generated_answer = full_text.split("Answer:")[1].strip()
  else:
      generated_answer = full_text[len(prompt):].strip()

  return generated_answer


In [70]:
user_query = df_qa['question'][0]
print("User Query:", user_query)

User Query: Was Abraham Lincoln the sixteenth President of the United States?


Instruct Prompt

In [71]:
## Instruct Build prompt (now with system instruction, question, and context)
def instruct(user_query):
  system_prompt = (
      "You are an Expert QnA AI."
  )

  retrieved_chunks, retrieved_metadatas = retrieval(user_query)
  answer = RAG(system_prompt, user_query, retrieved_chunks)
  return answer


Cot Prompt

In [72]:
## CoT prompt for AI system to understand answering
def CoT(user_query):
  system_prompt = (
      "You are an Expert AI who thinks."
  )

  retrieved_chunks, retrieved_metadatas = retrieval(user_query)
  answer = RAG(system_prompt, user_query, retrieved_chunks)
  return answer



Persona Prompt

In [73]:
## Persona

def persona(user_query):
  system_prompt = (
      "You are an expert few words question answering system."
  )
  retrieved_chunks, retrieved_metadatas = retrieval(user_query)
  answer = RAG(system_prompt, user_query, retrieved_chunks)
  return answer

## Evaluation_Phase1

In [74]:
## Empty DataFrame
empty=[]
df_pred = pd.DataFrame(empty)

## True Answers
df_pred['questions'] = df_qa['question'][:15]
df_pred['true_answers'] = df_qa['answer'][:15]
df_pred['true_retrieved']= df_qa['id'][:15]

In [75]:
df_pred

Unnamed: 0,questions,true_answers,true_retrieved
0,Was Abraham Lincoln the sixteenth President of...,yes,0
1,Did Lincoln sign the National Banking Act of 1...,yes,2
2,Did his mother die of pneumonia?,no,4
3,How many long was Lincoln's formal education?,18 months,6
4,When did Lincoln begin his political career?,1832,8
5,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren...",10
6,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell,12
7,When did the Gettysburg address argue that Ame...,1776,14
8,Did Lincoln beat John C. Breckinridge in the 1...,yes,16
9,Was Abraham Lincoln the first President of the...,No,18


In [76]:
for idx, question in tqdm(enumerate(df_pred['questions']), total=len(df_pred['questions'])):
    df_pred.at[idx,'cot'] = CoT(question)
    df_pred.at[idx,'instruct'] = instruct(question)
    df_pred.at[idx,'persona'] = persona(question)


  0%|          | 0/15 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 1/15 [00:20<04:52, 20.91s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 13%|█▎        | 2/15 [00:39<04:14, 19.58s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 20%|██        | 3/15 [01:00<04:02, 20.22s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_i

In [77]:
df_pred

Unnamed: 0,questions,true_answers,true_retrieved,cot,instruct,persona
0,Was Abraham Lincoln the sixteenth President of...,yes,0,"Yes, Abraham Lincoln was indeed the sixteenth ...","Yes, Abraham Lincoln was the sixteenth Preside...",Yes.\n\nContext:\n\n\nQuestion: Was Abraham Li...
1,Did Lincoln sign the National Banking Act of 1...,yes,2,"No, Lincoln did not sign the National Banking ...","Yes, Abraham Lincoln did sign the National Ban...","Yes, Lincoln did sign the National Banking Act..."
2,Did his mother die of pneumonia?,no,4,"Yes, his mother died of pneumonia. I have anal...",No information available about the death of hi...,No. He died of pneumonia. His mother died of...
3,How many long was Lincoln's formal education?,18 months,6,Lincoln's formal education lasted for approxim...,Lincoln's formal education was relatively shor...,Lincoln's formal education lasted for 18 month...
4,When did Lincoln begin his political career?,1832,8,I'm not aware of any information that suggests...,Abraham Lincoln began his political career in ...,In 1832. He was elected to the Illinois state ...
5,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren...",10,The Legal Tender Act of 1862 established Unite...,The Legal Tender Act of 1862 established Unite...,The Legal Tender Act of 1862 established paper...
6,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell,12,"According to historical accounts, it was a pho...",Joshua Speed suggested Lincoln grow a beard. \...,It was a woman who suggested Lincoln grow a be...
7,When did the Gettysburg address argue that Ame...,1776,14,"The Gettysburg Address, delivered by President...",The Gettysburg Address did not argue that Amer...,The Gettysburg Address did not argue that Amer...
8,Did Lincoln beat John C. Breckinridge in the 1...,yes,16,"No, Lincoln did not beat John C. Breckinridge ...","Yes, Abraham Lincoln defeated John C. Breckinr...","Yes, Lincoln won the 1860 presidential electio..."
9,Was Abraham Lincoln the first President of the...,No,18,"No, Abraham Lincoln was not the first Presiden...","No, Abraham Lincoln was not the first Presiden...","No, he was the 16th President of the United St..."


In [78]:
df_pred.to_csv(f'/content/drive/MyDrive/rag_data/predictions_{config['chunk_length']}_{config['top_k']}_prompts.csv', index=False)

Prediction list

In [79]:
predictions_persona = [
    {
        "prediction_text": str(row["persona"]),
        "id": str(idx)
    }
    for idx, row in df_pred.iterrows()
]

In [80]:
predictions_cot = [
    {
        "prediction_text": str(row["cot"]),
        "id": str(idx)
    }
    for idx, row in df_pred.iterrows()
]

In [81]:
predictions_instruct = [
    {
        "prediction_text": str(row["instruct"]),
        "id": str(idx)
    }
    for idx, row in df_pred.iterrows()
]

Answer Set

In [82]:
references = [
    {
        "answers": {
            "answer_start": [0],
            "text": [str(row["true_answers"])]
        },
        "id": str(idx)
    }
    for idx, row in df_pred.iterrows()
]

Persona F1

In [83]:
squad_metric = evaluate.load("squad")
results = squad_metric.compute(predictions=predictions_persona, references=references)
print("F1:", results["f1"])
print("Exact Match:", results["exact_match"])

F1: 5.228847065911152
Exact Match: 0.0


CoT F1

In [84]:
squad_metric = evaluate.load("squad")
results = squad_metric.compute(predictions=predictions_cot, references=references)
print("F1:", results["f1"])
print("Exact Match:", results["exact_match"])

F1: 1.6558436316721852
Exact Match: 0.0


Instruct F1

In [85]:
squad_metric = evaluate.load("squad")
results = squad_metric.compute(predictions=predictions_instruct, references=references)
print("F1:", results["f1"])
print("Exact Match:", results["exact_match"])

F1: 1.922326958600702
Exact Match: 0.0


## **Conclusion**

- Persona prompting outperforms all other types of prompting strategies, and the given would be used for building the Advanced RAG