In [None]:
!pip install datasets
!pip install sentence-transformers ChromaDB
!pip install rank-bm25
!pip install torch transformers
!pip install huggingface_hub
!pip install langchain-community
!pip install accelerate


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

## **Adding the Imports**

In [None]:
import numpy as np
import os
import torch
import torch.nn as nn
from torchvision import datasets, transforms, models
from torch.utils.data import random_split
import torch.optim as optim
import matplotlib.pyplot as plt
import logging

logger = logging.getLogger(__name__)


## **Loading the RAGBench Dataset**

In [None]:
from datasets import load_dataset

# load the full ragbench dataset
ragbench = {}
datasets = ['emanual']
# datasets = ['emanual', 'expertqa', 'finqa'] # d2
for dataset in datasets:
  ragbench[dataset] = load_dataset("rungalileo/ragbench", dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/24.7k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/288k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/305k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1054 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/132 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/132 [00:00<?, ? examples/s]

In [None]:
len(ragbench['cuad']['train'])

1530

## **Chunking the Dataset**

In [None]:
# New code - 12/4 10 pm

from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("BAAI/LLM-Embedder")

# Sliding window configuration
TOKEN_LIMIT = 512
SLIDING_WINDOW_OVERLAP = 100  # Overlap between consecutive chunks (in tokens)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

### **Sliding window chunking**

In [None]:
# Function for chunking with token limit and sliding window
def chunk_with_token_limit(text, token_limit, overlap):
    sentences = sent_tokenize(text)  # Split text into sentences
    chunks = []  # Store resulting chunks
    current_chunk = []  # Temporarily hold sentences for the current chunk
    current_chunk_tokens = 0  # Token count for the current chunk

    for sentence in sentences:
        # Tokenize the sentence and calculate its token count
        sentence_tokens = tokenizer.tokenize(sentence)
        num_tokens = len(sentence_tokens)

        # print(f"Tokens: {sentence_tokens[0]}")

        # If adding this sentence exceeds the token limit
        if current_chunk_tokens + num_tokens > token_limit:
            # Save the current chunk
            chunk_text = " ".join(current_chunk)
            chunks.append(chunk_text)

            # Prepare the next chunk with overlap
            overlap_tokens = tokenizer.tokenize(" ".join(current_chunk[-1:]))
            current_chunk = [sentence for sentence in current_chunk[-(overlap // len(overlap_tokens)) :]] if current_chunk else []
            current_chunk_tokens = sum(len(tokenizer.tokenize(sent)) for sent in current_chunk)

        # Add the sentence to the current chunk
        current_chunk.append(sentence)
        current_chunk_tokens += num_tokens

    # Add the last chunk if it exists
    if current_chunk:
        chunk_text = " ".join(current_chunk)
        chunks.append(chunk_text)

    return chunks

### **Small to Big Chunking**

In [None]:
def small_to_big_chunking(text, token_limit):
    sentences = sent_tokenize(text)  # Start small: split into sentences
    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in sentences:
        sentence_tokens = tokenizer.tokenize(sentence)
        num_tokens = len(sentence_tokens)

        # If adding this sentence exceeds the token limit, finalize the current chunk
        if current_tokens + num_tokens > token_limit:
            chunks.append(" ".join(current_chunk))
            current_chunk = []  # Start a new chunk
            current_tokens = 0

        # Add sentence to the current chunk
        current_chunk.append(sentence)
        current_tokens += num_tokens

    # Add the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


### **Document Pre-processing for chunking**

In [None]:
# Function for processing the document and adding identifiers
def process_document_with_identifiers(document):
    processed_data = []
    title_count = 0

    for section in document:
        section_chunks = []
        passage_count = ord("a")  # Initialize passage letter as 'a'

        sentences = sent_tokenize(section)
        for sentence in sentences:
            if sentence.startswith("Title:"):
                identifier = f"{title_count}a"
                chunked_texts = chunk_with_token_limit(sentence, TOKEN_LIMIT, SLIDING_WINDOW_OVERLAP)
                for chunk in chunked_texts:
                    section_chunks.append([identifier, chunk])
                title_count += 1
                passage_count = ord("a")  # Reset passage count for the next title
            elif sentence.startswith("Passage:"):
                identifier = f"{title_count - 1}{chr(passage_count)}"
                chunked_texts = chunk_with_token_limit(sentence, TOKEN_LIMIT, SLIDING_WINDOW_OVERLAP)
                for chunk in chunked_texts:
                    section_chunks.append([identifier, chunk])
                passage_count += 1
            else:
                identifier = f"{title_count - 1}{chr(passage_count)}"
                chunked_texts = chunk_with_token_limit(sentence, TOKEN_LIMIT, SLIDING_WINDOW_OVERLAP)
                for chunk in chunked_texts:
                    section_chunks.append([identifier, chunk])
                passage_count += 1

        processed_data.append(section_chunks)
    return processed_data

##Try to get tokens from GROQ api instead of using model locally (not in use)

In [None]:
import groq

# Initialize GROQ API client (adjust based on your setup)
client = groq.Client(api_key="your_api_key", endpoint="your_groq_endpoint")

# Define the model and tokenizer (example uses BERT-like tokenization)
# Assume the tokenizer is a part of your local library or the GROQ API
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("all-MiniLM-L6-v2")  # Replace with your model's tokenizer

# Input sentence or token
input_text = "Hello, Groq!"

# Tokenize the input
tokenized_input = tokenizer(input_text, return_tensors="pt")

# Prepare the data payload
payload = {
    "inputs": tokenized_input["input_ids"].tolist(),  # Tokenized IDs
    "attention_mask": tokenized_input["attention_mask"].tolist(),  # Attention mask
}

# Send the payload to GROQ API for inference
response = client.infer(model_id="your_model_id", inputs=payload)

# Extract the vector embeddings
# Assume the response contains a field "embeddings" with token vectors
embeddings = response.get("embeddings")

# Display the vector for each token
for i, token in enumerate(tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"][0])):
    print(f"Token: {token}, Vector: {embeddings[i]}")


ModuleNotFoundError: No module named 'groq'

In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## **Generate Embeddings**

In [None]:
# Code on 12/4, 10 pm

from datasets import load_dataset
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


# Insert embeddings into chromadb in batches as we are seeing memory issues when doing it at once.
#datasets = ['covidqa']  # List of dataset names

#datasets = ['covidqa', 'cuad', 'delucionqa', 'emanual']
#datasets = ['expertqa','finqa', 'hagrid', 'hotpotqa']
#datasets = ['msmarco', 'pubmedqa','tatqa', 'techqa']

# Initialize storage for documents, IDs, and metadata
all_documents = []
all_ids = []
all_metadatas = []

# Process each dataset
doc_idx = 0  # Global document index for unique IDs
for dataset in datasets:
    data = load_dataset("rungalileo/ragbench", dataset, split="train")

    for idx, row in tqdm(enumerate(data), desc=f"Processing {dataset}"):
        # Extract document text
        doc_text = row.get('documents', '')

        # Skip if no documents found
        if not doc_text:
            continue

        # Process the document
        processed_output = process_document_with_identifiers(doc_text)

        # Populate the lists
        for section_idx, section in enumerate(processed_output):
            for item_idx, (prefix, content) in enumerate(section):
                # Add the document
                document = f"[{prefix}] {content}"
                all_documents.append(document)

                # Construct a globally unique ID
                doc_id = f"{dataset}_{doc_idx}_{section_idx}_{item_idx}"
                all_ids.append(doc_id)

                # Construct metadata
                metadata = {
                    "dataset": dataset,
                    "global_index": doc_idx,
                    "section_index": section_idx,
                    "item_index": item_idx,
                    "prefix": prefix,
                    "type": "Title" if prefix.endswith("a") else "Passage",
                }
                all_metadatas.append(metadata)

        doc_idx += 1  # Increment global document index

# Step 4: Generate Embeddings
#embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Pretrained sentence transformer
embedder = SentenceTransformer("BAAI/LLM-Embedder")  # Pretrained sentence transformer
batch_size = 2500  # Adjust based on available memory

# Generate embeddings in batches
all_embeddings = []
for i in tqdm(range(0, len(all_documents), batch_size), desc="Generating embeddings"):
    batch_docs = all_documents[i:i + batch_size]
    batch_embeddings = embedder.encode(batch_docs, show_progress_bar=True)
    all_embeddings.extend(batch_embeddings)


Processing emanual: 1054it [00:05, 189.35it/s]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:   8%|▊         | 1/13 [00:05<01:05,  5.50s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  15%|█▌        | 2/13 [00:09<00:51,  4.68s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  23%|██▎       | 3/13 [00:15<00:50,  5.03s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  31%|███       | 4/13 [00:19<00:44,  4.89s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  38%|███▊      | 5/13 [00:23<00:36,  4.55s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  46%|████▌     | 6/13 [00:27<00:30,  4.37s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  54%|█████▍    | 7/13 [00:32<00:26,  4.41s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  62%|██████▏   | 8/13 [00:36<00:21,  4.37s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  69%|██████▉   | 9/13 [00:40<00:17,  4.27s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  77%|███████▋  | 10/13 [00:44<00:12,  4.27s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  85%|████████▍ | 11/13 [00:49<00:08,  4.39s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  92%|█████████▏| 12/13 [00:53<00:04,  4.30s/it]

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

Generating embeddings: 100%|██████████| 13/13 [00:55<00:00,  4.24s/it]


# **Store Embeddings into a vector DB**

In [None]:
!pip install chromadb



In [None]:
import chromadb

client = chromadb.PersistentClient(path="./content/rag_chroma_db_d2")

collection = client.create_collection(name="ragbench_collection_d2_v0.1")


for i in tqdm(range(0, len(all_documents), batch_size), desc="Adding data to ChromaDB"):
    batch_embeddings = all_embeddings[i:i + batch_size]
    batch_metadatas = all_metadatas[i:i + batch_size]
    batch_documents = all_documents[i:i + batch_size]
    batch_ids = all_ids[i:i + batch_size]

    # Add the batch to the ChromaDB collection
    collection.add(
        embeddings=batch_embeddings,
        metadatas=batch_metadatas,
        documents=batch_documents,
        ids=batch_ids
    )


Adding data to ChromaDB: 100%|██████████| 13/13 [01:26<00:00,  6.68s/it]


## **Verifying retrival logic for the Relevant documents**

In [None]:
question = "How do I select Natural mode?"
query_embedding = embedder.encode(question).tolist()

# Search for relevant chunks in the vector database
results = collection.query(query_embeddings=[query_embedding], n_results=10)
for doc in results["documents"][0]:
    print("Relevant Docs:\n", doc)

Relevant Docs:
 [-1m] Setting up the Ambient Mode details In the Ambient Mode browser screen, move the focus to , and then press the Select button.
Relevant Docs:
 [-1m] Setting up the Ambient Mode details In the Ambient Mode browser screen, move the focus to , and then press the Select button.
Relevant Docs:
 [-1m] Setting up the Ambient Mode details In the Ambient Mode browser screen, move the focus to , and then press the Select button.
Relevant Docs:
 [-1m] Setting up the Ambient Mode details In the Ambient Mode browser screen, move the focus to , and then press the Select button.
Relevant Docs:
 [-1m] Setting up the Ambient Mode details In the Ambient Mode browser screen, move the focus to , and then press the Select button.
Relevant Docs:
 [-1m] Setting up the Ambient Mode details In the Ambient Mode browser screen, move the focus to , and then press the Select button.
Relevant Docs:
 [-1m] Setting up the Ambient Mode details In the Ambient Mode browser screen, move the focus to 

## **Retrival Logic 01 (not in use)**

In [None]:
# Step 6: Query the Retriever
from rank_bm25 import BM25Okapi

question = "What is the effect of Glycyrrhizin in viral infections?"
query_embedding = embedder.encode(question).tolist()

# Search for relevant chunks in the vector database
results = collection.query(query_embeddings=[query_embedding], n_results=10)
for doc in results["documents"][0]:
    print("Relevant Docs:\n", doc)

# we have to rank these responses
# use BM25 algo

documents = results['documents'][0]

# Tokenize the documents
tokenized_docs = [doc.lower().split() for doc in documents]

# Create BM25 object
bm25 = BM25Okapi(tokenized_docs)

# Query
tokenized_query = question.lower().split()

# Get scores for the query
doc_scores = bm25.get_scores(tokenized_query)

# Get top documents
ranked_docs = sorted(enumerate(doc_scores), key=lambda x: x[1], reverse=True)
for doc_id, score in ranked_docs:
    print(f"Score: {score:.3f} | Document: {documents[doc_id]}")


Relevant Docs:
 [2d] However, these effects were observed only in concentrations $200 mg/ml when glycyrrhizin was added during the virus adsorption period.
Relevant Docs:
 [0a] 8-~° Glycyrrhizin also has been found to inhibit virus growth in mice infected with influenza viruses and to improve outcomes.
Relevant Docs:
 [2c] Moreover, glycyrrhizin was shown to influence seasonal influenza A virus replication through interaction with the cell membrane .
Relevant Docs:
 [1b] Also, glycyrrhizin is a known antioxidant and antioxidants were already shown to interfere with influenza A virus replication and virus-induced pro-inflammatory responses .
Relevant Docs:
 [2a] Title: Glycyrrhizin Exerts Antioxidative Effects in H5N1 Influenza A Virus-Infected Cells and Inhibits Virus Replication and Pro-Inflammatory Gene Expression
Passage: Experimental results suggested that glycyrrhizin might be able to affect seasonal influenza A virus disease by antiviral and immunomodulatory effects .
Relevant Do

## **Hugging Face Integration (not in use) **

In [None]:
from huggingface_hub import login
login(token = "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")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from transformers import pipeline
import torch

# Load the model and tokenizer
#model_name = "meta-llama/Llama-2-7b-hf" # for larger context length old one
model_name = "meta-llama/Llama-3.2-3B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenizer.model_max_length = 4000  # Setting the tokenizer's context length

# Load the Hugging Face pipeline
hf_pipeline = pipeline(
    "text-generation",
    model=model_name,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,  # Using float16 for efficiency
#    max_length = 2000
    max_new_tokens=650,
    truncation=True,
    device_map="auto"
)

# Wrap in a LangChain-compatible LLM
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Define a prompt template for RAG
template = """
Please provide a response to the query below, strictly adhering to the
information presented in the following documents.
Do not generate any text beyond what is explicitly stated in the documents.

Context: {context}

Question: {query}

Answer:
"""
# documents = results['documents'][0]
prompt_template = PromptTemplate(input_variables=["context", "query"], template=template)
llm_chain = LLMChain(llm=llm, prompt=prompt_template)
# filled_prompt = prompt_template.format(context=documents, query=question)


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

KeyboardInterrupt: 

## **Llama Integration using Hugging face Endpoint (not in use)**

In [None]:
import time
import requests

# Hugging Face endpoint details
model_name = "meta-llama/Llama-3.2-3B"  # Replace with your model name
API_URL = f"https://api-inference.huggingface.co/models/{model_name}"
API_TOKEN = "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"  # Replace with your Hugging Face API token
headers = {"Authorization": f"Bearer {API_TOKEN}"}

def query_hf_endpoint(prompt, max_new_tokens=1000):
    """
    Send a request to the Hugging Face API for text generation.
    """
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": 0.7,
            "top_p": 0.9,
            "truncation": True,
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        return response.json()
    else:
        raise Exception(f"Request failed with status code {response.status_code}: {response.text}")

def query_hf_endpoint_with_retry(prompt, max_new_tokens=650, retries=5, wait_time=10):
    """
    Attempt to query the Hugging Face API, with retries and exponential backoff
    if the model is loading.
    """
    for attempt in range(retries):
        try:
            return query_hf_endpoint(filled_prompt, max_new_tokens)
        except Exception as e:
            if "Model is currently loading" in str(e) and attempt < retries - 1:
                print(f"Error from model: {str(e)}")
                print(f"Model is loading. Retrying in {wait_time} seconds... (Attempt {attempt + 1} of {retries})")
                time.sleep(wait_time)
                wait_time *= 2  # Exponential backoff
            else:
                raise

def check_model_status():
    """
    Check if the model is ready using the Hugging Face status endpoint.
    """
    status_url = f"https://api-inference.huggingface.co/status/{model_name}"
    response = requests.get(status_url, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        raise Exception(f"Status check failed: {response.status_code} - {response.text}")



## **Retrival of Relevant Chunks**

In [None]:
# Function to retrieve relevant chunks
def retrieve_docs(query, top_k=5):
    # Generate embedding for the query
    query_embedding = embedder.encode(query).tolist()
    # Perform vector search to find relevant chunks
    results = collection.query(query_embeddings=[query_embedding], n_results=top_k)
    # Extract the retrieved chunks
    chunks = results["documents"]
    # should sort and push context - but later
    return chunks

## **RAG process: retrieval + generation**

In [None]:
# Full RAG process: retrieval + generation

def retrieval_augmented_generation(query):
    # Step 1: Retrieve relevant chunks
    retrieved_chunks = retrieve_docs(query)

    # Flatten the list if necessary
    if any(isinstance(chunk, list) for chunk in retrieved_chunks):
        retrieved_chunks = [item for sublist in retrieved_chunks for item in (sublist if isinstance(sublist, list) else [sublist])]

    context = "\n".join(retrieved_chunks)  # Join the flattened list
    print("Context length:", len(context))
    print("Context preview:", context[:500])

    # Step 2: Generate answer using LLaMA model with retrieved context
    print("Query >>>>> ", query)
    print("context >>>>> ", context)

    try:
         response = llm_chain.run({"context": context, "query": query})
         print("Raw LLM Response >>>>>", response)
         # Check model status before querying
        #  status = check_model_status()
        #  print(f"Model status: >>>>>>>>> {status}")
        #  if status.get("loaded", False):
        #     response = query_hf_endpoint_with_retry(filled_prompt)
        #     print(response)
        #  else:
        #     print(">>>>>>>>Model is not ready. Please wait and try again later.<<<<<<<<<<")

         # Extract the answer from the response
         answer_start = response.find("Answer:")
         if answer_start != -1:
            answer = response[answer_start + len("Answer:"):].strip()
         else:
            answer = response.strip()

         print("Cleaned Answer >>>>>", answer)

    except Exception as e:
       print(">>>>>>>>>Error during LLM Chain run:<<<<<<<", e)
       answer = None

    return context, answer

In [None]:
def wait_for_model_to_load():
    while True:
        status = check_model_status()
        print(f"Model status: {status}")
        if status.get("loaded"):
            print("Model is loaded and ready for inference!")
            break
        else:
            print("Model is not loaded yet. Retrying in 30 seconds...")
            time.sleep(30)

wait_for_model_to_load()

Model status: {'loaded': False, 'state': 'Loadable', 'compute_type': 'cpu', 'framework': 'text-generation-inference'}
Model is not loaded yet. Retrying in 30 seconds...


KeyboardInterrupt: 

# **LLM Inference with groq**

In [None]:
! pip install groq
! pip install -q langchain langchain-groq

Collecting groq
  Downloading groq-0.14.0-py3-none-any.whl.metadata (14 kB)
Downloading groq-0.14.0-py3-none-any.whl (109 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/109.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.5/109.5 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.14.0


In [None]:
! export GROQ_API_KEY="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"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from transformers import pipeline
import torch
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

In [None]:
query = "How do I change the name of the TV on a network?"

In [None]:

chunks = retrieve_docs(query)

# Flatten the list if necessary
if any(isinstance(chunk, list) for chunk in chunks):
    chunks = [item for sublist in chunks for item in (sublist if isinstance(sublist, list) else [sublist])]

context = "\n".join(chunks)

print("Query >>>>> ", query)
print("context >>>>> ", context)

chat = ChatGroq(temperature=0.3, groq_api_key="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", model_name="llama3-8b-8192")

prompt=ChatPromptTemplate.from_template(
"""
Please provide a response to the query below, strictly adhering to the
information presented in the following documents.
Do not generate any text beyond what is explicitly stated in the documents.

Context: {context}

Question: {query}

Answer:
"""
)

chain = prompt | chat

groq_response = chain.invoke({"context": context, "query": query})

print("groq_response>>>>>>>>> ",groq_response)

answer = groq_response

Query >>>>>  How do I change the name of the TV on a network?
context >>>>>  [-1b] Settings General System Manager Device Name Try Now You can change the name of the TV on the network.
[-1b] Settings General System Manager Device Name Try Now You can change the name of the TV on the network.
[-1b] Settings General System Manager Device Name Try Now You can change the name of the TV on the network.
[-1b] Settings General System Manager Device Name Try Now You can change the name of the TV on the network.
[-1a] Changing the name of the TV on a network.
groq_response>>>>>>>>>  content='According to the documents, to change the name of the TV on a network, you can:\n\n"Try Now You can change the name of the TV on the network."' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 35, 'prompt_tokens': 175, 'total_tokens': 210, 'completion_time': 0.029166667, 'prompt_time': 0.021095112, 'queue_time': 0.017219208, 'total_time': 0.050261779}, 'model_name': 'llama3-8b-81

In [None]:
def query_response_from_llm(query: str):
    chunks = retrieve_docs(query)
    # Flatten the list if necessary
    if any(isinstance(chunk, list) for chunk in chunks):
      chunks = [item for sublist in chunks for item in (sublist if isinstance(sublist, list) else [sublist])]

    context = "\n".join(chunks)

    chat = ChatGroq(temperature=0.3, groq_api_key="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", model_name="llama3-8b-8192")

    prompt=ChatPromptTemplate.from_template(
      """
      Please provide a response to the query below, strictly adhering to the
      information presented in the following documents.
      Do not generate any text beyond what is explicitly stated in the documents.

      Context: {context}

      Question: {query}

      Answer:
      """
    )

    chain = prompt | chat

    groq_response = chain.invoke({"context": context, "query": query})

    print("groq_response>>>>>>>>> ",groq_response)

    answer = groq_response
    return answer

## **USER QUERY**

In [None]:

answer = query_response_from_llm(query)
print("answer  >>after>>> ", answer)
answer_pass_llm = answer

groq_response>>>>>>>>>  content='Try Now You can change the name of the TV on the network.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 15, 'prompt_tokens': 182, 'total_tokens': 197, 'completion_time': 0.0125, 'prompt_time': 0.022856648, 'queue_time': 0.018633527999999996, 'total_time': 0.035356648}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_6a6771ae9c', 'finish_reason': 'stop', 'logprobs': None} id='run-a39d6e9e-5c95-4269-ad99-fdf125867e8d-0' usage_metadata={'input_tokens': 182, 'output_tokens': 15, 'total_tokens': 197}
answer  >>after>>>  content='Try Now You can change the name of the TV on the network.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 15, 'prompt_tokens': 182, 'total_tokens': 197, 'completion_time': 0.0125, 'prompt_time': 0.022856648, 'queue_time': 0.018633527999999996, 'total_time': 0.035356648}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_6a6771ae9c', 'finish_reason': 'stop

## **PROMPT for generating metrics as JSON response**

In [None]:
def generate_prompt():
    """
    Generate a prompt template for assessing the support and relevance of an LLM-generated response.
    """
    return """
    I asked someone to answer a question based on one or more documents.
    Your task is to review their response and assess whether or not each sentence
    in that response is supported by text in the documents. And if so, which
    sentences in the documents provide that support. You will also tell me which
    of the documents contain useful information for answering the question, and
    which of the documents the answer was sourced from.
    Here are the documents, each of which is split into sentences.Alongside each
    sentence is associated key, such as ’[0a].’ or ’[0b].’ that you can use to refer
    to it:

    ‘‘‘
    {documents}
    ‘‘‘
    The question was:
    ‘‘‘
    {question}
    ‘‘‘

    Here is their response, split into sentences. Alongside each sentence is
    associated key, such as ’a.’ or ’b.’ that you can use to refer to it. Note
    that these keys are unique to the response, and are not related to the keys
    in the documents:
    ‘‘‘
    {answer}
    ‘‘‘
    You must respond with a JSON object matching this schema:
    ‘‘‘
    {{
    "relevance_explanation": string,
    "all_relevant_sentence_keys": [string],
    "overall_supported_explanation": string,
    "overall_supported": boolean,
    "sentence_support_information": [
    {{
    "response_sentence_key": string,
    "explanation": string,
    "supporting_sentence_keys": [string],
    "fully_supported": boolean
    }},
    ],
    "all_utilized_sentence_keys": [string]
    }}
    ‘‘‘
    The relevance_explanation field is a string explaining which documents
    contain useful information for answering the question. Provide a step-by-step
    breakdown of information provided in the documents and how it is useful for
    answering the question.
    The all_relevant_sentence_keys field is a list of all document sentences keys
    (e.g. ’0a’) that are relevant to the question. Include every sentence that is
    useful and relevant to the question, even if it was not used in the response,
    or if only parts of the sentence are useful. Ignore the provided response when
    making this judgement and base your judgement solely on the provided documents
    and question. Omit sentences that, if removed from the document, would not
    impact someone’s ability to answer the question.
    The overall_supported_explanation field is a string explaining why the response
    *as a whole* is or is not supported by the documents. In this field, provide a
    step-by-step breakdown of the claims made in the response and the support (or
    lack thereof) for those claims in the documents. Begin by assessing each claim
    separately, one by one; don’t make any remarks about the response as a whole
    until you have assessed all the claims in isolation.
    The overall_supported field is a boolean indicating whether the response as a
    whole is supported by the documents. This value should reflect the conclusion
    you drew at the end of your step-by-step breakdown in overall_supported_explanation.
    In the sentence_support_information field, provide information about the support
    *for each sentence* in the response.
    The sentence_support_information field is a list of objects, one for each sentence
    in the response. Each object MUST have the following fields:
    - response_sentence_key: a string identifying the sentence in the response.
    This key is the same as the one used in the response above.

    - explanation: a string explaining why the sentence is or is not supported by the
    documents.
    - supporting_sentence_keys: keys (e.g. ’[0a]’) of sentences from the documents that
    support the response sentence. If the sentence is not supported, this list MUST
    be empty. If the sentence is supported, this list MUST contain one or more keys.
    In special cases where the sentence is supported, but not by any specific sentence,
    you can use the string "supported_without_sentence" to indicate that the sentence
    is generally supported by the documents. Consider cases where the sentence is
    expressing inability to answer the question due to lack of relevant information in
    the provided context as "supported_without_sentence". In cases where the sentence
    is making a general statement (e.g. outlining the steps to produce an answer, or
    summarizing previously stated sentences, or a transition sentence), use the
    string "general". In cases where the sentence is correctly stating a well-known fact,
    like a mathematical formula, use the string "well_known_fact". In cases where the
    sentence is performing numerical reasoning (e.g. addition, multiplication), use
    the string "numerical_reasoning".
    - fully_supported: a boolean indicating whether the sentence is fully supported by
    the documents.
    - This value should reflect the conclusion you drew at the end of your step-by-step
    breakdown in explanation.
    - If supporting_sentence_keys is an empty list, then fully_supported must be false.
    - Otherwise, use fully_supported to clarify whether everything in the response
    sentence is fully supported by the document text indicated in supporting_sentence_keys
    (fully_supported = true), or whether the sentence is only partially or incompletely
    supported by that document text (fully_supported = false).
    The all_utilized_sentence_keys field is a list of all sentences keys (e.g. ’0a’) that
    were used to construct the answer. Include every sentence that either directly supported
    the answer, or was implicitly used to construct the answer, even if it was not used
    in its entirety. Omit sentences that were not used, and could have been removed from
    the documents without affecting the answer.
    You must respond with a valid JSON string. Use escapes for quotes, e.g. \\"\\", and
    newlines, e.g. \\n. Do not write anything before or after the JSON string. Do not
    wrap the JSON string in backticks like ‘‘‘ or ‘‘‘json.
    As a reminder: your task is to review the response and assess which documents contain
    useful information pertaining to the question, and how each sentence in the response
    is supported by the text in the documents.
    """.strip()


## **Building the LLMChain with Context, Query and Answer**

In [None]:
# Construct the prompt template
prompt_template_with_docs = PromptTemplate(
    input_variables=["documents", "question", "answer"],
    template=generate_prompt(),
)

# Create the LLMChain
llm_chain_with_docs = LLMChain(llm=llm, prompt=prompt_template_with_docs)

print("context >>> for llm >>>> " + context + "\n\n")
# print("query >>> for llm >>>> " + query + "\n\n")
print(f"answer >>> for llm >>>> {answer_pass_llm}\n\n")
print("----------------------------------------------\n\n")

# Run the LLMChain
try:
    print("Running the LLM with context, query, and answer...")
    answer_with_docs = llm_chain_with_docs.run(
        {"documents": context, "question": query, "answer": answer_pass_llm}
    )
    print("Generated response:\n", answer_with_docs)
    print("LLM Chain completed successfully.")
    answer_start = answer_with_docs.find("~~~~Answer:")
    if answer_start != -1:
        answer_resp = answer_with_docs[answer_start + len("~~~~Answer:"):].strip()
    else:
        answer_resp = answer_with_docs.strip()

    print("Cleaned Answer >>>>>", answer_resp)

except Exception as e:
    print("Error while running LLMChain:", e)

NameError: name 'PromptTemplate' is not defined

## **Response generation using groq using llama3-8b-8192**

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

chat = ChatGroq(temperature=0.3, groq_api_key="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", model_name="llama3-8b-8192")

prompt_template_with_docs = PromptTemplate(
    input_variables=["documents", "question", "answer"],
    template=generate_prompt(),
)

print('context for groq >>>> ', context)
print('query for groq >>>> ', query)
print('answer for groq >>>> ', groq_response)

chain = prompt_template_with_docs | chat

groq_response_with_context_qanda = chain.invoke({"documents": context, "question": query, "answer":groq_response})

print("groq_response>>>>with context, query and answer>>>>> ",groq_response_with_context_qanda)

context for groq >>>>  [-1b] Settings General System Manager Device Name Try Now You can change the name of the TV on the network.
[-1b] Settings General System Manager Device Name Try Now You can change the name of the TV on the network.
[-1b] Settings General System Manager Device Name Try Now You can change the name of the TV on the network.
[-1b] Settings General System Manager Device Name Try Now You can change the name of the TV on the network.
[-1a] Changing the name of the TV on a network.
query for groq >>>>  How do I change the name of the TV on a network?
answer for groq >>>>  content='According to the documents, to change the name of the TV on a network, you can:\n\n"Try Now You can change the name of the TV on the network."' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 35, 'prompt_tokens': 175, 'total_tokens': 210, 'completion_time': 0.029166667, 'prompt_time': 0.021095112, 'queue_time': 0.017219208, 'total_time': 0.050261779}, 'model_name':

In [None]:
def query_response_using_prompt_from_llm(query: str, prompt_template):
    chunks = retrieve_docs(query)
    # Flatten the list if necessary
    if any(isinstance(chunk, list) for chunk in chunks):
      chunks = [item for sublist in chunks for item in (sublist if isinstance(sublist, list) else [sublist])]

    context = "\n".join(chunks)

    chat = ChatGroq(temperature=0.3, groq_api_key="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", model_name="llama3-8b-8192")

    prompt_response_template= ChatPromptTemplate.from_template(
    """
    Please provide a response to the query below, strictly adhering to the
    information presented in the following documents.
    Do not generate any text beyond what is explicitly stated in the documents.

    Context: {context}

    Question: {query}

    Answer:
    """
    )

    chain = prompt_response_template | chat
    groq_response = chain.invoke({"context": context, "query": query})

    chain = prompt_template | chat
    groq_final_response = chain.invoke({"documents": context, "question": query, "answer":groq_response})


    print("groq_final_response >>  ",groq_final_response)

    return groq_final_response

## **JSON Data parsing to retrieve metrics**

In [None]:
import re
import json

In [None]:
def fix_unescaped_quotes(json_string):
    result = ""
    inside_string = False

    for i, char in enumerate(json_string):
        if char == '"':
            # Check if it's part of a JSON string value
            if not inside_string:
                inside_string = True
            elif inside_string:
                # Escape the quote if it's inside a string and not followed by a key-value structure
                if i + 1 < len(json_string) and json_string[i + 1] not in [':', ',', '}', ']']:
                    result += '\\'
                inside_string = False
        result += char

    return result

In [None]:
def extract_json_data(groq_response):
    # Extract the content field using regular expressions
    content_match = re.search(r"content='(.*?)' additional_kwargs=", str(groq_response), re.DOTALL)
    data = ""
    if content_match:
        content = content_match.group(1)
        print("Extracted Content:")
        print(content)

        json_match = re.search(r"\{.*\}", content, re.DOTALL)
        if json_match:
          json_str = json_match.group(0)

          json_str = json_str.replace("'", '"').replace("\\n","").replace("\\","")
          json_str = fix_unescaped_quotes(json_str)

          print(json_str)
          try:
            # Parse the JSON
            parsed_json = json.loads(json_str)
            print("Extracted JSON:")
            print(json.dumps(parsed_json, indent=4))
            data = parsed_json

          except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
        else:
          print("No JSON found in the provided string.")

    else:
        print("Content field not found in the provided string.")

    return data

In [None]:
data = extract_json_data(groq_response_with_context_qanda)

Extracted Content:
Here is the JSON response:\n\n```\n{\n  "relevance_explanation": "Document 1 contains useful information for answering the question. It provides the exact steps to change the name of the TV on a network.",\n  "all_relevant_sentence_keys": ["[-1a]"],\n  "overall_supported_explanation": "The response is partially supported by the documents. The response sentence is supported by sentence [-1a] in Document 1, which provides the exact steps to change the name of the TV on a network.",\n  "overall_supported": true,\n  "sentence_support_information": [\n    {\n      "response_sentence_key": "a",\n      "explanation": "The sentence is supported by sentence [-1a] in Document 1, which provides the exact steps to change the name of the TV on a network.",\n      "supporting_sentence_keys": ["[-1a]"],\n      "fully_supported": true\n    }\n  ],\n  "all_utilized_sentence_keys": ["[-1a]"]\n}\n```\n\nExplanation:\n\n* Relevance explanation: Document 1 contains useful information for

## **Computation Metrics from JSON response in comparison with ground truth**

In [None]:
import json

In [None]:
# Helper function for length computation (mocked as sentence count here)
def compute_length(keys):
    return len(keys)

# Metrics Computation
def compute_metrics(data):
    all_relevant = data["all_relevant_sentence_keys"]
    all_utilized = data["all_utilized_sentence_keys"]
    sentences_info = data["sentence_support_information"]

    # Context Relevance
    total_relevant_length = compute_length(all_relevant)
    total_context_length = total_relevant_length  # Assuming all relevant are part of the context
    context_relevance = total_relevant_length / total_context_length if total_context_length > 0 else 0

    # Context Utilization
    total_utilized_length = compute_length(all_utilized)
    context_utilization = total_utilized_length / total_context_length if total_context_length > 0 else 0

    # Completeness
    total_relevant_utilized = sum(
        1 for s in sentences_info if set(s["supporting_sentence_keys"]).intersection(all_utilized)
    )
    completeness = total_relevant_utilized / total_relevant_length if total_relevant_length > 0 else 0

    # Adherence
    adherence = all(s["fully_supported"] for s in sentences_info)

    return {
        "Context Relevance": context_relevance,
        "Context Utilization": context_utilization,
        "Completeness": completeness,
        "Adherence": adherence
    }

In [None]:
# Compute and print metrics
predicted_metrics = compute_metrics(data)
print(json.dumps(predicted_metrics, indent=4))

{
    "Context Relevance": 1.0,
    "Context Utilization": 1.0,
    "Completeness": 1.0,
    "Adherence": true
}


## **Fetching the groud truth values**

In [None]:
sub_dataset = load_dataset("rungalileo/ragbench", "emanual")

In [None]:
def fetch_ground_truth(dataset, question):
    for sample in dataset["train"]:  # Change "train" to the correct split if needed
        if sample["question"] == question:
            return {
                "Context Relevance": sample["relevance_score"],  # Adjust column name if needed
                "Context Utilization": sample["utilization_score"],  # Adjust column name if needed
                "Adherence": sample["adherence_score"]  # Adjust column name if needed
            }
    return None


In [None]:
ground_truth = fetch_ground_truth(sub_dataset, query)

if ground_truth:
    print("Ground Truth Values:")
    print(json.dumps(ground_truth, indent=4))
else:
    print(f"Question not found in the dataset: {query}")

Ground Truth Values:
{
    "Context Relevance": 0.047619047619047616,
    "Context Utilization": 0.047619047619047616,
    "Adherence": true
}


In [None]:
## DONOT RUN THIS
ground_truth = """{
    "Context Relevance": 0.6470588235294118,
    "Context Utilization": 0.35294117647058826,
    "Adherence": false
}"""

In [None]:
## DONOT RUN THIS
predicted_metrics = """{
    "Context Relevance": 1.0,
    "Context Utilization": 0.6,
    "Completeness": 1.0,
    "Adherence": true
}"""

## **Evaluation Metrics**

In [None]:
from sklearn.metrics import mean_squared_error, roc_auc_score
import numpy as np
import json

In [None]:
def compute_evaluation_metrics(predicted, ground_truth):
  # Convert JSON strings to Python dictionaries
  # ground_truth = json.loads(ground_truth)
  # predicted_metrics = json.loads(predicted)

  y_true_relevance = ground_truth["Context Relevance"]
  y_true_utilization = ground_truth["Context Utilization"]
  y_true_adherence = int(ground_truth["Adherence"])

  y_pred_relevance = predicted_metrics["Context Relevance"]
  y_pred_utilization = predicted_metrics["Context Utilization"]
  y_pred_adherence = int(predicted_metrics["Adherence"])

  # Compute RMSE for Context Relevance and Context Utilization
  rmse_relevance = np.sqrt((y_pred_relevance - y_true_relevance) ** 2)
  rmse_utilization = np.sqrt((y_pred_utilization - y_true_utilization) ** 2)

  return {
        "RMSE-Relevance": rmse_relevance,
        "RMSE-Utililization": rmse_utilization
    }

In [None]:
evaluation_metrics = compute_evaluation_metrics(predicted_metrics, ground_truth)

In [None]:
# Print Results
print("Ground Truth Values (JSON):")
print(json.dumps(ground_truth, indent=4))
print("\nPredicted Metrics:")
print(json.dumps(predicted_metrics, indent=4))
print("\nEvaluation Metrics (RMSE and AUC-ROC):")
print(json.dumps(evaluation_metrics, indent=4))

Ground Truth Values (JSON):
{
    "Context Relevance": 0.047619047619047616,
    "Context Utilization": 0.047619047619047616,
    "Adherence": true
}

Predicted Metrics:
{
    "Context Relevance": 1.0,
    "Context Utilization": 1.0,
    "Completeness": 1.0,
    "Adherence": true
}

Evaluation Metrics (RMSE and AUC-ROC):
{
    "RMSE-Relevance": 0.9523809523809523,
    "RMSE-Utililization": 0.9523809523809523
}


## **AUC-ROC**

In [None]:
questions = ["How to configure the IPv6 connection settings?", "Where do I find factory reset option?",
             "How do I access the main accessibility menu to change Voice Guide settings?","How do I use the accessibility Shortcuts menu?"]

In [None]:
def get_predicted_adherence_values_for_questions(questions, prompt_template):

  adherence_set = []
  for query in questions:
    groq_final_response = query_response_using_prompt_from_llm(query,prompt_template)
    json_data = extract_json_data(groq_final_response)
    metrics = compute_metrics(json_data)
    adherence_set.append(metrics["Adherence"])

  return adherence_set

In [None]:
def get_ground_truth_adherence_values_for_questions(questions, dataset):

  adherence_set = []
  for query in questions:
    metrics = fetch_ground_truth(dataset, query)
    print(json.dumps(metrics, indent=4))
    adherence_set.append(metrics["Adherence"])

  return adherence_set

In [None]:
def get_auc_roc_score(predicted_set, ground_tructh_set):
  mapping = {True: 1, False: 0}
  y_true_numeric_alt = [mapping[val] for val in ground_tructh_set]
  y_pred_numeric_alt = [mapping[val] for val in predicted_set]

  auc_roc = roc_auc_score(y_true_numeric_alt, y_pred_numeric_alt)
  print("AUC-ROC:", auc_roc)
  return auc_roc

In [None]:
prompt_template_with_docs = PromptTemplate(
    input_variables=["documents", "question", "answer"],
    template=generate_prompt(),
)

In [None]:
predicted_set = get_predicted_adherence_values_for_questions(questions, prompt_template_with_docs)
print(predicted_set)

groq_final_response >>   content='Here is the JSON response:\n\n```\n{\n  "relevance_explanation": "The documents do not contain any useful information for answering the question. The documents only consist of repeated references to setting up an Internet connection over IPv6, but do not provide any specific instructions or details on how to configure the IPv6 connection settings.",\n  "all_relevant_sentence_keys": [],\n  "overall_supported_explanation": "The response is not supported by the documents. The response correctly states that there is no information provided in the given context to answer the question, but this is not supported by the documents as they do not provide any specific instructions or details on how to configure the IPv6 connection settings.",\n  "overall_supported": false,\n  "sentence_support_information": [\n    {\n      "response_sentence_key": "a",\n      "explanation": "The sentence is supported by the fact that the documents do not provide any specific inst

In [None]:
dataset = load_dataset("rungalileo/ragbench", "emanual")
ground_tructh_set = get_ground_truth_adherence_values_for_questions(questions, dataset)
print(ground_tructh_set)

{
    "Context Relevance": 0.03125,
    "Context Utilization": 0.03125,
    "Adherence": true
}
{
    "Context Relevance": 0.17391304347826086,
    "Context Utilization": 0.043478260869565216,
    "Adherence": false
}
{
    "Context Relevance": 0.13953488372093023,
    "Context Utilization": 0.13953488372093023,
    "Adherence": true
}
{
    "Context Relevance": 0.20588235294117646,
    "Context Utilization": 0.3235294117647059,
    "Adherence": true
}
[True, False, True, True]


In [None]:
auc_roc = get_auc_roc_score(predicted_set, ground_tructh_set)

AUC-ROC: 0.3333333333333333
