# Multimodal RAG Evaluation: `ROUGE`, `Cosine` & `LLM as a Judge`

***This notebook works best with the `conda_python3` kernel on an `ml.t3.large` instance***.

---

This notebook does the following:

1. Uses the CSV file generated in the previous notebook to run evaluations on each response from the `image`, `text`, and `combined` indexes.

1. Records the `ROUGE` and `Cosine Similarity` scores. For subjective evaluation, this notebook uses an `LLM as a judge` (in this case, Claude 3 Sonnet) in the loop to select the answer that best matches the `target response`, given the `questions` provided by the user.

1. Records the results for all kinds of responses from `text only index`, `image only index`, and `combined` (from both the text as well as the image index) from `OpenSearch`

1. Uses `litellm` for interfacing with Bedrock

## Step 1. Setup

Install the required Python packages and import the relevant files.

In [None]:
# Install all the requirements
import sys
!{sys.executable} -m pip install -r requirements.txt

In [None]:
# import the libraries that are needed to run this notebook
import os
import ray
import time
import glob
import json
import yaml
import time
import boto3
import botocore
import logging
import globals as g
import pandas as pd
from numpy import dot
from pathlib import Path
from numpy.linalg import norm
from litellm import completion ## support for text generation models on bedrock
from rouge_score import rouge_scorer
from typing import List, Dict, Optional
from utils import load_and_merge_configs
from bedrock_utils import get_bedrock_client

In [None]:
# Configure logging for the notebook: each record shows timestamp, process id, file:line, level and message.
logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
# Module-level logger used by the cells below.
logger = logging.getLogger(__name__)

In [None]:
# Shut down a Ray runtime that is already initialized in this process before
# calling ray.init() again (e.g. when this cell is re-run).
if ray.is_initialized():
    ray.shutdown()
ray.init()

In [None]:
# load the merged config file - user config file, and parent config file
# (the two file locations come from the `globals` module imported as `g`)
config = load_and_merge_configs(g.CONFIG_SUBSET_FILE, g.FULL_CONFIG_FILE)
# log the complete merged configuration as pretty-printed JSON
logger.info(f"config file -> {json.dumps(config, indent=2)}")

### Get all the responses from the evaluation dataset in a dataframe to calculate `ROUGE`, `Cosine Similarity` and `LLM as a Judge` evaluation metrics
---

In [None]:
# Name of the dataframe column that holds the user question (config: eval_qna_dataset_info.question_key)
QUERY_COL: str = config['eval_qna_dataset_info']['question_key']
# Region of the default boto3 session; used further down to format the Bedrock endpoint URL
region = boto3.Session().region_name 
# display the configured question column name as the cell output
QUERY_COL

In [None]:
# if the metrics path is not none, then we get the updated evaluation file with responses to
# user provided questions from the text/image/combined indexes
eval_df: Optional[pd.DataFrame] = None
metrics_dir_name = config['dir_info']['metrics_dir_name']
if metrics_dir_name is not None:
    metrics_fpath: str = os.path.join(metrics_dir_name, config['eval_qna_dataset_info']['updated_eval_file'])
    metric_files = glob.glob(metrics_fpath, recursive=True)
    logger.info(f"there are {len(metric_files)} files in {metrics_fpath}")
    if len(metric_files) > 1:
        # every match overwrites eval_df, so only the last matching file is kept
        logger.warning(f"{len(metric_files)} files match {metrics_fpath}, only the last one read is used")
    for file in metric_files:
        eval_df = pd.read_csv(file)
        # drop columns that are not needed
        eval_df = eval_df.loc[:, ~eval_df.columns.str.startswith('Unnamed')]
# Fail here with an explicit message instead of a NameError on the next line when
# no metrics directory is configured or no evaluation file was found.
if eval_df is None:
    raise FileNotFoundError(
        f"no evaluation file could be loaded (metrics_dir_name={metrics_dir_name}, "
        f"updated_eval_file={config['eval_qna_dataset_info']['updated_eval_file']})"
    )
eval_df.head(10)

### Calculate the `ROUGE` & `Cosine Similarity` Scores for completions:
---

Here, the amazon.titan-embed-text-v1 is used to get the embeddings of texts. To use a different embeddings model, change the model in the embeddings_model_info and modify this function.

In [None]:
# Upper bound on the number of characters sent to the embeddings model (None means no truncation)
MAX_TEXT_LEN_FOR_EMBEDDING: int = config['model_info']['embeddings_model_info'].get('max_text_len_for_embedding')
# Bedrock client shared by all get_embedding calls; created on first use
bedrock: Optional[get_bedrock_client] = None

def get_embedding(text: str, modelId: str=config['model_info']['embeddings_model_info'].get('model_id'), accept: str='application/json', contentType: str='application/json'):
    """
    Embed `text` with the configured Bedrock embeddings model.

    The text is truncated to MAX_TEXT_LEN_FOR_EMBEDDING characters before it is sent.
    Returns a tuple of (embedding, input token count) taken from the `embedding` and
    `inputTextTokenCount` fields of the model response.
    """
    global bedrock
    # lazily create the client the first time an embedding is requested
    if bedrock is None:
        bedrock = get_bedrock_client()
    request_body = json.dumps({"inputText": text[:MAX_TEXT_LEN_FOR_EMBEDDING]})
    raw_response = bedrock.invoke_model(body=request_body, modelId=modelId, accept=accept, contentType=contentType)
    payload = json.loads(raw_response.get('body').read())
    return payload.get('embedding'), payload.get('inputTextTokenCount')

def get_cosine_similarity(text1: str, text2: str) -> float:
    """
    Embed both texts with `get_embedding` and return the cosine similarity
    of the two embedding vectors.
    """
    first_vec, _first_tokens = get_embedding(text1)
    second_vec, _second_tokens = get_embedding(text2)
    magnitude_product = norm(first_vec) * norm(second_vec)
    return dot(first_vec, second_vec) / magnitude_product

def get_rouge_l_score(completion: str, golden: str) -> float:
    """
    Score `completion` against the reference text `golden` using the ROUGE variant
    named by `rouge_metric_selection` in the config, and return its F-measure
    rounded to 4 decimal places.
    """
    metric_name: str = config['model_info']['embeddings_model_info'].get('rouge_metric_selection')
    metric_result = rouge_scorer.RougeScorer([metric_name]).score(golden, completion)[metric_name]
    return round(metric_result.fmeasure, 4)

In [None]:

# target_response_key is the target response column name, if any in the evaluation dataset that the user provides
target_response_key: str = config['eval_qna_dataset_info']['target_response_key']
def compare_completions(row, index_type):
    """
    Compute the ROUGE and cosine similarity scores between the response stored in
    column `index_type` of `row` and the row's target response.

    Returns a two-element pd.Series `[rouge_score, cosine_similarity]`. When either
    the response or the target response is missing (None/NaN) or the response is
    empty, both elements are NaN, so that every row yields a Series of the same
    length and the result of `DataFrame.apply` can be assigned to two columns.
    """
    candidate = row.get(index_type)
    target = row.get(target_response_key)
    # pd.notna covers both None and NaN; bool(candidate) additionally rejects an empty response
    if pd.notna(candidate) and pd.notna(target) and bool(candidate):
        # coerce to str so a non-string cell (e.g. a number read from the CSV) does not break .lower()
        candidate, target = str(candidate), str(target)
        logger.info(f"{index_type} response: {candidate}, Target response: {target}")
        rouge_l_score = get_rouge_l_score(candidate, target)
        cosine_sim = get_cosine_similarity(candidate.lower(), target.lower())
        return pd.Series([rouge_l_score, cosine_sim])
    logger.info(f"ROUGE and Cosine similarity scores cannot be computed for {index_type} since the response or the target response is not provided in the dataset")
    return pd.Series([float('nan'), float('nan')])

# Score every index listed in g.QUALITATIVE_METRICS_LIST against the target responses,
# writing one ROUGE column and one cosine similarity column per index.
if target_response_key not in eval_df.columns:
    logger.info('No evaluation metrics available since target responses are not provided in the dataset.')
else:
    for metric in g.QUALITATIVE_METRICS_LIST:
        score_columns = [f'{metric}_rouge_l_f1_score', f'{metric}_cosine_similarity']
        eval_df[score_columns] = eval_df.apply(lambda row: compare_completions(row, index_type=metric), axis=1)

In [None]:
# When target responses are present, give the per-index response columns their final
# names and remove the source columns, then persist the scored dataset.
if config['eval_qna_dataset_info']['target_response_key'] in eval_df.columns:
    eval_df = eval_df.rename(columns={'text_response': 'text_only_response', 'img_response': 'img_only_response'})
    # errors='ignore' keeps this cell re-runnable: on a second run (or when a source
    # column is absent from the dataset) the columns are already gone and a plain
    # drop would raise KeyError
    eval_df = eval_df.drop(columns=['image_and_text_source', 'text_source', 'img_source'], errors='ignore')
# Construct the file path
metrics_dir: str = config['dir_info']['metrics_dir_name']
rouge_cosine_file_path = os.path.join(metrics_dir, config['dir_info']['eval_score_dataset'])
eval_df.to_csv(rouge_cosine_file_path, index=False)
eval_df.head(10)

### Use `LLM as a Judge` to evaluate responses from different indexes 

### Evaluation: Using LLM as a Judge in the loop
---

Responses generated by each index (and combined responses from both indexes) are evaluated on relevance and best match (to ground truth) by _Claude Sonnet/your model of choice_. The prompt for the model that acts as a judge in the loop can be viewed in [`eval_template.txt`](prompt_templates/eval_template.txt). Edit and review this prompt based on the use case and criteria for subjective evaluation.

The role of the model acting as a judge is to compare the responses generated by each index to a target response and check which index gives a response that best matches the `target response` given the question. It provides information on the `best selected response`, the `response source` (the text only index, the image only index, or both indexes), and an `explanation` of its selection, with an in-depth comparison against the other responses and the reason it chose the one it did.

A final evaluation metric is calculated that shows the distribution of the selected responses and their respective response sources. This will give a judgement call of which multimodal strategy to use in production ready workloads.

Note: For more information on the use of having a Model act as a judge, view: https://huggingface.co/learn/cookbook/en/llm_judge

#### Prepare the evaluation prompt payloads

Here, the evaluation prompt template is used by the LLM judge to compare the candidate responses from the different indexes and select the one that best matches the target response, based on the evaluation criteria mentioned in the prompt template.

In [None]:
def prepare_eval_prompts(row):
    """
    Build the LLM-judge evaluation prompt for one row of the evaluation dataframe.

    Every column whose name ends with "_response" (other than the target response
    column) is treated as a candidate response and wrapped in a tag named after the
    part of the column name before "_response". The question, the target response
    and the candidate responses are then substituted into the evaluation prompt
    template read from the configured prompt directory.

    Returns the formatted prompt string.
    Raises FileNotFoundError if the evaluation prompt template does not exist.
    """
    # file path to the eval template used by the model judge
    template_path = Path(os.path.join(config['dir_info']['prompt_dir'], config['dir_info']['eval_prompt_template']))
    try:
        eval_template: str = template_path.read_text()
    except FileNotFoundError:
        # report the path that was looked up and stop: there is nothing to format without a template
        logger.error(f"Error: Evaluation template not found at {template_path}")
        raise
    logger.info(f"question: {QUERY_COL}")
    logger.info(f"original_response: {target_response_key}")
    candidate_responses: List[str] = []
    for column in row.index:
        if column.endswith("_response") and column != target_response_key:
            response_source = column.split("_response")[0]
            candidate_response = row[column]
            candidate_responses.append(f"\n<{response_source}>\n{candidate_response}\n</{response_source}>\n")
    return eval_template.format(
        question=row[QUERY_COL],
        original_response=row[target_response_key],
        candidate_responses="\n".join(candidate_responses)
    )

#### Add evaluation prompt as a column into a df with respective response index sources and responses to send into the Model for further evaluation in the loop

In [None]:
# Add the judge prompt and a query id to every row and persist the result.
llm_as_a_judge_eval_df_f_path = os.path.join(metrics_dir, config['dir_info']['processed_prompts_for_eval'])
if eval_df is not None:
    logger.info("preparing the evaluation prompt templates for the LLM judge....")
    eval_df['eval_prompt'] = eval_df.apply(lambda r: prepare_eval_prompts(r), axis=1)
    # DataFrame.insert raises ValueError when the column already exists, so guard it
    # to keep this cell re-runnable
    if 'query_id' not in eval_df.columns:
        eval_df.insert(0, 'query_id', eval_df.index)
    eval_df.to_csv(llm_as_a_judge_eval_df_f_path, index=False)
else:
    # nothing to write or display when there is no dataset
    logger.error(f"Model evaluation dataset is not available to process.")
eval_df.head(10) if eval_df is not None else None

### Using `LLM (Claude) as a judge` in the loop to get a judgement on which index gives the best response to the question (`image only/text only/image and text indexes combined`)

In [None]:
def llm_judge_json_evaluations(model_id: str, prompt: str):
    """
    Send `prompt` to the Bedrock model `model_id` through the litellm `completion` API.

    Returns a dict with the prompt, the model id, the completion text, the prompt and
    completion token counts, and `exception` (the exception object if the call failed,
    otherwise None). On failure the completion and token counts stay None; the
    exception is logged and stored, not raised. The `question`, `target_response` and
    price entries are initialized to None and left for the caller to fill in.
    """
    # litellm addresses Bedrock models as "bedrock/<model id>"
    service_name: str = "bedrock"
    bedrock_model: str = f"{service_name}/{model_id}"
    # represents the current aws region
    aws_region = boto3.Session().region_name
    # initialize the response dict
    ret = dict(exception = None,
               prompt = prompt,
               completion = None,
               question = None,
               target_response = None,
               completion_token_count = None,
               prompt_token_count= None,
               input_token_price = None,
               output_token_price = None,
               model_id = model_id)
    # os.environ only accepts strings and region_name is None when no region is configured
    if aws_region is not None:
        os.environ["AWS_REGION_NAME"] = aws_region
    else:
        logger.warning("no AWS region configured for the boto3 session, AWS_REGION_NAME is not set")
    inference_parameters = config['inference_parameters']
    temperature = inference_parameters.get('temperature', 0.1)
    # fall back to the same token budget as the final summary call when none is configured
    max_tokens = inference_parameters.get('max_tokens', 2000)
    try:
        logger.info(f"Invoking {bedrock_model}......")
        response = completion(model=bedrock_model,
                              messages=[{ "content": prompt,"role": "user"}],
                              temperature=temperature,
                              max_tokens=max_tokens)
        # keep the content of the last choice that carries a non-empty message
        for choice in response.choices:
            if choice.message and choice.message.content:
                ret["completion"] = choice.message.content.strip()
        # number of prompt and completion tokens reported by litellm
        ret['prompt_token_count'] = response.usage.prompt_tokens
        ret['completion_token_count'] = response.usage.completion_tokens
    except Exception as e:
        logger.error(f"Exception occurred during invoking {model_id}, exception={e}")
        ret['exception'] = e
    logger.info(f"completion: {ret['completion']}")
    return ret

In [None]:
def get_inference(i: int, row: Dict, total: int, judge_model_info: Dict) -> Dict:
    """
    Run the judge model on one evaluation record and persist the result.

    `row` must contain the `eval_prompt`, `query_id`, question and target response
    entries. `i` and `total` are only used for progress logging. `judge_model_info`
    supplies `model_id`, `input_tokens_price` and `output_tokens_price` (the prices
    are applied per 1000 tokens).

    The response dict returned by `llm_judge_json_evaluations` is extended with the
    question, the target response and the token prices, written as JSON under
    <judge_model_eval_completions>/<query_id>/<model id>/ and returned.
    """
    model_id = judge_model_info['model_id']
    logger.debug(f"row={row}")
    logger.info(f"row {i}/{total}, prompt_template={config['dir_info']['eval_prompt_template']}, model_id={model_id}")
    # ask the judge model to evaluate the candidate responses in the prepared prompt
    resp: Dict = llm_judge_json_evaluations(model_id, row['eval_prompt'])
    resp[QUERY_COL] = row[QUERY_COL]
    resp[target_response_key] = row[target_response_key]
    # Token counts stay None when the judge call failed; leave the prices as None in
    # that case instead of raising TypeError on None/1000 (which would abort the batch).
    prompt_tokens = resp['prompt_token_count']
    completion_tokens = resp['completion_token_count']
    if prompt_tokens is not None:
        resp['input_token_price'] = (prompt_tokens/1000) * judge_model_info['input_tokens_price']
    if completion_tokens is not None:
        resp['output_token_price'] = (completion_tokens/1000) * judge_model_info['output_tokens_price']
    # ":" in the model id is replaced so that the id can be used as a directory name
    dir_path = os.path.join(config['dir_info']['judge_model_eval_completions'], str(row['query_id']), model_id.replace(":", "-"))
    os.makedirs(dir_path, exist_ok=True)
    fpath = os.path.join(dir_path, f"model_evaluation_{row['query_id']}.json")
    logger.info(f"writing response={resp} to {fpath}")
    # default=str serializes values json cannot handle natively (e.g. a stored exception)
    Path(fpath).write_text(json.dumps(resp, default=str, indent=2))
    logger.info(f"response {i}: {resp}")
    return resp

@ray.remote
def async_get_inference(i: int, row: Dict, total: int, model_info: Dict) -> Dict:
    """
    Ray remote wrapper around `get_inference`, so that records can be submitted with
    `.remote(...)` and their results collected with `ray.get`.

    All arguments are forwarded to `get_inference` unchanged (`model_info` is passed
    as its `judge_model_info` parameter) and its return value is returned as-is.
    """
    # configure logging inside the task with the same format the notebook uses
    logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
    # NOTE(review): this binds a local `logger` that is not passed on; `get_inference`
    # refers to the module-level `logger` name instead.
    logger = logging.getLogger(__name__)
    return get_inference(i, row, total, model_info)

In [None]:
# Run the judge model over every evaluation record, `n` records at a time in parallel.
eval_records_list = json.loads(eval_df.to_json(orient='records'))
n: int = config['inference_info']['parallel_inference_count']
if n < 1:
    raise ValueError(f"parallel_inference_count must be >= 1, got {n}")
resp_list: List = []
judge_model_info = config['model_info']['eval_model_info']
total_records: int = len(eval_records_list)
st = time.perf_counter()
logger.info(f"------ running inference for {judge_model_info.get('model_id')} -----")
# consecutive slices of at most n records each
list_of_lists = [eval_records_list[start:start + n] for start in range(0, total_records, n)]
logger.info(f"split input list of size {total_records} into {len(list_of_lists)} lists")
for idx, batch in enumerate(list_of_lists):
    logger.info(f"getting inference for list {idx+1}/{len(list_of_lists)}, size of list={len(batch)} ")
    # number the rows across the whole dataset so the "row i/total" progress log is meaningful
    pending = [async_get_inference.remote(idx * n + i + 1, e, total_records, judge_model_info) for i, e in enumerate(batch)]
    # wait for the whole batch before submitting the next one
    resp_list.extend(ray.get(pending))
elapsed_time = time.perf_counter() - st
logger.info(f"------ model={judge_model_info.get('model_id')} completed in {elapsed_time} ------ ")

### Visualize `LLM as a judge` completions and get more evaluation metrics

In [None]:
## Collect every JSON completion file written by the LLM judge
judge_completions_dir = config['dir_info']['judge_model_eval_completions']
fpath_evaluated_files = os.path.join(judge_completions_dir, "**", "*", "*.json")
eval_metric_files = glob.glob(fpath_evaluated_files, recursive=True)
judge_model_id = config['model_info']['eval_model_info'].get('model_id')
logger.info(f"there are {len(eval_metric_files)} evaluated files by {judge_model_id} LLM judge in {fpath_evaluated_files}")

In [None]:
# parse every judge completion file into a dict
model_evaluation_responses = [json.loads(Path(f).read_text()) for f in eval_metric_files]
# results_df holds the evaluation responses (including the completion and the model id),
# without the exception and prompt fields
results_df = pd.DataFrame(model_evaluation_responses).drop(columns=['exception', 'prompt'])
results_df.head(10)

In [None]:
def parse_as_json(x: str) -> Optional[Dict]:
    """
    Parse a string as JSON after stripping newline and tab characters,
    which could otherwise break the parsing.

    Returns the parsed value, or None (after printing a message) if the
    input cannot be processed or is not valid JSON.
    """
    try:
        x = x.replace("\n", "").replace("\t", "")
        return json.loads(x)
    except Exception:
        print(f"parse_as_json, error parsing string as json, string={x}")
        return None

In [None]:
def tidy_split(df, column, sep=',', keep=False):
    """
    Expand a delimited column so that each split value gets its own row.

    Rows where `column` is missing are dropped first. The remaining values are
    converted to strings and split on `sep`; every resulting piece becomes a row
    that repeats the other columns (and the index label) of the row it came from.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        when True, a value that splits into more than one piece is also kept,
        unsplit, as its own row ahead of its pieces

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same columns as `df`; `df` itself is not modified.
    """
    present = df.dropna(subset=[column])
    row_positions = []
    split_values = []
    for position, raw_value in enumerate(present[column].astype(str)):
        pieces = raw_value.split(sep)
        # optionally lead with the unsplit value when it really was split
        expanded = ([raw_value] + pieces) if (keep and len(pieces) > 1) else pieces
        row_positions.extend([position] * len(expanded))
        split_values.extend(expanded)
    expanded_df = present.iloc[row_positions, :].copy()
    expanded_df[column] = split_values
    return expanded_df

In [None]:
# Expand the judge's JSON completion into one column per key, and put the question
# and the target response in front.
new_results_df = results_df['completion'].apply(parse_as_json).apply(pd.Series)
new_results_df.insert(0, QUERY_COL, results_df[QUERY_COL])
new_results_df.insert(1, target_response_key, results_df[target_response_key])
# One row per selected source when the judge named several comma separated sources.
new_exploded_df = tidy_split(new_results_df, 'response_source', sep=',')
# Strip the angle brackets from the exploded values themselves. Assigning from
# new_results_df here would align on the (repeated) index and overwrite every split
# value with the original unsplit string.
new_exploded_df['response_source'] = (
    new_exploded_df['response_source']
    .str.replace('<', '', regex=False)
    .str.replace('>', '', regex=False)
)
logger.info(f"All evaluation data is read into a dataframe of shape {results_df.shape}")
processed_prompts_for_eval_path = os.path.join(metrics_dir, config['dir_info']['llm_as_a_judge_completions'])
new_results_df.to_csv(processed_prompts_for_eval_path, index=False)
# display the question, the target response and the judge's parsed output side by side
new_results_df.head(10)

In [None]:
# Compute the percentage of each model selection and reset the index
new_exploded_df['response_source'] = new_exploded_df['response_source'].str.strip()
# Name both output columns explicitly. The column names produced by
# value_counts().reset_index() differ between pandas 1.x ('index', 'response_source')
# and pandas 2.x ('response_source', 'proportion'), so renaming 'response_source'
# afterwards would hit the label column on 2.x and `*= 100` would repeat strings.
# 'index' is kept as the label column name, matching the pandas 1.x output.
response_index_percentage_df = (
    new_exploded_df['response_source']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('index')
    .reset_index(name='index_pick_rate')
)
response_distribution_fpath = os.path.join(metrics_dir, config['dir_info']['index_response_distribution'])
response_index_percentage_df.to_csv(response_distribution_fpath, index=False)
response_index_percentage_df.head(10)

### Getting a final summary of which strategy to use for this solution
---

This portion gets all of the LLM explanations and best match responses to user question, the LLM pick rate, and gives a final summary of which strategy to use for this solution (`combined`, `text_only` or `image_only` or all three)

In [None]:
# simple function to get a final summary on all of the data provided from LLM as a judge
def final_analysis_summary(bedrock: "botocore.client.BaseClient",
                           prompt: str,
                           modelID: str) -> Optional[str]:
    """
    Send `prompt` as a single user message to the Bedrock model `modelID`, using the
    Anthropic messages request format, and return the text of the first content block
    of the response.

    Double quotes in the returned text are replaced with single quotes.
    Returns None if the invocation or the response parsing fails; the exception is
    logged, not raised.
    """
    body = json.dumps(
    {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 2000,
        "temperature": 0.1,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                ],
            }
        ],
    })

    try:
        response = bedrock.invoke_model(
            modelId=modelID,
            body=body)
        response_body = json.loads(response['body'].read().decode("utf-8"))
        llm_response = response_body['content'][0]['text'].replace('"', "'")
    except Exception as e:
        logger.error(f"exception={e}")
        llm_response = None
    return llm_response

In [None]:
new_results_df

In [None]:
# Write one "question / selected index / explanation" entry per judged record
ALL_EXPLANATIONS_FPATH: str = os.path.join(metrics_dir, config['dir_info']['all_explanations'])
explanation_entries = [
    f"Question: {row[QUERY_COL]}\nSelected Index: {row['response_source']}\nExplanation: {row['explanation']}\n\n"
    for _, row in new_results_df.iterrows()
]
Path(ALL_EXPLANATIONS_FPATH).write_text("".join(explanation_entries))

# Read the content back to use as analysis context
analysis_context = Path(ALL_EXPLANATIONS_FPATH).read_text()
print(analysis_context)

In [None]:
# open the prompt template and fill in the collected explanations as context
summary_template_path = Path(os.path.join(config['dir_info']['prompt_dir'], config['dir_info']['final_llm_as_a_judge_summary_analysis']))
processed_summary_eval_prompt: str = summary_template_path.read_text().format(context=analysis_context)
# bedrock runtime client for the region of the current session
endpoint_url: str = g.BEDROCK_EP_URL.format(region=region)
bedrock = boto3.client(service_name="bedrock-runtime", endpoint_url=endpoint_url)
# ask the configured summarizer model for the final analysis
summarizer_model_id = config['model_info']['final_analysis_llm_summarizer'].get('model_id')
final_analysis: str = final_analysis_summary(bedrock, prompt=processed_summary_eval_prompt, modelID=summarizer_model_id)

In [None]:
print(final_analysis)

In [None]:
FINAL_SUMMARY_ANALYSIS: str = os.path.join(metrics_dir, config['dir_info']['final_summary_analysis'])
# final_analysis_summary returns None when the Bedrock call fails; concatenating
# None with "\n" would raise TypeError, so only write when there is a summary
if final_analysis is None:
    logger.error(f"no final analysis was generated, nothing written to {FINAL_SUMMARY_ANALYSIS}")
else:
    Path(FINAL_SUMMARY_ANALYSIS).write_text(final_analysis + "\n")