# Evaluating Responses from Image, Text, and Combined Indexes --> WIP

***This notebook works best with the `conda_python3` kernel on an `ml.t3.large` instance***.

---

This notebook does the following:

1. Use the CSV file generated in the previous notebook to evaluate the quality of each response, based on the context and on the target responses that are provided

1. Record the ROUGE, cosine similarity, and BLEU scores and, for subjective evaluation, use an `LLM as a judge` (in this case, Claude 3 Sonnet)

1. Record the results for all kinds of responses from text only index, image only index, and combined (from both the text as well as the image index) from `OpenSearch`

## Step 1. Setup

Install the required Python packages and import the relevant files.

In [None]:
# Install the packages listed in requirements.txt using the Python interpreter
# that runs this kernel (`sys.executable`), so they land in the kernel's environment.
import sys
!{sys.executable} -m pip install -r requirements.txt

In [None]:
# import the libraries that are needed to run this notebook
import os
import re
import ray
import time
import glob
import json
import yaml
import time
import nltk
import boto3
import codecs
import base64
import logging
import requests
import botocore
import sagemaker
import numpy as np
import pandas as pd
import globals as g
from numpy import dot
from pathlib import Path
from nltk.tag import pos_tag
from typing import List, Dict
from numpy.linalg import norm
from rouge_score import rouge_scorer
from nltk.tokenize import word_tokenize
from bedrock_utils import get_bedrock_client
from requests_auth_aws_sigv4 import AWSSigV4
from nltk.translate.bleu_score import sentence_bleu
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from utils import get_cfn_outputs, get_bucket_name, download_image_files_from_s3, get_text_embedding

In [None]:
# Configure logging for the notebook: each record shows the timestamp, process id,
# source file and line, level, and message. INFO and above are emitted.
_log_format = '[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_log_format)
logger = logging.getLogger(__name__)

In [None]:
# Start from a clean Ray runtime: if a runtime is already initialized in this
# kernel (e.g. the cell is being re-run), shut it down before initializing again.
if ray.is_initialized():
    ray.shutdown()
ray.init()

In [None]:
# Location of the YAML configuration that drives this notebook
CONFIG_FILE_PATH = "config.yaml"
fpath = CONFIG_FILE_PATH

# Parse the YAML configuration into a plain Python object
with open(fpath, mode='r') as config_stream:
    config = yaml.safe_load(config_stream)

# Log the parsed configuration so the run records the settings it used
logger.info(f"config read from {fpath} -> {json.dumps(config, indent=2)}")

In [None]:
# Print the contents of globals.py with syntax highlighting (shell command via `pygmentize`).
!pygmentize globals.py

### Get all the responses from the evaluation dataset in a dataframe to calculate ROUGE/COSINE and `LLM as a Judge` evaluation metrics

Here, we load the responses fetched from the text index and the image index into a dataframe; this question bank acts as the curated evaluation dataset

In [None]:
# Load the evaluation dataset (a CSV file in the configured metrics directory) into `eval_df`.
metrics_dir_name = config['metrics_dir']['dir_name']
if metrics_dir_name is None:
    # Without a directory there is nothing to evaluate; fail here with a clear message
    # instead of hitting a NameError on `eval_df` further down.
    raise ValueError("config['metrics_dir']['dir_name'] is not set; cannot locate the evaluation CSV files")

metrics_fpath = os.path.join(metrics_dir_name, "*.csv")
# This notebook later writes its own scores file into the same directory; skip it so
# that re-running the notebook does not load its previous output as the input dataset.
scores_file_name = config['metrics_dir'].get('eval_score_dataset')
metric_files = [
    candidate for candidate in glob.glob(metrics_fpath, recursive=True)
    if os.path.basename(candidate) != scores_file_name
]
logger.info(f"there are {len(metric_files)} files in {metrics_fpath}")
if not metric_files:
    raise FileNotFoundError(f"no evaluation CSV files found in {metrics_fpath}")

# When several files match, the last one listed is used (only that file is read).
eval_file = metric_files[-1]
if len(metric_files) > 1:
    logger.warning(f"multiple files found in {metrics_fpath}, using {eval_file}")
eval_df = pd.read_csv(eval_file)
# Remove the 'Unnamed...' columns that appear when a CSV was saved together with its index.
eval_df = eval_df.loc[:, ~eval_df.columns.str.startswith('Unnamed')]

eval_df.head(10)

### ROUGE & Cosine Similarity Scores for multimodal completions:
---

Here, the `amazon.titan-embed-text-v1` model is used to get the text embeddings. To use a different embeddings model, change the model in `embeddings_model_info` in the config file and modify the `get_embedding` function accordingly.

In [None]:
from typing import Optional
# Upper bound on the length of the text sent to the embeddings model; get_embedding()
# slices its input to this many characters.
MAX_TEXT_LEN_FOR_EMBEDDING: int = config['embeddings_model_info']['max_text_len_for_embedding']
# Bedrock client, created on first use by get_embedding(). Annotated as Optional[object]
# because get_bedrock_client is a factory function, not a type.
bedrock: Optional[object] = None

def get_embedding(text: str, modelId: str=config['embeddings_model_info'].get('model'), accept: str='application/json', contentType: str='application/json'):
    """
    Embed a piece of text with the configured Bedrock embeddings model.

    The text is cut to at most MAX_TEXT_LEN_FOR_EMBEDDING characters before it is sent.
    The Bedrock client is created on the first call and kept in the module-level
    `bedrock` variable for subsequent calls.

    Args:
        text: The text to embed.
        modelId: Identifier of the embeddings model; defaults to the `model` entry of
            `embeddings_model_info` in the config (read when the function is defined).
        accept: Value passed as `accept` to `invoke_model`.
        contentType: Value passed as `contentType` to `invoke_model`.

    Returns:
        A tuple `(embedding, token_count)` taken from the `embedding` and
        `inputTextTokenCount` fields of the response body (None for an absent field).
    """
    global bedrock
    if bedrock is None:
        bedrock = get_bedrock_client()

    truncated_text = text[:MAX_TEXT_LEN_FOR_EMBEDDING]
    request_payload = json.dumps({"inputText": truncated_text})
    raw_response = bedrock.invoke_model(
        body=request_payload,
        modelId=modelId,
        accept=accept,
        contentType=contentType,
    )
    parsed = json.loads(raw_response.get('body').read())
    return parsed.get('embedding'), parsed.get('inputTextTokenCount')

def get_cosine_similarity(text1: str, text2: str) -> float:
    """
    Compute the cosine similarity between the embeddings of two texts.

    Both texts are embedded with get_embedding(); the result is the dot product of
    the two vectors divided by the product of their norms.

    Args:
        text1: First text (e.g. a response from one of the indexes).
        text2: Second text (e.g. the target response).

    Returns:
        The cosine similarity of the two embeddings.
    """
    first_vec = get_embedding(text1)[0]
    second_vec = get_embedding(text2)[0]
    magnitude_product = norm(first_vec) * norm(second_vec)
    return dot(first_vec, second_vec) / magnitude_product

def get_rouge_l_score(completion: str, golden: str) -> float:
    """
    Compute the ROUGE F-measure of a completion against its target response.

    The ROUGE variant is read from `rouge_metric_selection` under
    `embeddings_model_info` in the config.

    Args:
        completion: The generated response to score.
        golden: The target (reference) response.

    Returns:
        The F-measure of the selected ROUGE metric, rounded to 4 decimal places.
    """
    metric_name: str = config['embeddings_model_info']['rouge_metric_selection']
    # The scorer takes the reference first and the prediction second.
    result = rouge_scorer.RougeScorer([metric_name]).score(golden, completion)
    f1 = result[metric_name].fmeasure
    return round(f1, 4)

def get_bleu_score(reference: str, hypothesis: str, weights: tuple = (0.25, 0.25, 0.25, 0.25)) -> float:
    """
    Compute the sentence-level BLEU score of a hypothesis against a single reference.

    Both strings are tokenized by splitting on whitespace.

    Args:
        reference: The target (reference) text.
        hypothesis: The generated text to score.
        weights: Weights passed to `sentence_bleu`, one per n-gram order.

    Returns:
        The BLEU score rounded to 4 decimal places.
    """
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()
    # sentence_bleu expects a list of tokenized references, hence the wrapping list.
    bleu = sentence_bleu([reference_tokens], hypothesis_tokens, weights=weights)
    return round(bleu, 4)

In [None]:
# Name of the dataframe column that holds the target (ground truth) responses
target_response_key: str = config['eval_qna_dataset_info']['target_response_key']
def compare_completions(row, index_type):
    """
    Score one row's response from a given index against the row's target response.

    Args:
        row: A row of the evaluation dataframe (any object supporting `.get(key)`).
        index_type: Name of the column that holds the response to score.

    Returns:
        A pd.Series of three values, in this order: ROUGE score, cosine similarity,
        BLEU score. All three are NaN when the response or the target response is
        missing, so that every row yields a Series of the same length and the result
        of `DataFrame.apply` can always be assigned to three columns.
    """
    completion = row.get(index_type)
    target = row.get(target_response_key)
    if not completion or target is None or pd.isna(completion) or pd.isna(target):
        logger.info(f'ROUGE, Cosine similarity and Bleu scores cannot be computed since original responses are not provided in the dataset')
        return pd.Series([float('nan')] * 3)

    logger.info(f"{index_type} response: {row[index_type]}, Target response: {row[target_response_key]}")
    # Values read from a CSV may not be strings (e.g. a purely numeric answer); the
    # scorers below call string methods, so convert to text first.
    completion_text, target_text = str(completion), str(target)
    rouge_l_score = get_rouge_l_score(completion_text, target_text)
    cosine_sim = get_cosine_similarity(completion_text.lower(), target_text.lower())
    # Equal weights on unigrams and bigrams only
    bleu_score = get_bleu_score(target_text, completion_text, weights=(0.5, 0.5))
    return pd.Series([rouge_l_score, cosine_sim, bleu_score])

# Score every configured response column against the target responses, provided the
# dataset contains a target response column at all.
if target_response_key not in eval_df.columns:
    logger.info('No evaluation metrics available since target responses are not provided in the dataset.')
else:
    for metric in config['embeddings_model_info']['get_quantitative_metrics_on']:
        score_columns = [f'{metric}_rouge_l_f1_score', f'{metric}_cosine_similarity', f'{metric}_bleu_score']
        # Extra keyword arguments of DataFrame.apply are forwarded to the applied function.
        eval_df[score_columns] = eval_df.apply(compare_completions, axis=1, index_type=metric)

In [None]:
# Remove the document/source columns before saving the scores. Reassigning with
# errors='ignore' (instead of an in-place drop) keeps this cell re-runnable and
# tolerant of datasets that lack some of these columns.
columns_to_drop = ['Matching Document', 'image_and_text_source', 'text_source', 'img_source']
eval_df = eval_df.drop(columns=columns_to_drop, errors='ignore')
# Store the target responses under a fixed column name. The column to rename is the
# one named in the config (the same key the check uses), not a hard-coded name.
target_column = config['eval_qna_dataset_info']['target_response_key']
if target_column in eval_df.columns:
    eval_df = eval_df.rename(columns={target_column: 'target_response'})
# Construct the file path
metrics_dir: str = config['metrics_dir']['dir_name']
rouge_cosine_file_path = os.path.join(metrics_dir, config['metrics_dir']['eval_score_dataset'])
eval_df.to_csv(rouge_cosine_file_path, index=False)
eval_df.head(10)

### Use `LLM as a Judge` to evaluate responses from different indexes - WIP

## Clean Up
