# Evaluating Responses from Image, Text, and Combined Indexes --> WIP

***This notebook works best with the `conda_python3` kernel on an `ml.t3.large` instance***.

---

This notebook does the following:

1. Use the CSV file generated in the previous notebook to evaluate the quality of the responses, based on the context and the target responses that are given

1. Record the ROUGE, cosine similarity, and BLEU scores, and for subjective evaluation, use an `LLM as a judge` (in this case, Claude 3 Sonnet)

1. Record the results for all kinds of responses from text only index, image only index, and combined (from both the text as well as the image index) from `OpenSearch`

## Step 1. Setup

Install the required Python packages and import the relevant files.

In [None]:
# install the packages listed in requirements.txt using the pip module of the
# interpreter that runs this kernel (sys.executable)
import sys
!{sys.executable} -m pip install -r requirements.txt

In [None]:
# import the libraries that are needed to run this notebook
# standard library
import base64
import codecs
import glob
import json
import logging
import os
import re
import time
from pathlib import Path
from typing import List, Dict

# third-party
import boto3
import botocore
import nltk
import numpy as np
import pandas as pd
import ray
import requests
import sagemaker
import yaml
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from requests_auth_aws_sigv4 import AWSSigV4

# project-local
import globals as g
from utils import get_cfn_outputs, get_bucket_name, download_image_files_from_s3, get_text_embedding

# NOTE(review): the cells below also call `completion` (described in a comment as the
# litellm completion API) and `rouge_scorer`, neither of which is imported here —
# confirm the intended packages and add their imports.

In [None]:
# configure logging for the notebook: every record shows timestamp, process id,
# source file and line, level and message; INFO and above are emitted
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
# logger shared by the cells below
logger = logging.getLogger(__name__)

In [None]:
# Start Ray from a clean state: if Ray is already initialized (for example because
# this cell was run before in the same kernel), shut it down before calling init again.
if ray.is_initialized():
    ray.shutdown()
# ray.init(runtime_env={"working_dir": "./"})
ray.init()

In [None]:
# location of the YAML file holding every setting used by this notebook
CONFIG_FILE_PATH = "config.yaml"
fpath = CONFIG_FILE_PATH
# read the whole file and parse it into the `config` dict used by the cells below
config = yaml.safe_load(Path(fpath).read_text())
logger.info(f"config read from {fpath} -> {json.dumps(config, indent=2)}")

In [None]:
# show the contents of globals.py (imported above as `g`) with syntax highlighting
!pygmentize globals.py

### Get all text descriptions for `Image` only responses in a Dataframe

Here, we fetch the responses from the image index in order to create a question bank that acts as a curated dataset

In [None]:
# directory (taken from the config) that holds the JSON files for the image index
json_img_dir = config['pdf_dir_info']['json_img_dir']
# names of the JSON files found directly inside that directory
image_json_content = [name for name in os.listdir(json_img_dir) if name.endswith('.json')]
# prefix every file name with the directory to get a usable path
image_json_files = [os.path.join(json_img_dir, name) for name in image_json_content]
logger.info(f"there are {len(image_json_files)} image files in {json_img_dir}")

In [None]:
# parse every image-index JSON file and collect the parsed objects, one per file
image_index_contents = [json.loads(Path(json_path).read_text()) for json_path in image_json_files]
# one dataframe row per JSON file
img_index_df = pd.DataFrame(image_index_contents)
# preview the first rows of the image-only dataframe
img_index_df.head(10)

### Get all text descriptions for `Text` only responses in a Dataframe

In [None]:
# Now, we fetch responses from text only index responses
json_txt_dir = config['pdf_dir_info']['json_txt_dir']
text_json_content = [pos_json for pos_json in os.listdir(json_txt_dir) if pos_json.endswith('.json')]
# Get relative file paths by joining directory path with each file name
text_json_files = [os.path.join(json_txt_dir, file) for file in text_json_content]
# this cell counts the text JSON files, so the message says "text files"
# (it used to say "image files", copied from the image cell)
logger.info(f"there are {len(text_json_files)} text files in {json_txt_dir}")

In [None]:
# parse every text-index JSON file and collect the parsed objects, one per file
text_index_contents = [json.loads(Path(json_path).read_text()) for json_path in text_json_files]
# one dataframe row per JSON file
txt_index_df = pd.DataFrame(text_index_contents)
# preview the first rows of the text-only dataframe
txt_index_df.head(10)

#### View all of the content from text AND image only indexes

In [None]:
# stack the text rows on top of the image rows, rename the `text` column to the
# source-neutral `content_description`, and order rows by file type, file name and page
merged_text_and_image_df = (
    pd.concat([txt_index_df, img_index_df], ignore_index=True)
    .reset_index(drop=True)
    .rename(columns={'text': 'content_description'})
    .sort_values(by=['file_type', 'file_name', 'page_number'])
)
# preview the first rows of the combined dataframe
merged_text_and_image_df.head(10)

In [None]:
# write the combined text + image content descriptions to a CSV file in the metrics directory
metrics_cfg = config['metrics_dir']
metrics_dir: str = metrics_cfg['dir_name']
all_content_description_file: str = os.path.join(metrics_dir, metrics_cfg['text_and_image_raw_content'])
# create the metrics directory if needed; nothing happens when it already exists
os.makedirs(metrics_dir, exist_ok=True)
# index=True keeps the dataframe index as the first CSV column
merged_text_and_image_df.to_csv(all_content_description_file, index=True)
logger.info(f"all intial content description on description from text and image only indexes are saved in: {all_content_description_file}")

In [None]:
def get_cosine_similarity(text1: str, text2: str) -> float:
    """
    Return the cosine similarity between the embeddings of two texts.

    Each text is passed to `get_embedding`, and the similarity of the two
    resulting vectors is computed as dot(A, B) / (norm(A) * norm(B)).

    NOTE(review): `get_embedding` is neither defined nor imported in the visible
    part of this notebook (only `get_text_embedding` is imported from `utils`) —
    confirm which helper is intended. This function unpacks its result as a
    2-item sequence and uses the first item as the embedding vector.

    Args:
        text1: first text to compare.
        text2: second text to compare.

    Returns:
        The cosine similarity of the two embeddings, or 0.0 when either
        embedding has zero norm (the similarity is undefined in that case).
    """
    A, _ = get_embedding(text1)
    B, _ = get_embedding(text2)
    # use numpy through the `np` alias that the notebook imports; the bare names
    # `dot` and `norm` were never imported
    denominator = np.linalg.norm(A) * np.linalg.norm(B)
    if denominator == 0:
        # avoid a division by zero for an all-zero embedding
        return 0.0
    cosine = np.dot(A, B) / denominator
    return cosine

def get_rouge_l_score(completion: str, golden: str) -> float:
    """
    Score a generated text against a reference text with a ROUGE metric.

    The metric name is read from
    config['embeddings_model_info']['rouge_metric_selection'].

    NOTE(review): `rouge_scorer` is not imported in the visible part of this
    notebook — confirm the import exists.

    Args:
        completion: the generated text to evaluate.
        golden: the reference text to compare against.

    Returns:
        The F-measure of the configured metric, rounded to 4 decimal places.
    """
    metric_name: str = config['embeddings_model_info']['rouge_metric_selection']
    # the scorer is called with the reference first and the generated text second
    result = rouge_scorer.RougeScorer([metric_name]).score(golden, completion)
    f_measure = result[metric_name].fmeasure
    return round(f_measure, 4)

## Generate `Question-Answer` banks for each content description (from the Image index and the Text index): generate `5-10 pairs` of questions and answers based on the `Content Description`

In [None]:
def llm_QA_generator(model_id: str, prompt: str):
    """
    Send a prompt to a Bedrock model through `completion` and return the
    generated text together with token usage.

    NOTE(review): `completion` is not imported in the visible part of this
    notebook; the comments below describe it as the litellm completion API —
    confirm the import exists.

    Args:
        model_id: model identifier; it is prefixed with "bedrock/" for the call.
        prompt: the prompt text, sent as a single user message.

    Returns:
        A dict with the keys `exception`, `prompt`, `completion`, `file_name`,
        `completion_token_count`, `prompt_token_count`, `input_token_price`,
        `output_token_pricing`, `model_id`, `QnA_prompt_token_count` and
        `QnA_completion_token_count`. Token usage is stored under both the
        `QnA_*` keys and the un-prefixed keys. If the model call raises, the
        exception is stored under `exception` (it is not re-raised),
        `completion` stays None and every token count stays 0.
    """
    # represents the service name
    service_name: str = "bedrock"
    # represents creating the bedrock model to invoke the litellm api for response for titan, llama and claude
    bedrock_model: str = f"{service_name}/{model_id}"
    # represents the current aws region
    aws_region = boto3.Session().region_name
    # initialize the response dict; every key a caller reads exists up front so a
    # failed invocation cannot cause a KeyError downstream
    ret = dict(exception = None,
               prompt = prompt,
               completion = None,
               file_name = None,
               # initializing to 0 since none type throws an error later, this is used to calculate price per token input/output on ODT pricing
               completion_token_count = 0,
               # initializing to 0 since none type throws an error later
               prompt_token_count=0,
               input_token_price = None,
               output_token_pricing = None,
               model_id = model_id,
               QnA_prompt_token_count = 0,
               QnA_completion_token_count = 0)
    body = ret['prompt']
    # os.environ only accepts strings; skip the assignment when no region is configured
    if aws_region is not None:
        os.environ["AWS_REGION_NAME"] = aws_region
    parameters = config['inference_parameters_for_qna_generation']
    temperature = parameters.get('temperature', 0.1)
    caching = parameters.get('caching', False)
    max_tokens = parameters.get("max_tokens", 500)
    try:
        # Represents calling the litellm completion/messaging api utilizing the completion/embeddings API
        logger.info(f"Invoking {bedrock_model}......")
        response = completion(model=bedrock_model,
                              messages=[{ "content": body,"role": "user"}],
                              temperature=temperature,
                              max_tokens=max_tokens,
                              caching=caching)
        # iterate through the entire model response; the last non-empty choice wins
        for choice in response.choices:
            # extract the message and the message's content from litellm
            if choice.message and choice.message.content:
                ret["completion"] = choice.message.content.strip()
        # Extract number of input and completion prompt tokens (this is the same structure for embeddings and text generation models on Amazon Bedrock)
        ret['QnA_prompt_token_count'] = response.usage.prompt_tokens
        ret['QnA_completion_token_count'] = response.usage.completion_tokens
        # keep the un-prefixed counters in sync so code reading either key sees the real usage
        ret['prompt_token_count'] = ret['QnA_prompt_token_count']
        ret['completion_token_count'] = ret['QnA_completion_token_count']
    except Exception as e:
        logger.error(f"Exception occurred during invoking {model_id}, exception={e}")
        ret['exception'] = e
    logger.info(f"completion: {ret['completion']}")
    return ret

In [None]:
# display the prompt template used for question/answer generation
config['QnA_generator_prompt']

In [None]:
def get_inference(i: int, row: Dict, total: int) -> Dict:
    """
    Generate question/answer pairs for one content-description row, price the
    call, and save the result as a JSON file.

    The prompt is built from config['QnA_generator_prompt'] with the row's
    `content_description` as `context`, and sent to the model configured under
    config['bedrock_model_info']['claude_sonnet_model_id'] via `llm_QA_generator`.

    Args:
        i: position of this row, used only in log messages.
        row: mapping with the keys `content_description`, `file_type`,
            `file_name` and `page_number`.
        total: number of rows being processed, used only in log messages.

    Returns:
        The dict returned by `llm_QA_generator`, extended with the row's
        `file_type`, `page_number`, `file_name` and `content_description`, and
        with `QnA_input_token_price` / `QnA_output_token_price`. Both prices are
        0 when the model call recorded no token usage (for example on failure).

    NOTE(review): the output path contains `file_name` but not `page_number`, so
    rows that share a file name presumably overwrite each other's JSON file —
    confirm whether `file_name` is unique per row.
    """
    model_info = config['bedrock_model_info']
    model_id = model_info['claude_sonnet_model_id']
    logger.info(f"row {i}/{total}, model_id={model_id}")
    # create the payload for model inference
    prompt = config['QnA_generator_prompt'].format(context=row['content_description'])
    # generate the question/answer pairs for the given content description
    resp: Dict = llm_QA_generator(model_id, prompt)
    # carry the row's identifying fields over into the response
    resp['file_type'] = row['file_type']
    resp['page_number'] = row['page_number']
    resp['file_name'] = row['file_name']
    resp['content_description'] = row['content_description']
    # token usage is reported under the QnA_* keys; fall back to 0 when they are
    # missing or empty
    input_tokens = resp.get('QnA_prompt_token_count') or 0
    output_tokens = resp.get('QnA_completion_token_count') or 0
    # the configured prices are applied per 1000 tokens: input price from the prompt
    # tokens, output price from the completion tokens
    resp['QnA_input_token_price'] = (input_tokens/1000) * model_info['claude_input_tokens_pricing']
    logger.info(f"The price for {input_tokens} tokens for {model_id} for filename={row['file_name']} is {resp['QnA_input_token_price']}")
    resp['QnA_output_token_price'] = (output_tokens/1000) * model_info['claude_output_tokens_pricing']
    logger.info(f"The price for {output_tokens} tokens for {model_id} for filename={row['file_name']} is {resp['QnA_output_token_price']}")
    # ":" in the model id is replaced with "-" for the directory name
    dir_path = os.path.join(config['pdf_dir_info']['qna_dir'], row['file_name'], model_id.replace(":", "-"))
    os.makedirs(dir_path, exist_ok=True)
    fpath = os.path.join(dir_path, f"question_answers_{row['file_name']}.json")
    logger.info(f"writing response={resp} to {fpath}")
    # default=str serializes values json cannot encode (such as a stored exception)
    Path(fpath).write_text(json.dumps(resp, default=str, indent=2))
    logger.info(f"response {i}: {resp}")
    return resp

In [None]:
@ray.remote
def async_get_inference(i: int, row: Dict, total: int) -> Dict:
    """
    Ray remote wrapper around `get_inference`, so several rows can be processed in parallel.

    Args:
        i: position of the row, forwarded to `get_inference`.
        row: the content-description row, forwarded to `get_inference`.
        total: number of rows being processed, forwarded to `get_inference`.

    Returns:
        Whatever `get_inference` returns for this row.
    """
    # configure logging in the process that runs this task, using the same format as the notebook
    # (presumably a Ray worker process that does not share the notebook's logging setup — confirm)
    logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
    return get_inference(i, row, total)

In [None]:
# Convert the dataframe into a list of row dicts (one dict per content description).
# The name is rebound from a dataframe to a list; the guard keeps this cell re-runnable
# once the name already holds the list.
if hasattr(merged_text_and_image_df, "to_json"):
    merged_text_and_image_df = json.loads(merged_text_and_image_df.to_json(orient='records'))
# number of rows sent to Ray at once; at least 1 so the batching below cannot divide by zero
n: int = max(1, int(config['parallel_inference_count']))
resp_list: List = []
st = time.perf_counter()
logger.info(f"------ Generating QnA bank using {config['bedrock_model_info']['claude_sonnet_model_id']} -----")
# split the rows into consecutive batches of at most n rows
list_of_lists = [merged_text_and_image_df[i * n:(i + 1) * n] for i in range((len(merged_text_and_image_df) + n - 1) // n )]
# report the number of batches (this used to print the number of rows twice)
logger.info(f"split input list of size {len(merged_text_and_image_df)} into {len(list_of_lists)} lists")
for idx, l in enumerate(list_of_lists):
    logger.info(f"getting inference for list {idx+1}/{len(list_of_lists)}, size of list={len(l)} ")
    # launch one remote task per row of the batch and wait for the whole batch to finish
    resp_list.extend(ray.get([async_get_inference.remote(i+1, e, len(l)) for i, e in enumerate(l)]))
elapsed_time = time.perf_counter() - st
logger.info(f"------ model={config['bedrock_model_info']['claude_sonnet_model_id']} completed in {elapsed_time} ------ ")

## Clean Up
