This notebook tests different LLM evaluation techniques.

References:

- [Ragas framework](https://github.com/explodinggradients/ragas)
- [LlamaIndex evaluation guide](https://gpt-index.readthedocs.io/en/v0.6.36/how_to/evaluation/evaluation.html)

In [None]:
!pip install -Uq pysbd

In [None]:
# Handful of configuration

import os
import boto3
import json
import pathlib
import re
import base64
from sagemaker import get_execution_role, session
import numpy as np

import pysbd

from langchain.llms import Bedrock
from langchain.embeddings import BedrockEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms.utils import enforce_stop_tokens

# Region configured for the default boto3 session.
region = boto3.Session().region_name

# SageMaker client.
sm_client = boto3.client("sagemaker")

# Execution role attached to this notebook environment; printed for reference.
role = get_execution_role()
print(f"RoleArn: {role}")

In [None]:
# Fill these in before running. The two angle-bracket placeholders are not valid
# Python as written and must be replaced with real values (e.g. quoted strings).
endpoint_name = <provide endpoint name>
# Bucket that the `aws s3 sync` cell below downloads from.
default_bucket = <default bucket>
# Key prefix inside `default_bucket` that the `aws s3 sync` cell below downloads.
current_endpoint_capture_prefix = ""
s3_key_prefix = ""

In [None]:
# Copy everything under s3://{default_bucket}/{current_endpoint_capture_prefix} into the
# local `data` directory (relative to the notebook's working directory).
# NOTE(review): the loading cell further down reads from "./workspace/data" — confirm that
# this resolves to the same directory the sync writes to.
!aws s3 sync s3://{default_bucket}/{current_endpoint_capture_prefix} data

## Initialize LLM & util functions

In [None]:
# Sentence segmenter used by `sent_tokenize`; `clean=False` is passed explicitly.
seg = pysbd.Segmenter(language="en", clean=False)

# Embedding model used for the similarity scoring.
# The Bedrock region is fixed to us-west-2 here rather than taken from `region`.
embeddings = BedrockEmbeddings(
    client=boto3.client("bedrock-runtime", region_name="us-west-2"),
)

# LLM used for the evaluation prompts: temperature 0, at most 200 sampled tokens.
llm = Bedrock(
    client=boto3.client("bedrock-runtime", region_name="us-west-2"),
    model_id="anthropic.claude-v2",
    model_kwargs={
        "max_tokens_to_sample": 200,
        "temperature": 0,
    },
)

In [None]:
def base64_to_string(base64_string):
    """Decode a base64-encoded ASCII string and return the result as UTF-8 text."""
    decoded = base64.b64decode(base64_string.encode("ascii"))
    return str(decoded, "utf-8")

def extract_questions(text):
    """Return the text between the "### Question" and "### Context" headers.

    The match may span several lines. Returns "" when the headers are not found.
    """
    found = re.search(r"### Question\n(.*?)\n### Context", text, flags=re.DOTALL)
    return found.group(1) if found else ""

def extract_answers(text):
    """Return the model's answer: everything after the first "[/INST]" marker.

    The match uses re.DOTALL so a multi-line answer is captured in full and an
    answer that begins on a new line is not reduced to an empty string. The
    result is stripped of surrounding whitespace, so a whitespace-only
    completion is reported as "".

    Returns "" when the marker is not present.
    """
    pattern = r"\[\/INST\](.*)"
    match = re.search(pattern, text, re.DOTALL)
    if match is None:
        return ""
    return match.group(1).strip()

def extract_contexts(text):
    """Return the text between the "### Context" header and the last "[/INST]".

    The capture is greedy and may span several lines. Returns "" when the
    pattern is not found.
    """
    hit = re.search(r"### Context\n(.*)\[/INST]", text, flags=re.DOTALL)
    if hit:
        return hit.group(1)
    return ""

# Helper function to extract question, answer and context from one captured record
def extract_qac(input_data, output_data):
    """Extract (question, answer, context) from a base64-encoded request/response pair.

    Args:
        input_data: base64 string that decodes to JSON with a "text" field
            holding the prompt.
        output_data: base64 string that decodes to JSON whose
            ["outputs"][0]["generated_text"] holds the generated text.

    Returns:
        Tuple ``(question, answer, context)``. Each element is "" when the
        corresponding extractor finds no match. The three values are also printed.
    """
    # Decode and parse the request once; question and context both come from "text".
    prompt_text = json.loads(base64_to_string(input_data))["text"]

    question = extract_questions(prompt_text)
    print("Question: ", question)
    context = extract_contexts(prompt_text)
    print("Context: ", context)

    generated_text = json.loads(base64_to_string(output_data))["outputs"][0]["generated_text"]
    answer = extract_answers(generated_text)
    print("Answer: ", answer)
    return question, answer, context

def sent_tokenize(text):
    """Split ``text`` into a list of sentences using the module-level segmenter ``seg``."""
    segmented = seg.segment(text)
    # Callers take len() of the result, so insist on a real list.
    assert isinstance(segmented, list)
    return segmented

## Answer Relevance

In [None]:
# One-shot prompt that asks the model to write a question for a given answer.
# Written as adjacent string literals so every newline (and the trailing space
# after "India") is explicit.
RELEVANCE_TEMPLATE = (
    "\n\nHuman: Generate question for the given answer."
    "\n\nAssistant:Okay, give me an answer, and I will generate a question.\n"
    "\nHuman:Answer:\n"
    "The PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. "
    "It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India \n"
    "\nAssistant:Question:\n"
    "When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?\n"
    "\nHuman:Answer:\n{answer}\n"
    "\nAssistant:Question:\n\n"
)

# Bind the relevance prompt to the LLM.
# NOTE: the Context Precision section later rebinds both `EVALUATOR` and `llm_chain`,
# so re-run this cell before re-running the relevance scoring.
EVALUATOR = PromptTemplate(
    input_variables=["answer"],
    template=RELEVANCE_TEMPLATE,
)

llm_chain = LLMChain(prompt=EVALUATOR, llm=llm)

In [None]:
# NOTE(review): the `aws s3 sync` cell above downloads into `data`, not `workspace/data` —
# confirm this path points at the downloaded capture files.
infer_dir = "./workspace/data"
questions, answers, contexts = [], [], []

# Visit the capture files in sorted order so the record order (and therefore the
# order of the score lists below) is the same on every run.
for filepath in sorted(pathlib.Path(infer_dir).rglob('*.jsonl')):

    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line would make json.loads raise; skip it.
            if not line.strip():
                continue

            record = json.loads(line)
            capture = record['captureData']
            input_data = capture['endpointInput']['data']
            output_data = capture['endpointOutput']['data']

            q, a, c = extract_qac(input_data, output_data)
            # Keep only records with both a question and an answer; the context may be "".
            if q != "" and a != "":
                questions.append(q)
                answers.append(a)
                contexts.append(c)

In [None]:
def calculate_similarity(question, generated_questions, embeddings):
    """Cosine similarity between ``question`` and each generated question.

    Args:
        question: text embedded with ``embeddings.embed_query``.
        generated_questions: list of texts embedded with
            ``embeddings.embed_documents``.
        embeddings: object providing ``embed_query`` and ``embed_documents``.

    Returns:
        1-D numpy array with one similarity value per generated question.
    """
    reference = np.asarray(embeddings.embed_query(question)).reshape(1, -1)
    candidates = np.asarray(embeddings.embed_documents(generated_questions))

    # Row-wise dot product with the reference vector, flattened to 1-D.
    dot_products = np.dot(candidates, reference.T).reshape(-1)
    # Product of the vector lengths: one entry per candidate row.
    magnitudes = np.linalg.norm(candidates, axis=1) * np.linalg.norm(reference, axis=1)
    return dot_products / magnitudes

In [None]:
scores = []

for q, a in zip(questions, answers):
    # Ask the LLM five times for a question that the answer would fit.
    results = [llm_chain.run(answer=a).strip() for _ in range(5)]
    # Relevance of this answer = mean similarity between the original question
    # and the generated questions.
    scores.append(calculate_similarity(q, results, embeddings).mean())

scores


In [None]:
# Average of the per-sample relevance scores computed above.
np.asarray(scores).mean()

## Faithfulness
...

## Context Precision

In [None]:
# Prompt that asks the model to quote, unchanged, the context sentences needed to
# answer the question, or to return "Insufficient Information".
# Written as adjacent string literals so every newline is explicit.
CONTEXT_PRECISION_TEMPLATE = (
    "\n\nHuman: Please extract relevant sentences from the provided context "
    "that is absolutely required answer the following question. "
    "If no relevant sentences are found, or if you believe the question cannot be "
    "answered from the given context, return the phrase \"Insufficient Information\".  "
    "While extracting candidate sentences you're not allowed to make any changes "
    "to sentences from given context.\n"
    "\nquestion:{question}\n"
    "\ncontext:\n{context}\n"
    "\nAssistant: candidate sentences:\n"
)

# Bind the context-precision prompt to the LLM.
# NOTE: this rebinds `EVALUATOR` and `llm_chain`, which the Answer Relevance section
# also defines; re-run that section's cell before re-running its scoring loop.
EVALUATOR = PromptTemplate(
    input_variables=["question", "context"],
    template=CONTEXT_PRECISION_TEMPLATE,
)

llm_chain = LLMChain(prompt=EVALUATOR, llm=llm)

In [None]:
def calculate_overlap(context_sent, generated_context):
    """Average, over the LLM generations, of the fraction of context sentences kept.

    For each generation, the number of sentences it contains is divided by the
    number of context sentences and capped at 1. A generation that is just the
    "Insufficient Information" sentinel counts as zero sentences. Only sentence
    counts are compared; the extracted text is not matched against the context.

    Args:
        context_sent: list of sentences in the original context.
        generated_context: list of LLM outputs (one string per generation).

    Returns:
        Mean of the per-generation scores (``np.mean`` of the score list).
    """
    overlap_scores = []
    for gc in generated_context:
        # The prompt asks for the bare phrase "Insufficient Information", so accept
        # it with or without surrounding whitespace, quotes or a trailing period,
        # not only the exact form "insufficient information.".
        verdict = gc.strip(" \t\r\n\"'.").lower()
        if verdict == "insufficient information":
            candidate_sentences = []
        else:
            candidate_sentences = sent_tokenize(gc)

        if len(context_sent) == 0:
            # Nothing to compare against; avoid dividing by zero.
            score = 0
        else:
            score = min(len(candidate_sentences) / len(context_sent), 1)

        overlap_scores.append(score)

    return np.mean(overlap_scores)

In [None]:
scores = []

for q, c in zip(questions, contexts):
    # Samples without an extracted context cannot be scored.
    if c == "":
        continue

    context_sent = sent_tokenize(c)

    # Ask the LLM five times which context sentences are needed for the question.
    results = [llm_chain.run(question=q, context=c).strip() for _ in range(5)]

    scores.append(calculate_overlap(context_sent, results))

scores

In [None]:
# Average of the per-sample context-precision scores computed above.
np.asarray(scores).mean()