# Automatic Evaluation on a few samples


In [None]:
# Move two directory levels up (intended target: the "./alignment" directory)
# and list its contents to confirm the working directory.
%cd ../../
%ls

In [None]:
# !conda activate phd-align-py3-11
import torch

# `logger` is used in this cell, so it has to be imported here: otherwise a
# fresh-kernel "Run All" fails with NameError before the later import cell runs.
from loguru import logger

# Log whether a CUDA device is usable and the value of torch.version.cuda.
logger.info(torch.cuda.is_available())
logger.info(torch.version.cuda)

In [None]:
# Example record illustrating the document schema read by the evaluation
# functions in this notebook: `docid`, `doc` (the context text) and
# `cct_saar["queries_aspects"]`, a list of {"question", "answer"} dicts.
# NOTE(review): `sample` is not referenced by any other cell visible here.
sample = {
    "docid": 54251,
    "doc": "There are two types of credit checks. First is the hard pull which is typically done when you apply for a credit line. The lender will hard pull your file and make his/her decision based on that. This affects your score negatively. You might lose few points for one hard inquiry. Second type is soft pull, which is done as a background check. Typically done by credit card companies to send you a pre-approved offer, or renting an apartment etc. This does not affect your score. One thing to keep in mind is a company will not do a hard pull without your permission, where as they can do soft pulls without you even knowing.  Soft inquiries vs hard inquiries",
    "tokenized_size": 140.0,
    "max_seq_len_exceeded": False,
    "cct_saar": {
        "queries_aspects": [
            {
                "question": "What are the two types of credit checks?",
                "answer": "There are two types of credit checks: hard pull and soft pull. A hard pull is typically done when you apply for a credit line, while a soft pull is done as a background check.",
            },
            {
                "question": "How does a hard pull affect your credit score?",
                "answer": "A hard pull can negatively affect your credit score, causing you to lose a few points.",
            },
            {
                "question": "What is the difference between a hard pull and a soft pull?",
                "answer": "A hard pull is a more invasive check that affects your credit score, while a soft pull is a background check that does not affect your score.",
            },
            {
                "question": "Can a company do a hard pull without your permission?",
                "answer": "No, a company cannot do a hard pull without your permission.",
            },
            {
                "question": "What is the purpose of a soft pull?",
                "answer": "A soft pull is typically done as a background check, such as when a credit card company sends you a pre-approved offer or when you're renting an apartment.",
            },
            {
                "question": "What is the main difference between a hard pull and a soft pull?",
                "answer": "The main difference is that a hard pull affects your credit score, while a soft pull does not.",
            },
            {
                "question": "What is the impact of a hard pull on your credit score?",
                "answer": "A hard pull can cause a temporary decrease in your credit score, but the impact is usually minor.",
            },
        ]
    },
}

In [None]:
from aimw.app.utils.json_utils import JSONProcessor

from loguru import logger

In [None]:
# Read directory of the corpus split; the write directory is left empty.
r_dir, w_dir = "./re/data/fiqa/clean/base/exp_001/", ""

# Load the split into `data`, the input of every evaluation function below.
jSONProcessor = JSONProcessor(f"{r_dir}cct_saar_corpus_cln_split_0.json")
data = jSONProcessor.read_json_file()

## SentenceTransformer Using `BAAI/bge-large-en-v1.5`


In [None]:
from sentence_transformers import SentenceTransformer

# Module-level embedding model; get_average_scores() reads this global `model`.
model = SentenceTransformer("BAAI/bge-large-en-v1.5")


def get_average_scores(data):
    """Average embedding-similarity scores over all documents that carry question/answer pairs.

    Each document is expected to be a dict with ``docid``, ``doc`` (the context
    text) and, optionally, ``cct_saar["queries_aspects"]``: a list of
    ``{"question": ..., "answer": ...}`` dicts. Documents without any pair are
    logged and skipped. Texts are embedded with the module-level ``model``
    using ``normalize_embeddings=True`` and compared with dot products.

    Args:
        data: Iterable of document dicts.

    Returns:
        ``(eval_data_size, scores)``. ``eval_data_size`` is the number of
        documents that were scored. ``scores`` is an 8-tuple of per-document
        averages, in this order:

        1. context vs. each query (mean over the queries)
        2. context vs. the concatenated queries
        3. context vs. each answer (mean over the answers)
        4. context vs. the concatenated answers
        5. each question vs. its own answer (mean over the pairs)
        6. concatenated questions vs. concatenated answers
        7. context vs. the average of the two concatenated embeddings
        8. context vs. the average of the mean answer and mean question embeddings

        Every score is ``nan`` when no document could be scored (the original
        code raised ``ZeroDivisionError`` in that case).
    """
    # Running sums; insertion order is the order of the returned tuple.
    totals = {
        "context_vs_queries": 0.0,
        "context_vs_queries_joint": 0.0,
        "context_vs_answers": 0.0,
        "context_vs_answers_joint": 0.0,
        "questions_vs_answers": 0.0,
        "questions_joint_vs_answers_joint": 0.0,
        "questions_joint_and_answers_joint_vs_context": 0.0,
        "questions_and_answers_vs_context": 0.0,
    }
    eval_data_size = 0

    for doc in data:
        logger.info(f"docid: {doc['docid']}")

        # Skip documents with no annotation block or an empty list of pairs.
        pairs = doc.get("cct_saar", {}).get("queries_aspects")
        if not pairs:
            continue
        eval_data_size += 1

        # Extract targets
        context = doc["doc"]
        queries = [pair["question"] for pair in pairs]
        answers = [pair["answer"] for pair in pairs]

        # Embed targets
        context_embedding = model.encode(context, normalize_embeddings=True)
        queries_embeddings = model.encode(queries, normalize_embeddings=True)
        queries_joint_embeddings = model.encode(
            " ".join(queries), normalize_embeddings=True
        )
        answers_embeddings = model.encode(answers, normalize_embeddings=True)
        answers_joint_embeddings = model.encode(
            " ".join(answers), normalize_embeddings=True
        )

        ############# context Vs queries #############
        similarity_context_queries = context_embedding @ queries_embeddings.T
        logger.info(f"similarity_context_queries: {similarity_context_queries}")
        totals["context_vs_queries"] += sum(similarity_context_queries) / len(
            similarity_context_queries
        )

        ############# context Vs concatenated queries #############
        similarity_context_queries_joint = (
            context_embedding @ queries_joint_embeddings.T
        )
        logger.info(
            f"similarity_context_queries_joint: {similarity_context_queries_joint}"
        )
        totals["context_vs_queries_joint"] += similarity_context_queries_joint

        ############# context Vs answers #############
        similarity_context_answers = context_embedding @ answers_embeddings.T
        logger.info(f"similarity_context_answers: {similarity_context_answers}")
        totals["context_vs_answers"] += sum(similarity_context_answers) / len(
            similarity_context_answers
        )

        ############# context Vs concatenated answers #############
        similarity_context_answers_joint = (
            context_embedding @ answers_joint_embeddings.T
        )
        logger.info(
            f"similarity_context_answers_joint: {similarity_context_answers_joint}"
        )
        totals["context_vs_answers_joint"] += similarity_context_answers_joint

        ################ Questions Vs Answers ################
        # Pairwise: the i-th answer is compared with the i-th question only.
        similarity_questions_answers = [
            x @ y.T for x, y in zip(answers_embeddings, queries_embeddings)
        ]
        mean_score_questions_answers = sum(similarity_questions_answers) / len(
            similarity_questions_answers
        )
        totals["questions_vs_answers"] += mean_score_questions_answers
        logger.info(
            f"similarty scores: {similarity_questions_answers} | average similarity score: {mean_score_questions_answers}"
        )

        ################ Questions Joint Vs Answers Joint ################
        mean_score_questions_joint_vs_answers_joint = (
            answers_joint_embeddings @ queries_joint_embeddings.T
        )
        totals[
            "questions_joint_vs_answers_joint"
        ] += mean_score_questions_joint_vs_answers_joint
        logger.info(
            f"similarty scores of concatenated Queries Vs Answers: {mean_score_questions_joint_vs_answers_joint}"
        )

        ################ (Joint Questions + Joint Answers)/2 Vs Context ################
        mean_score_questions_joint_and_answers_joint_vs_context = (
            context_embedding
            @ ((answers_joint_embeddings + queries_joint_embeddings) / 2).T
        )
        totals[
            "questions_joint_and_answers_joint_vs_context"
        ] += mean_score_questions_joint_and_answers_joint_vs_context
        logger.info(
            f"similarty scores of concatenated Answers embeddings + concatenated Queries embeddings Vs Context: {mean_score_questions_joint_and_answers_joint_vs_context}"
        )

        ################ (Questions/n + Answers/n)/2 Answers) Vs Context ################
        mean_answers_embeddings = sum(answers_embeddings) / len(answers_embeddings)
        mean_questions_embeddings = sum(queries_embeddings) / len(queries_embeddings)
        mean_score_questions_and_answers_vs_context = (
            context_embedding
            @ ((mean_answers_embeddings + mean_questions_embeddings) / 2).T
        )
        totals[
            "questions_and_answers_vs_context"
        ] += mean_score_questions_and_answers_vs_context
        logger.info(
            f"similarty scores of mean answers,questions embeddings Vs Context: {mean_score_questions_and_answers_vs_context}"
        )

    # Dividing by NaN yields NaN averages instead of a ZeroDivisionError when
    # nothing was scored.
    denominator = eval_data_size if eval_data_size else float("nan")
    return eval_data_size, tuple(total / denominator for total in totals.values())


# mean_of_mean_score_context_vs_queries = mean_of_mean_score_context_vs_queries / eval_data_size
# mean_of_mean_score_context_vs_queries_joint = mean_of_mean_score_context_vs_queries_joint / eval_data_size
# mean_of_mean_score_context_vs_answers = mean_of_mean_score_context_vs_answers / eval_data_size
# mean_of_mean_score_context_vs_answers_joint = mean_of_mean_score_context_vs_answers_joint / eval_data_size
# mean_of_mean_score_questions_answers = mean_of_mean_score_questions_answers / eval_data_size
# mean_of_mean_score_questions_joint_vs_answers_joint = mean_of_mean_score_questions_joint_vs_answers_joint / eval_data_size
# mean_of_mean_score_questions_joint_and_answers_joint_vs_context = mean_of_mean_score_questions_joint_and_answers_joint_vs_context / eval_data_size
# mean_of_mean_score_questions_and_answers_vs_context = mean_of_mean_score_questions_and_answers_vs_context / eval_data_size

In [None]:
# Score the loaded corpus split; the returned tuple is the cell's output.
get_average_scores(data)

## BERTScore


In [None]:
import bert_score
from bert_score import score

# NOTE: this is not the last expression of the cell, so the version string is
# evaluated but not displayed.
bert_score.__version__

# hide the loading messages
import logging
import transformers

# Raise three transformers module loggers to ERROR so that only errors are emitted.
transformers.tokenization_utils.logger.setLevel(logging.ERROR)
transformers.configuration_utils.logger.setLevel(logging.ERROR)
transformers.modeling_utils.logger.setLevel(logging.ERROR)


def get_average_BERTscores(data):
    """Average BERTScore precision/recall/F1 over all documents that carry question/answer pairs.

    For every document with a non-empty ``cct_saar["queries_aspects"]`` list the
    questions and the answers are each joined with single spaces, and
    ``score(..., lang="en", verbose=True)`` is called for three
    (candidate, reference) pairs. Documents without pairs are logged and skipped.

    Args:
        data: Iterable of document dicts with ``docid``, ``doc`` and, optionally,
            ``cct_saar["queries_aspects"]`` (list of question/answer dicts).

    Returns:
        ``(eval_data_size, aj_c, qj_c, qj_aj)`` where each of the last three is a
        ``(precision, recall, f1)`` tuple averaged over the scored documents:

        * ``aj_c``  - concatenated answers (candidate) vs. context (reference)
        * ``qj_c``  - concatenated questions vs. context
        * ``qj_aj`` - concatenated questions vs. concatenated answers

        Every value is ``nan`` when no document could be scored (the original
        code raised ``ZeroDivisionError`` in that case).
    """
    # Running [precision, recall, f1] sums per compared pair.
    totals_aj_c = [0.0, 0.0, 0.0]
    totals_qj_c = [0.0, 0.0, 0.0]
    totals_qj_aj = [0.0, 0.0, 0.0]
    eval_data_size = 0

    for doc in data:
        logger.info(f"docid: {doc['docid']}")

        # Skip documents with no annotation block or an empty list of pairs.
        pairs = doc.get("cct_saar", {}).get("queries_aspects")
        if not pairs:
            continue
        eval_data_size += 1

        context = doc["doc"]
        queries_joint = " ".join(pair["question"] for pair in pairs)
        answers_joint = " ".join(pair["answer"] for pair in pairs)

        comparisons = (
            (totals_aj_c, answers_joint, context),
            (totals_qj_c, queries_joint, context),
            (totals_qj_aj, queries_joint, answers_joint),
        )
        # NOTE(review): score() runs once per pair and per document; if model
        # loading dominates the runtime, a reusable scorer object may be worth it.
        for totals, candidate, reference in comparisons:
            prf = score([candidate], [reference], lang="en", verbose=True)
            for position, value in enumerate(prf):
                # Plain `+` (not `+=`) so that a returned tensor is never
                # modified in place.
                totals[position] = totals[position] + value

    # Dividing by NaN yields NaN averages instead of a ZeroDivisionError when
    # nothing was scored.
    denominator = eval_data_size if eval_data_size else float("nan")
    return (
        eval_data_size,
        tuple(total / denominator for total in totals_aj_c),
        tuple(total / denominator for total in totals_qj_c),
        tuple(total / denominator for total in totals_qj_aj),
    )

In [None]:
# Run the BERTScore evaluation on the loaded corpus split; the result is the cell's output.
get_average_BERTscores(data)

## Vendi Score for diversity

If the diversity between the concatenated answers and the context is low (a Vendi Score close to 1), the answers stay close to the context, which suggests comprehensiveness and less hallucination.


In [None]:
from vendi_score import text_utils


def get_average_VENDIscores(data):
    """Average Vendi Scores over all documents that carry question/answer pairs.

    For every document with a non-empty ``cct_saar["queries_aspects"]`` list,
    three two-text collections are scored: ``[context, answers_joint]``,
    ``[context, queries_joint]`` and ``[queries_joint, answers_joint]``. Each
    collection is scored three ways through ``text_utils``: n-gram
    (``ns=[1, 2]``), ``bert-base-uncased`` embeddings and
    ``princeton-nlp/unsup-simcse-bert-base-uncased`` embeddings. Documents
    without pairs are logged and skipped.

    Args:
        data: Iterable of document dicts with ``docid``, ``doc`` and, optionally,
            ``cct_saar["queries_aspects"]`` (list of question/answer dicts).

    Returns:
        ``(eval_data_size, ngram, bert, simcse)`` where each of the last three is
        a ``(c_aj, c_qj, qj_aj)`` tuple of averages over the scored documents.
        Every value is ``nan`` when no document could be scored (the original
        code raised ``ZeroDivisionError`` in that case).
    """
    bert_path = "bert-base-uncased"
    simcse_path = "princeton-nlp/unsup-simcse-bert-base-uncased"

    # Running sums; index 0 = c_aj, 1 = c_qj, 2 = qj_aj.
    ngram_totals = [0.0, 0.0, 0.0]
    bert_totals = [0.0, 0.0, 0.0]
    simcse_totals = [0.0, 0.0, 0.0]
    eval_data_size = 0

    for doc in data:
        logger.info(f"docid: {doc['docid']}")

        # Skip documents with no annotation block or an empty list of pairs.
        pairs = doc.get("cct_saar", {}).get("queries_aspects")
        if not pairs:
            continue
        eval_data_size += 1

        context = doc["doc"]
        queries_joint = " ".join(pair["question"] for pair in pairs)
        answers_joint = " ".join(pair["answer"] for pair in pairs)

        collections = (
            [context, answers_joint],
            [context, queries_joint],
            [queries_joint, answers_joint],
        )
        for position, sents in enumerate(collections):
            ngram_totals[position] += text_utils.ngram_vendi_score(
                sents=sents, ns=[1, 2]
            )
            bert_totals[position] += text_utils.embedding_vendi_score(
                sents=sents, model_path=bert_path
            )
            simcse_totals[position] += text_utils.embedding_vendi_score(
                sents=sents, model_path=simcse_path
            )

    # Dividing by NaN yields NaN averages instead of a ZeroDivisionError when
    # nothing was scored.
    denominator = eval_data_size if eval_data_size else float("nan")
    return (
        eval_data_size,
        tuple(total / denominator for total in ngram_totals),
        tuple(total / denominator for total in bert_totals),
        tuple(total / denominator for total in simcse_totals),
    )

In [None]:
import warnings

# Ignore UserWarning messages from here on (the filter is process-wide), then
# run the Vendi Score evaluation; its result is the cell's output.
warnings.simplefilter("ignore", UserWarning)
get_average_VENDIscores(data)

## ROUGE


In [None]:
from rouge import Rouge


def get_average_RougeScores(data):
    """Average ROUGE-L recall/precision/F over all documents that carry question/answer pairs.

    For every document with a non-empty ``cct_saar["queries_aspects"]`` list the
    questions and the answers are each joined with single spaces and three
    (hypothesis, reference) pairs are scored with ``Rouge(metrics=["rouge-l"])``.
    Documents without pairs are logged and skipped.

    Args:
        data: Iterable of document dicts with ``docid``, ``doc`` and, optionally,
            ``cct_saar["queries_aspects"]`` (list of question/answer dicts).

    Returns:
        ``(eval_data_size, qj_c, aj_c, qj_aj)``. Each of the last three has the
        shape ``[{"rouge-l": {"r": ..., "p": ..., "f": ...}}]`` and holds the
        averages for:

        * ``qj_c``  - concatenated questions (hypothesis) vs. context (reference)
        * ``aj_c``  - concatenated answers vs. context
        * ``qj_aj`` - concatenated questions vs. concatenated answers

        Every value is ``nan`` when no document could be scored (the original
        code raised ``ZeroDivisionError`` in that case).
    """
    metric_keys = ("r", "p", "f")

    mean_rougeL_qj_c = [{"rouge-l": {"r": 0.0, "p": 0.0, "f": 0.0}}]
    mean_rougeL_aj_c = [{"rouge-l": {"r": 0.0, "p": 0.0, "f": 0.0}}]
    mean_rougeL_qj_aj = [{"rouge-l": {"r": 0.0, "p": 0.0, "f": 0.0}}]
    accumulators = (mean_rougeL_qj_c, mean_rougeL_aj_c, mean_rougeL_qj_aj)

    eval_data_size = 0
    rouge_evaluator = Rouge(metrics=["rouge-l"])

    for doc in data:
        logger.info(f"docid: {doc['docid']}")

        # Skip documents with no annotation block or an empty list of pairs.
        pairs = doc.get("cct_saar", {}).get("queries_aspects")
        if not pairs:
            continue
        eval_data_size += 1

        context = doc["doc"]
        queries_joint = " ".join(pair["question"] for pair in pairs)
        answers_joint = " ".join(pair["answer"] for pair in pairs)

        comparisons = (
            (mean_rougeL_qj_c, queries_joint, context),
            (mean_rougeL_aj_c, answers_joint, context),
            (mean_rougeL_qj_aj, queries_joint, answers_joint),
        )
        for accumulator, hypothesis, reference in comparisons:
            # get_scores returns e.g.
            # [{'rouge-l': {'r': 0.22, 'p': 0.61, 'f': 0.33}}]
            rouge_l = rouge_evaluator.get_scores(hyps=hypothesis, refs=reference)[0][
                "rouge-l"
            ]
            # Each metric accumulates its OWN value. The previous code added the
            # "f" value to the "p" sum, so the reported precision was really F.
            for key in metric_keys:
                accumulator[0]["rouge-l"][key] += rouge_l[key]

    # Dividing by NaN yields NaN averages instead of a ZeroDivisionError when
    # nothing was scored.
    denominator = eval_data_size if eval_data_size else float("nan")
    for accumulator in accumulators:
        for key in metric_keys:
            accumulator[0]["rouge-l"][key] /= denominator

    return (eval_data_size, mean_rougeL_qj_c, mean_rougeL_aj_c, mean_rougeL_qj_aj)

In [None]:
# Run the ROUGE-L evaluation on the loaded corpus split; the result is the cell's output.
get_average_RougeScores(data)

## BLEU


In [None]:
from nltk.translate import bleu, bleu_score


def get_average_BlueScores(data):
    """Average unigram BLEU over all documents that carry question/answer pairs.

    For every document with a non-empty ``cct_saar["queries_aspects"]`` list the
    questions and the answers are each joined with single spaces, split on
    whitespace and scored with ``bleu(..., weights=(1,))``. Documents without
    pairs are logged and skipped.

    Args:
        data: Iterable of document dicts with ``docid``, ``doc`` and, optionally,
            ``cct_saar["queries_aspects"]`` (list of question/answer dicts).

    Returns:
        ``(eval_data_size, (aj_c, qj_c, qj_aj))`` with the averages for:

        * ``aj_c``  - concatenated answers (hypothesis) vs. context (reference)
        * ``qj_c``  - concatenated questions vs. context
        * ``qj_aj`` - concatenated questions vs. concatenated answers

        Every score is ``nan`` when no document could be scored (the original
        code raised ``ZeroDivisionError`` in that case).
    """
    mean_bleu_aj_c = 0.0
    mean_bleu_qj_c = 0.0
    mean_bleu_qj_aj = 0.0
    eval_data_size = 0

    for doc in data:
        logger.info(f"docid: {doc['docid']}")

        # Skip documents with no annotation block or an empty list of pairs.
        pairs = doc.get("cct_saar", {}).get("queries_aspects")
        if not pairs:
            continue
        eval_data_size += 1

        # Whitespace-tokenize each text once and reuse the token lists.
        context_tokens = doc["doc"].split()
        queries_tokens = " ".join(pair["question"] for pair in pairs).split()
        answers_tokens = " ".join(pair["answer"] for pair in pairs).split()

        mean_bleu_aj_c += bleu(
            references=[context_tokens], hypothesis=answers_tokens, weights=(1,)
        )
        mean_bleu_qj_c += bleu(
            references=[context_tokens], hypothesis=queries_tokens, weights=(1,)
        )
        mean_bleu_qj_aj += bleu(
            references=[answers_tokens], hypothesis=queries_tokens, weights=(1,)
        )

    # Dividing by NaN yields NaN averages instead of a ZeroDivisionError when
    # nothing was scored.
    denominator = eval_data_size if eval_data_size else float("nan")
    return (
        eval_data_size,
        (
            mean_bleu_aj_c / denominator,
            mean_bleu_qj_c / denominator,
            mean_bleu_qj_aj / denominator,
        ),
    )

In [None]:
# Run the BLEU evaluation on the loaded corpus split; the result is the cell's output.
get_average_BlueScores(data)

## METEOR


In [None]:
from nltk.translate import meteor
from nltk import word_tokenize


def get_average_MeteorScores(data):
    """Average METEOR scores over all documents that carry question/answer pairs.

    Two families of scores are computed for every document with a non-empty
    ``cct_saar["queries_aspects"]`` list:

    * joint scores on the space-joined questions / answers, and
    * per-pair scores, averaged over the pairs of the document first.

    All texts are tokenized with ``word_tokenize``. Documents without pairs are
    logged and skipped.

    Args:
        data: Iterable of document dicts with ``docid``, ``doc`` and, optionally,
            ``cct_saar["queries_aspects"]`` (list of question/answer dicts).

    Returns:
        ``(eval_data_size, (a_c, q_c, q_a), (aj_c, qj_c, qj_aj))`` where

        * ``a_c``   - each answer (hypothesis) vs. context (reference)
        * ``q_c``   - each question vs. context
        * ``q_a``   - each answer (hypothesis) vs. its question (reference)
        * ``aj_c``  - concatenated answers vs. context
        * ``qj_c``  - concatenated questions vs. context
        * ``qj_aj`` - concatenated questions (hypothesis) vs. concatenated answers

        Every score is ``nan`` when no document could be scored (the original
        code raised ``ZeroDivisionError`` in that case).
    """
    mean_meteor_q_a = 0.0
    mean_meteor_q_c = 0.0
    mean_meteor_a_c = 0.0
    mean_meteor_aj_c = 0.0
    mean_meteor_qj_c = 0.0
    mean_meteor_qj_aj = 0.0
    eval_data_size = 0

    for doc in data:
        logger.info(f"docid: {doc['docid']}")

        # Skip documents with no annotation block or an empty list of pairs.
        pairs = doc.get("cct_saar", {}).get("queries_aspects")
        if not pairs:
            continue
        eval_data_size += 1

        queries = [pair["question"] for pair in pairs]
        answers = [pair["answer"] for pair in pairs]

        # Tokenize every text exactly once; the context in particular was
        # previously re-tokenized for every single pair.
        context_tokens = word_tokenize(doc["doc"])
        queries_joint_tokens = word_tokenize(" ".join(queries))
        answers_joint_tokens = word_tokenize(" ".join(answers))

        mean_meteor_aj_c += meteor(
            references=[context_tokens], hypothesis=answers_joint_tokens
        )
        mean_meteor_qj_c += meteor(
            references=[context_tokens], hypothesis=queries_joint_tokens
        )
        mean_meteor_qj_aj += meteor(
            references=[answers_joint_tokens], hypothesis=queries_joint_tokens
        )

        # Per-pair scores, summed here and averaged over the pairs below.
        sum_a_q = 0.0
        sum_c_q = 0.0
        sum_c_a = 0.0
        for question, answer in zip(queries, answers):
            question_tokens = word_tokenize(question)
            answer_tokens = word_tokenize(answer)
            sum_a_q += meteor(references=[question_tokens], hypothesis=answer_tokens)
            sum_c_q += meteor(references=[context_tokens], hypothesis=question_tokens)
            sum_c_a += meteor(references=[context_tokens], hypothesis=answer_tokens)

        pair_count = len(pairs)
        mean_meteor_q_a += sum_a_q / pair_count
        mean_meteor_q_c += sum_c_q / pair_count
        mean_meteor_a_c += sum_c_a / pair_count

    # Dividing by NaN yields NaN averages instead of a ZeroDivisionError when
    # nothing was scored.
    denominator = eval_data_size if eval_data_size else float("nan")
    return (
        eval_data_size,
        (
            mean_meteor_a_c / denominator,
            mean_meteor_q_c / denominator,
            mean_meteor_q_a / denominator,
        ),
        (
            mean_meteor_aj_c / denominator,
            mean_meteor_qj_c / denominator,
            mean_meteor_qj_aj / denominator,
        ),
    )

In [None]:
# Run the METEOR evaluation on the loaded corpus split; the result is the cell's output.
get_average_MeteorScores(data)

## Jaccard Index


In [None]:
def jaccard_similarity(ref, hyp):
    """Return the Jaccard similarity ``|ref & hyp| / |ref | hyp|`` of two collections.

    Both arguments are converted to sets, so duplicates and order are ignored.
    Pass token lists for a word-level score; a plain string is treated as a
    collection of its characters.

    Args:
        ref: Iterable of hashable items (e.g. a list of tokens).
        hyp: Iterable of hashable items (e.g. a list of tokens).

    Returns:
        A float in ``[0.0, 1.0]``. ``0.0`` is returned when both inputs are
        empty (nothing is shared) instead of raising ``ZeroDivisionError``.
    """
    ref_items, hyp_items = set(ref), set(hyp)
    union = ref_items | hyp_items
    if not union:
        return 0.0
    return len(ref_items & hyp_items) / float(len(union))


def get_average_JaccardScores(data):
    """Average word-level Jaccard similarity over all documents that carry question/answer pairs.

    For every document with a non-empty ``cct_saar["queries_aspects"]`` list the
    questions and the answers are each joined with single spaces; the texts are
    then split on whitespace and compared with ``jaccard_similarity``. Documents
    without pairs are logged and skipped.

    The texts are tokenized before the comparison because ``jaccard_similarity``
    works on collections of items: passing the raw strings (as the previous
    code did) compares the sets of *characters* of the two texts.

    Args:
        data: Iterable of document dicts with ``docid``, ``doc`` and, optionally,
            ``cct_saar["queries_aspects"]`` (list of question/answer dicts).

    Returns:
        ``(eval_data_size, (aj_c, qj_c, qj_aj))`` with the averages for:

        * ``aj_c``  - context vs. concatenated answers
        * ``qj_c``  - context vs. concatenated questions
        * ``qj_aj`` - concatenated answers vs. concatenated questions

        Every score is ``nan`` when no document could be scored (the original
        code raised ``ZeroDivisionError`` in that case).
    """
    mean_aj_c = 0.0
    mean_qj_c = 0.0
    mean_qj_aj = 0.0
    eval_data_size = 0

    for doc in data:
        logger.info(f"docid: {doc['docid']}")

        # Skip documents with no annotation block or an empty list of pairs.
        pairs = doc.get("cct_saar", {}).get("queries_aspects")
        if not pairs:
            continue
        eval_data_size += 1

        # Whitespace tokens, the same tokenization the BLEU evaluation uses.
        context_tokens = doc["doc"].split()
        queries_tokens = " ".join(pair["question"] for pair in pairs).split()
        answers_tokens = " ".join(pair["answer"] for pair in pairs).split()

        mean_aj_c += jaccard_similarity(context_tokens, answers_tokens)
        mean_qj_c += jaccard_similarity(context_tokens, queries_tokens)
        mean_qj_aj += jaccard_similarity(answers_tokens, queries_tokens)

    # Dividing by NaN yields NaN averages instead of a ZeroDivisionError when
    # nothing was scored.
    denominator = eval_data_size if eval_data_size else float("nan")
    return (
        eval_data_size,
        (
            mean_aj_c / denominator,
            mean_qj_c / denominator,
            mean_qj_aj / denominator,
        ),
    )

In [None]:
# Run the Jaccard evaluation on the loaded corpus split; the result is the cell's output.
get_average_JaccardScores(data)

## Distinct-n


In [None]:
from aimw.app.services.eval.distinct_n import (
    distinct_n_sentence_level,
    distinct_n_corpus_level,

)


# NOTE(review): this redefines the `jaccard_similarity` already defined in the
# "Jaccard Index" cell and is not called anywhere in this cell; it looks like a
# copy-paste leftover and is a candidate for deletion.
def jaccard_similarity(ref, hyp):
    """Return the Jaccard index of two collections.

    The index is the size of the intersection divided by the size of the
    union.  Both arguments are converted to sets first, so duplicates and
    ordering are ignored.  Note that a ``str`` is iterated character by
    character; pass a list of tokens (e.g. ``text.split()``) to compare words.

    Args:
        ref: Iterable of hashable items (reference side).
        hyp: Iterable of hashable items (hypothesis side).

    Returns:
        A float between 0.0 and 1.0.  When both inputs are empty the union is
        empty as well and 0.0 is returned instead of raising
        ``ZeroDivisionError``.
    """
    ref_items = set(ref)
    hyp_items = set(hyp)
    union_cardinality = len(ref_items | hyp_items)
    if union_cardinality == 0:
        # Nothing on either side: there is no overlap to measure.
        return 0.0
    return len(ref_items & hyp_items) / float(union_cardinality)




def get_average_DistinctNScores(data):
    """Average distinct-1 / distinct-2 scores of the generated questions and answers.

    A document is evaluated only when ``doc["cct_saar"]["queries_aspects"]``
    exists and is non-empty.  For each such document eight scores are computed
    with the imported ``distinct_n_corpus_level`` / ``distinct_n_sentence_level``
    helpers:

    * corpus level over the list of answers and the list of questions
      (n = 1 and n = 2),
    * sentence level over all answers joined into one string and all questions
      joined into one string (n = 1 and n = 2).

    Args:
        data: Iterable of dicts with at least a ``"docid"`` key; evaluated
            documents also need ``"question"``/``"answer"`` entries inside
            ``queries_aspects``.

    Returns:
        ``(eval_data_size, means)`` where ``eval_data_size`` is the number of
        evaluated documents and ``means`` is the 8-tuple
        ``(a_dist1, q_dist1, a_dist2, q_dist2, aj_dist1, qj_dist1, aj_dist2,
        qj_dist2)`` of per-document averages.  If no document qualifies every
        mean is reported as 0.0 (with ``eval_data_size == 0``) instead of
        raising ``ZeroDivisionError``.
    """
    # Running sums, in the same order as the returned tuple.
    totals = [0.0] * 8
    eval_data_size = 0

    for doc in data:
        logger.info(f"docid: {doc['docid']}")
        queries_aspects = doc.get("cct_saar", {}).get("queries_aspects", [])
        if not queries_aspects:
            continue
        eval_data_size += 1

        queries = [qa["question"] for qa in queries_aspects]
        answers = [qa["answer"] for qa in queries_aspects]
        queries_joint = " ".join(queries)
        answers_joint = " ".join(answers)

        # NOTE(review): the helpers receive raw strings here. If they expect
        # token lists, the n-grams are built over characters rather than
        # words -- confirm against aimw.app.services.eval.distinct_n.
        doc_scores = (
            distinct_n_corpus_level(answers, 1),
            distinct_n_corpus_level(queries, 1),
            distinct_n_corpus_level(answers, 2),
            distinct_n_corpus_level(queries, 2),
            distinct_n_sentence_level(answers_joint, 1),
            distinct_n_sentence_level(queries_joint, 1),
            distinct_n_sentence_level(answers_joint, 2),
            distinct_n_sentence_level(queries_joint, 2),
        )
        totals = [total + score for total, score in zip(totals, doc_scores)]

    if eval_data_size == 0:
        # Nothing to average; eval_data_size == 0 tells the caller so.
        return (0, (0.0,) * len(totals))

    return (eval_data_size, tuple(total / eval_data_size for total in totals))

In [None]:
# Keep the (eval_data_size, 8-tuple of means) result in a variable and echo it
# as the cell output.
average_DistinctNScores = get_average_DistinctNScores(data)
average_DistinctNScores

## G-Eval / DeepEval - Hallucinations


In [None]:
import os
from aimw.app.core.ai_config import get_ai_settings

# Expose the project's OpenAI key through the environment for the DeepEval
# metrics used in the cells below.
os.environ["OPENAI_API_KEY"] = get_ai_settings().openai_api_key

# Results folder handed to DeepEval via its environment variable.
# NOTE(review): this is never changed afterwards, so the contextual-relevancy
# and faithfulness cells presumably write to the "hallucination" folder too.
os.environ["DEEPEVAL_RESULTS_FOLDER"] = "./output/deepeval/hallucination"

# Model identifier passed as `model=` to every DeepEval metric below.
# DeepEval Available GPT models: gpt-4o, gpt-4-turbo, gpt-4-turbo-preview,
# gpt-4-0125-preview, gpt-4-1106-preview, gpt-4, gpt-4-32k, gpt-4-0613,
# gpt-4-32k-0613, gpt-3.5-turbo-1106, gpt-3.5-turbo, gpt-3.5-turbo-16k,
# gpt-3.5-turbo-0125
model_name = "gpt-3.5-turbo-0125"

In [None]:
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase


def get_average_HallucinationScores(data):
    """Average DeepEval ``HallucinationMetric`` score over the evaluable documents.

    A document is evaluated only when ``doc["cct_saar"]["queries_aspects"]``
    exists and is non-empty.  All of its questions are joined into the test
    case ``input``, all of its answers into ``actual_output``, and the
    document text (``doc["doc"]``) is supplied as the single ``context``
    entry.  One metric is created and measured per document with the
    module-level ``model_name``; its score and reason are logged.

    NOTE(review): per the DeepEval documentation a lower hallucination score
    is better -- confirm for the installed version before comparing it with
    the other metrics.

    Args:
        data: Iterable of dicts with at least a ``"docid"`` key; evaluated
            documents also need ``"doc"`` and ``"question"``/``"answer"``
            entries inside ``queries_aspects``.

    Returns:
        ``(eval_data_size, mean_score)`` where ``eval_data_size`` is the
        number of evaluated documents.  If no document qualifies the mean is
        reported as 0.0 (with ``eval_data_size == 0``) instead of raising
        ``ZeroDivisionError``.
    """
    score_total = 0.0
    eval_data_size = 0

    for doc in data:
        logger.info(f"docid: {doc['docid']}")
        queries_aspects = doc.get("cct_saar", {}).get("queries_aspects", [])
        if not queries_aspects:
            continue
        eval_data_size += 1

        test_case = LLMTestCase(
            input=" ".join(qa["question"] for qa in queries_aspects),
            actual_output=" ".join(qa["answer"] for qa in queries_aspects),
            context=[doc["doc"]],
        )
        # A fresh metric per document keeps score/reason from carrying over.
        metric = HallucinationMetric(threshold=0.5, model=model_name)
        metric.measure(test_case)

        logger.info(metric.score)
        logger.info(metric.reason)

        score_total += metric.score

    if eval_data_size == 0:
        # Nothing to average; eval_data_size == 0 tells the caller so.
        return (0, 0.0)

    return (eval_data_size, score_total / eval_data_size)

In [None]:
# Runs one HallucinationMetric measurement per eligible document with
# `model_name`; the (eval_data_size, mean_score) result is the cell output.
# NOTE(review): presumably every measurement calls the OpenAI API, so
# re-running this cell may be slow and billable -- confirm.
get_average_HallucinationScores(data)

## DeepEval Contextual Relevancy


In [None]:
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase


def get_average_ContextualRelevancyScores(data):
    """Average DeepEval ``ContextualRelevancyMetric`` score over the evaluable documents.

    A document is evaluated only when ``doc["cct_saar"]["queries_aspects"]``
    exists and is non-empty.  All of its questions are joined into the test
    case ``input``, all of its answers into ``actual_output``, and the
    document text (``doc["doc"]``) is supplied as the single
    ``retrieval_context`` entry.  One metric is created and measured per
    document with the module-level ``model_name``; its score and reason are
    logged.

    Args:
        data: Iterable of dicts with at least a ``"docid"`` key; evaluated
            documents also need ``"doc"`` and ``"question"``/``"answer"``
            entries inside ``queries_aspects``.

    Returns:
        ``(eval_data_size, mean_score)`` where ``eval_data_size`` is the
        number of evaluated documents.  If no document qualifies the mean is
        reported as 0.0 (with ``eval_data_size == 0``) instead of raising
        ``ZeroDivisionError``.
    """
    score_total = 0.0
    eval_data_size = 0

    for doc in data:
        logger.info(f"docid: {doc['docid']}")
        queries_aspects = doc.get("cct_saar", {}).get("queries_aspects", [])
        if not queries_aspects:
            continue
        eval_data_size += 1

        # A fresh metric per document keeps score/reason from carrying over.
        metric = ContextualRelevancyMetric(
            threshold=0.7, model=model_name, include_reason=True
        )
        test_case = LLMTestCase(
            input=" ".join(qa["question"] for qa in queries_aspects),
            actual_output=" ".join(qa["answer"] for qa in queries_aspects),
            retrieval_context=[doc["doc"]],
        )
        metric.measure(test_case)

        logger.info(metric.score)
        logger.info(metric.reason)

        score_total += metric.score

    if eval_data_size == 0:
        # Nothing to average; eval_data_size == 0 tells the caller so.
        return (0, 0.0)

    return (eval_data_size, score_total / eval_data_size)

In [None]:
# Keep the (eval_data_size, mean_score) result in a variable and echo it as the
# cell output.
contextualRelevancyScores = get_average_ContextualRelevancyScores(data)
contextualRelevancyScores

## DeepEval Faithfulness


In [None]:
from deepeval import evaluate
from deepeval.metrics import (
    HallucinationMetric,
    ContextualRelevancyMetric,
    FaithfulnessMetric,
)
from deepeval.test_case import LLMTestCase


def get_average_FaithfulnessMetric(data):
    """Average DeepEval ``FaithfulnessMetric`` score over the evaluable documents.

    A document is evaluated only when ``doc["cct_saar"]["queries_aspects"]``
    exists and is non-empty.  All of its questions are joined into the test
    case ``input``, all of its answers into ``actual_output``, and the
    document text (``doc["doc"]``) is supplied as the single
    ``retrieval_context`` entry.  One metric is created and measured per
    document with the module-level ``model_name``; its score and reason are
    logged.

    Args:
        data: Iterable of dicts with at least a ``"docid"`` key; evaluated
            documents also need ``"doc"`` and ``"question"``/``"answer"``
            entries inside ``queries_aspects``.

    Returns:
        ``(eval_data_size, mean_score)`` where ``eval_data_size`` is the
        number of evaluated documents.  If no document qualifies the mean is
        reported as 0.0 (with ``eval_data_size == 0``) instead of raising
        ``ZeroDivisionError``.
    """
    score_total = 0.0
    eval_data_size = 0

    for doc in data:
        logger.info(f"docid: {doc['docid']}")
        queries_aspects = doc.get("cct_saar", {}).get("queries_aspects", [])
        if not queries_aspects:
            continue
        eval_data_size += 1

        # A fresh metric per document keeps score/reason from carrying over.
        metric = FaithfulnessMetric(
            threshold=0.7, model=model_name, include_reason=True
        )
        test_case = LLMTestCase(
            input=" ".join(qa["question"] for qa in queries_aspects),
            actual_output=" ".join(qa["answer"] for qa in queries_aspects),
            retrieval_context=[doc["doc"]],
        )
        metric.measure(test_case)

        logger.info(metric.score)
        logger.info(metric.reason)

        score_total += metric.score

    if eval_data_size == 0:
        # Nothing to average; eval_data_size == 0 tells the caller so.
        return (0, 0.0)

    return (eval_data_size, score_total / eval_data_size)

In [None]:
# Keep the (eval_data_size, mean_score) result in a variable and echo it as the
# cell output.
faithfulnessMetric = get_average_FaithfulnessMetric(data)
faithfulnessMetric