Install pip dependencies

In [11]:
%%capture
# %%capture (which has to stay on the first line of the cell) hides the pip output of this cell.
# Each package below is upgraded (-U) without a version pin, so reruns may pick up newer releases.
%pip install -U rouge-metric
%pip install -U bert-score
%pip install -U pyemd
%pip install -U nltk
%pip install -U prettytable
# numpy is the only pinned package and is installed after all the others.
# NOTE(review): presumably pinned for compatibility with the metric libraries - confirm.
%pip install -U numpy==1.25.0

In [12]:
import nltk
# Download the 'punkt_tab' NLTK resource; quiet=True suppresses the progress output.
# NOTE(review): presumably required for tokenization by one of the metric libraries - confirm which.
nltk.download('punkt_tab', quiet=True)

True

Import libraries

In [13]:
import os
from os import listdir
from os.path import isfile, join
import json

Import metrics

In [14]:
import numpy
# Sanity check of the installed numpy: report whether it exposes a `dtypes` attribute.
# NOTE(review): presumably one of the metric dependencies needs it - confirm.
print("np.dtypes exists?", hasattr(numpy, 'dtypes'))

np.dtypes exists? True


In [15]:
from rouge_metric import PyRouge
from bert_score import BERTScorer
from Metrics.UniEval.utils import convert_to_json
from Metrics.UniEval.metric.evaluator import get_evaluator
from Metrics.BARTScore.bart_score import BARTScorer
from Metrics.MoverScore import moverscore_v2
from Metrics.MoverScore import moverscore

In [16]:
def GetHypothesisAndReference(path: str, file, rag = False):
    """Load one model-response JSON file and split it into hypotheses and references.

    The file must be a JSON object with the keys ``model``, ``params`` and
    ``Qs&As``. Every entry of ``Qs&As`` must carry ``answer`` and ``gold``,
    plus ``RAG`` when ``rag`` is true.

    Args:
        path: Directory that contains ``file``.
        file: Name of the JSON file inside ``path``.
        rag: When true, keep only the entries whose ``RAG`` value is truthy.

    Returns:
        A tuple ``(hyps, refsROUGE, refsBERT, model)``: ``hyps`` is the list of
        answers, ``refsROUGE`` wraps each gold answer in a one-element list,
        ``refsBERT`` is the flat list of gold answers and ``model`` is the
        string ``"<model>;<params>b"``.

    Raises:
        FileNotFoundError: If the directory or the file does not exist.
        KeyError: If a required key is missing from the JSON document.
    """
    # A loader must not have side effects: a missing directory is reported by
    # open() instead of being silently created as an empty folder first.
    fullPath = os.path.join(path, file)
    with open(fullPath, encoding='utf-8') as f:
        data = json.load(f)

    model = data['model'] + ';' + str(data['params']) + 'b'

    hyps = []
    refsROUGE = []
    refsBERT = []

    for d in data['Qs&As']:
        # Without the RAG filter every entry is kept; with it only RAG entries are.
        if not rag or d['RAG']:
            hyps.append(d['answer'])
            refsROUGE.append([d['gold']])
            refsBERT.append(d['gold'])

    return hyps, refsROUGE, refsBERT, model


In [17]:
def FixRougeScores(ROUGEscores: dict):
    """Round recall, precision and F-score of every ROUGE entry to 4 decimals.

    The dictionary is modified in place and nothing is returned.

    Args:
        ROUGEscores: Mapping of metric name to a dict with the keys
            ``'r'``, ``'p'`` and ``'f'``.
    """
    for entry in ROUGEscores.values():
        for field in ('r', 'p', 'f'):
            entry[field] = round(entry[field], 4)

def GetROUGEforSeparateSentences(hyps, refs):
    """Print the ROUGE scores of every single pair whose ROUGE-L exceeds 0.2.

    Each hypothesis is scored on its own against the reference at the same
    index. Nothing is printed at all when the two lists differ in length.

    Args:
        hyps: List of hypothesis strings.
        refs: List of references, one entry per hypothesis.
    """
    scorer = PyRouge(rouge_n=(1, 2, 4), rouge_l=True, rouge_w=True,
        rouge_w_weight=1.2, rouge_s=True, rouge_su=True, skip_gap=4)

    # (printed label, key in the PyRouge result, field taken from that entry).
    # ROUGE-1 is read from recall ('r'); all other variants from the F-score.
    report = (
        ('ROUGE-1', 'rouge-1', 'r'),
        ('ROUGE-2', 'rouge-2', 'f'),
        ('ROUGE-4', 'rouge-4', 'f'),
        ('ROUGE-L', 'rouge-l', 'f'),
        ('ROUGE-W', 'rouge-w-1.2', 'f'),
        ('ROUGE-S', 'rouge-s4', 'f'),
        ('ROUGE-SU', 'rouge-su4', 'f'),
    )

    if len(hyps) == len(refs):
        for index, (hypothesis, reference) in enumerate(zip(hyps, refs)):
            raw = scorer.evaluate([hypothesis], [reference])
            finalScores = {label: raw[key][field] for label, key, field in report}
            if finalScores['ROUGE-L'] > 0.2:
                print(index, finalScores)
    del scorer

def GetTrueRougeScores(scores):
    """Reduce raw PyRouge output to one rounded number per ROUGE variant.

    Variants that are absent from ``scores`` (because they were not computed)
    are skipped instead of raising ``KeyError``.

    Args:
        scores: Mapping returned by ``PyRouge.evaluate``; each value is a dict
            with the keys ``'r'``, ``'p'`` and ``'f'``.

    Returns:
        Dict mapping the report labels (``'ROUGE-1'`` ... ``'ROUGE-SU'``) to
        values rounded to 4 decimals, in a fixed order.
    """
    # (report label, key in the PyRouge result, field taken from that entry).
    # ROUGE-1 is reported as recall ('r'), every other variant as F-score.
    # NOTE(review): confirm that recall for ROUGE-1 is intentional.
    report = (
        ('ROUGE-1', 'rouge-1', 'r'),
        ('ROUGE-2', 'rouge-2', 'f'),
        ('ROUGE-4', 'rouge-4', 'f'),
        ('ROUGE-L', 'rouge-l', 'f'),
        ('ROUGE-W', 'rouge-w-1.2', 'f'),
        ('ROUGE-S', 'rouge-s4', 'f'),
        ('ROUGE-SU', 'rouge-su4', 'f'),
    )
    return {
        label: round(scores[key][field], 4)
        for label, key, field in report
        if key in scores
    }

def GetRougeNOrders(metrics):
    """Return the n-gram orders requested through ``metrics``.

    ``'ROUGE'`` selects the full default set ``[1, 2, 4]``. Otherwise only
    names of the form ``'ROUGE-<digits>'`` contribute; names such as
    ``'ROUGE-L'`` or ``'ROUGE-SU'`` have no numeric order and are ignored
    rather than passed to ``int()``.
    """
    if 'ROUGE' in metrics:
        return [1, 2, 4]
    prefix = 'ROUGE-'
    return [
        int(metric[len(prefix):])
        for metric in metrics
        if metric.startswith(prefix) and metric[len(prefix):].isdecimal()
    ]

def GetRougeScores(hyps, refs, metrics):
    """Compute the requested ROUGE variants for the whole corpus.

    Args:
        hyps: List of hypothesis strings.
        refs: List of reference lists, one list per hypothesis.
        metrics: Metric names. ``'ROUGE'`` requests every variant; otherwise
            only the listed ``'ROUGE-*'`` variants are computed.

    Returns:
        Dict of rounded scores containing only the variants that were
        requested. Callers that already hold earlier ROUGE results should
        merge this dict into them rather than replace them.
    """
    everything = 'ROUGE' in metrics
    rouge = PyRouge(
        rouge_n=GetRougeNOrders(metrics),
        rouge_l=everything or 'ROUGE-L' in metrics,
        rouge_w=everything or 'ROUGE-W' in metrics,
        rouge_w_weight=1.2,
        rouge_s=everything or 'ROUGE-S' in metrics,
        rouge_su=everything or 'ROUGE-SU' in metrics,
        skip_gap=4,
    )

    ROUGEscores = rouge.evaluate(hyps, refs)
    del rouge
    return GetTrueRougeScores(ROUGEscores)

def GetBartScore(hyps, refs):
    """Return a corpus-level BARTScore F-value rounded to 4 decimals.

    Two directions are scored with ``facebook/bart-large-cnn`` on ``cuda:0``:
    hypotheses against single-element reference lists, and references against
    single-element hypothesis lists. The mean of each direction is rounded to
    4 decimals and the two means are combined as ``2*R*P / (R + P)``.

    Args:
        hyps: List of hypothesis strings.
        refs: List of reference strings, aligned with ``hyps``.
    """
    scorer = BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn')

    # NOTE(review): the original names suggest the first direction is treated
    # as recall and the second as precision - confirm against BARTScorer.
    recall_scores = scorer.multi_ref_score(hyps, [[ref] for ref in refs], batch_size=4)
    precision_scores = scorer.multi_ref_score(refs, [[hyp] for hyp in hyps], batch_size=4)
    del scorer

    # Each mean is rounded before the two are combined.
    recall = round(sum(recall_scores) / len(recall_scores), 4)
    precision = round(sum(precision_scores) / len(precision_scores), 4)

    f_value = (2 * recall * precision) / (recall + precision)
    return round(f_value, 4)

def GetBertScore(hyps, refs):
    """Return the mean BERTScore F1 over all pairs, rounded to 4 decimals.

    The scorer is created with ``lang='lv'``; precision and recall returned by
    the scorer are not used.

    Args:
        hyps: List of hypothesis strings.
        refs: List of reference strings, aligned with ``hyps``.
    """
    scorer = BERTScorer(lang='lv')

    _, _, f1_scores = scorer.score(hyps, refs, verbose=False)
    del scorer

    mean_f1 = f1_scores.mean().item()
    return round(mean_f1, 4)

def GetUniEvalScores(hyps, refs):
    """Return the mean UniEval 'consistency' score, rounded to 4 decimals.

    Uses the ``'fact'`` evaluator, with the hypotheses passed as outputs and
    the references passed as sources.

    Args:
        hyps: List of hypothesis strings.
        refs: List of reference strings, aligned with ``hyps``.
    """
    evaluator = get_evaluator('fact')
    samples = convert_to_json(output_list=hyps, src_list=refs)

    per_sample = evaluator.evaluate(samples)
    del evaluator

    consistency_total = sum([entry['consistency'] for entry in per_sample])
    return round(consistency_total / len(per_sample), 4)

def GetMoverScore(hyps, refs):
    """Return the mean MoverScore (v1 module) over all pairs, rounded to 4 decimals.

    IDF dictionaries are built separately for the hypotheses and for the
    references before the pairwise scores are computed with batch size 64.

    Args:
        hyps: List of hypothesis strings.
        refs: List of reference strings, aligned with ``hyps``.
    """
    hyp_idf = moverscore.get_idf_dict(hyps)
    ref_idf = moverscore.get_idf_dict(refs)

    # References come first in the call, followed by the hypotheses.
    pair_scores = moverscore.word_mover_score(refs, hyps, ref_idf, hyp_idf, batch_size=64)
    mean_score = sum(pair_scores) / len(pair_scores)
    return round(mean_score, 4)

def GetMoverScoreV2(hyps, refs):
    """Return the mean MoverScore (v2 module) over all pairs, rounded to 4 decimals.

    IDF dictionaries are built separately for the hypotheses and for the
    references before the pairwise scores are computed with batch size 64.

    Args:
        hyps: List of hypothesis strings.
        refs: List of reference strings, aligned with ``hyps``.
    """
    hyp_idf = moverscore_v2.get_idf_dict(hyps)
    ref_idf = moverscore_v2.get_idf_dict(refs)

    # References come first in the call, followed by the hypotheses.
    pair_scores = moverscore_v2.word_mover_score(refs, hyps, ref_idf, hyp_idf, batch_size=64)
    mean_score = sum(pair_scores) / len(pair_scores)
    return round(mean_score, 4)

In [18]:
def GetMissingMetrics(scores):
    """List the metrics that are not yet present in a stored result.

    Top-level metrics missing from ``scores`` come first, in a fixed order.
    When a ``'ROUGE'`` entry exists, every ROUGE variant missing from it is
    appended afterwards.

    Args:
        scores: Result dict as stored on disk (metric name -> value).

    Returns:
        List of missing metric names; empty when everything is present.
    """
    top_level = ('ROUGE', 'BERTScore', 'BARTScore', 'MoverScore', 'MoverScoreV2', 'UniEval')
    rouge_variants = ('ROUGE-1', 'ROUGE-2', 'ROUGE-4', 'ROUGE-L', 'ROUGE-W', 'ROUGE-S', 'ROUGE-SU')

    missing = [name for name in top_level if name not in scores]

    if 'ROUGE' in scores:
        stored_rouge = scores['ROUGE']
        missing.extend(variant for variant in rouge_variants if variant not in stored_rouge)

    return missing

In [19]:
import logging
import warnings
# Only errors from the "transformers" logger are shown, and every Python
# warning is ignored, to keep the cell output of the scoring runs readable.
_transformers_logger = logging.getLogger("transformers")
_transformers_logger.setLevel(logging.ERROR)
warnings.filterwarnings(action="ignore")

def CalculateScores(path, file, metrics = [], get_missing_metrics=True, rag = False):
    """Compute evaluation metrics for one response file and store them as JSON.

    Results are written to ``scores/scores_<model>_<file>`` (prefixed with
    ``RAG_`` when ``rag`` is true). An existing result file is loaded first and
    updated, so metrics that were computed earlier are kept.

    Args:
        path: Directory that contains the response file.
        file: Name of the response JSON file.
        metrics: Metric names to compute. Ignored when ``get_missing_metrics``
            is true. The default list is never mutated.
        get_missing_metrics: When true, compute exactly the metrics that the
            stored result does not contain yet.
        rag: When true, score only the RAG entries of the response file.
    """
    scoresPath = 'scores/'
    # refsROUGE - list of lists, refsBERT - list of strings
    hyps, refsROUGE, refsBERT, model = GetHypothesisAndReference(path, file, rag)

    # ':' and '/' are replaced because the model name becomes part of a file name.
    model = model.replace(':','_').replace('/',';')
    print(file)

    scoresFile = f'scores_{model}_{file}'
    scoresFile = 'RAG_' + scoresFile if rag else scoresFile
    savePath = scoresPath+scoresFile

    # Create the output folder before any metric runs, so a missing folder
    # cannot throw away the computed scores when they are saved.
    os.makedirs(scoresPath, exist_ok=True)

    try:
        with open(savePath, encoding='utf-8') as f:
            result = json.load(f)
    except (OSError, ValueError):
        # No readable or parseable earlier result: start from scratch.
        # Unlike a bare except, this lets KeyboardInterrupt through.
        result = {
            'model': model
        }

    print(result)

    if get_missing_metrics:
        metrics = GetMissingMetrics(result)

    if any([metric.startswith('ROUGE') for metric in metrics]):
        print('Calculating ROUGE')
        rougeScores = GetRougeScores(hyps, refsROUGE, metrics)
        storedRouge = result.get('ROUGE')
        if isinstance(storedRouge, dict):
            # Merge so that variants stored earlier survive a partial recomputation.
            storedRouge.update(rougeScores)
        else:
            result['ROUGE'] = rougeScores

    # The remaining metrics all take (hyps, refsBERT); the order is fixed.
    scorers = (
        ('BARTScore', GetBartScore),
        ('BERTScore', GetBertScore),
        ('MoverScore', GetMoverScore),
        ('MoverScoreV2', GetMoverScoreV2),
        ('UniEval', GetUniEvalScores),
    )
    for name, scorer in scorers:
        if name in metrics:
            print('Calculating ' + name)
            result[name] = scorer(hyps, refsBERT)

    with open(savePath, 'wt', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=4)

    # Debug aid: uncomment to print the per-pair ROUGE scores.
    # GetROUGEforSeparateSentences(hyps, refsROUGE)

In [21]:
# Alternative input location (kept for reference):
# path = '/content'
path = 'ModelResponses'

# Sorted so that every run processes the files in the same order (listdir
# order is arbitrary). Only .json files are response files; any other stray
# file in the folder would make json.load fail.
onlyfiles = sorted(
    f for f in listdir(path)
    if isfile(join(path, f)) and f.lower().endswith('.json')
)

for file in onlyfiles:
    CalculateScores(path,file,rag=True)

4096_CustomRAG_RAG_results_EuroLLM-9B-Instruct-LatLeg-14.9K-F16.json
{'model': 'EuroLLM-9B-Instruct-LatLeg-14.9K-F16;9b', 'ROUGE': {'ROUGE-1': 0.2994, 'ROUGE-2': 0.049, 'ROUGE-4': 0.0098, 'ROUGE-L': 0.167, 'ROUGE-W': 0.0704, 'ROUGE-S': 0.0335, 'ROUGE-SU': 0.0677}, 'BARTScore': -4.1077, 'BERTScore': 0.7072, 'MoverScore': 0.382, 'MoverScoreV2': 0.2378, 'UniEval': 0.596}
4096_CustomRAG_RAG_results_EuroLLM-9B-Instruct-LatLeg-14.9K-Q4_K_M.json
{'model': 'EuroLLM-9B-Instruct-LatLeg-14.9K-Q4_K_M;9b', 'ROUGE': {'ROUGE-1': 0.3207, 'ROUGE-2': 0.0408, 'ROUGE-4': 0.006, 'ROUGE-L': 0.153, 'ROUGE-W': 0.0661, 'ROUGE-S': 0.0277, 'ROUGE-SU': 0.0596}, 'BARTScore': -4.0704, 'BERTScore': 0.7093, 'MoverScore': 0.3894, 'MoverScoreV2': 0.2452, 'UniEval': 0.6053}
4096_CustomRAG_RAG_results_EuroLLM-9B-Instruct-LatLeg-32.4K-F16.json
{'model': 'EuroLLM-9B-Instruct-LatLeg-32.4K-F16;9b', 'ROUGE': {'ROUGE-1': 0.3334, 'ROUGE-2': 0.0375, 'ROUGE-4': 0.0061, 'ROUGE-L': 0.1512, 'ROUGE-W': 0.0663, 'ROUGE-S': 0.0279, 'ROU