In [None]:

import bert_score
import evaluate
import torch

# Pick the CUDA device when PyTorch reports it as available, otherwise the CPU,
# and report the choice.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Device being used:", device)

In [None]:

# BLEURT metric object loaded through the `evaluate` library. It is kept at
# notebook level because icl_results() and it_results() read the global name
# `bleurt` when scoring predictions.
bleurt = evaluate.load("bleurt", module_type='metric')

In [None]:
import ast
import pandas as pd

# BLEU metric object loaded through the `evaluate` library. It is kept at
# notebook level because icl_results() and it_results() read the global name
# `bleu` when scoring predictions.
bleu = evaluate.load('bleu')

# Number of leading characters skipped on every references/predictions line
# before the Python literal starts (presumably a fixed-width label written by
# the generation script - TODO confirm against the writer).
_LABEL_WIDTH = 11
# Lines consumed per evaluation run: references, predictions, and a third line
# that is never parsed.
_LINES_PER_RUN = 3
# Column order of the returned frames. Passed explicitly to the DataFrame
# constructor so a file without any run still yields the full schema.
_RESULT_COLUMNS = ['num_samples', 'num_demonstrations', 'bert_score',
                   'bleu-1', 'bleu-2', 'bleu-4', 'bleurt']
# BLEU variants to report: result column -> maximum n-gram order.
_BLEU_ORDERS = {'bleu-1': 1, 'bleu-2': 2, 'bleu-4': 4}


def _parse_literal(line, path, line_no):
    """Evaluate the Python literal that follows the skipped prefix of ``line``.

    Raises:
        ValueError: if the remainder of the line is not a valid literal; the
            message names the file and the 1-based line number.
    """
    try:
        return ast.literal_eval(line[_LABEL_WIDTH:])
    except (ValueError, SyntaxError) as exc:
        raise ValueError(
            f'{path}: line {line_no} does not contain a valid Python literal'
        ) from exc


def _score_run(reals, preds):
    """Score one run and return its metrics as a result row (dict).

    Uses ``bert_score`` plus the notebook-level ``bleu`` and ``bleurt`` metric
    objects. ``num_demonstrations`` is left as ``None`` for the caller to fill.
    """
    _, _, f1 = bert_score.score(preds, reals, lang="en")
    # BLEU expects a list of candidate references per prediction.
    refs = [[r] for r in reals]

    row = {
        'num_samples': len(preds),
        'num_demonstrations': None,
        'bert_score': float(f1.mean()),
    }
    for column, max_order in _BLEU_ORDERS.items():
        row[column] = bleu.compute(
            predictions=preds, references=refs, max_order=max_order
        )['bleu']

    bleurt_scores = bleurt.compute(predictions=preds, references=reals)['scores']
    row['bleurt'] = sum(bleurt_scores) / len(bleurt_scores)
    return row


def _score_output_file(path, count_demonstrations, verbose=False):
    """Read one output file and score every run it contains.

    The file is consumed in groups of three lines: references, predictions
    (each parsed after the skipped prefix) and one ignored line.

    Args:
        path: output file to read (UTF-8).
        count_demonstrations: when true, ``num_demonstrations`` is the 0-based
            position of the run in the file; otherwise it is ``None``.
        verbose: print the line count and the parsed lists of each run.

    Returns:
        A DataFrame with one row per run and the columns in ``_RESULT_COLUMNS``.

    Raises:
        ValueError: if a line cannot be parsed, or a run has no predictions or
            a different number of predictions and references.
    """
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    if verbose:
        print(len(lines))

    # Extra blank lines at the end of the file would otherwise be mistaken for
    # the start of another run and fail to parse.
    while lines and not lines[-1].strip():
        lines.pop()

    rows = []
    for start in range(0, len(lines) - 1, _LINES_PER_RUN):
        reals = _parse_literal(lines[start], path, start + 1)
        preds = _parse_literal(lines[start + 1], path, start + 2)
        if len(preds) == 0 or len(preds) != len(reals):
            raise ValueError(
                f'{path}: run starting at line {start + 1} has {len(reals)} '
                f'references and {len(preds)} predictions; both must be '
                f'non-empty and of equal length'
            )
        if verbose:
            print(reals)
            print(preds)

        row = _score_run(reals, preds)
        if count_demonstrations:
            row['num_demonstrations'] = start // _LINES_PER_RUN
        rows.append(row)

    return pd.DataFrame(rows, columns=_RESULT_COLUMNS)


def icl_results(icl_method, ds_name, model_name, verbose=False):
    """Score the in-context-learning outputs of one method/dataset/model.

    Reads ``icl_results/outputs/{icl_method}_{ds_name}_{model_name}.txt``. Each
    run in the file becomes one row whose ``num_demonstrations`` is the run's
    0-based position in the file.

    Args:
        icl_method: first component of the output file name.
        ds_name: dataset component of the output file name.
        model_name: model component of the output file name.
        verbose: print the line count and every parsed list (off by default to
            keep the notebook output small).

    Returns:
        DataFrame with columns ``num_samples``, ``num_demonstrations``,
        ``bert_score``, ``bleu-1``, ``bleu-2``, ``bleu-4`` and ``bleurt``.

    Raises:
        ValueError: on unparsable lines or empty/mismatched runs.
    """
    path = f'icl_results/outputs/{icl_method}_{ds_name}_{model_name}.txt'
    return _score_output_file(path, count_demonstrations=True, verbose=verbose)


def it_results(ds_name, model_name, verbose=False):
    """Score the instruction-tuning outputs of one dataset/model.

    Reads ``it_results/outputs/{ds_name}_{model_name}_it.txt`` in the same
    three-lines-per-run layout as ``icl_results``; ``num_demonstrations`` is
    ``None`` in every row.

    Args:
        ds_name: dataset component of the output file name.
        model_name: model component of the output file name.
        verbose: print the line count and every parsed list.

    Returns:
        DataFrame with the same columns as ``icl_results``.

    Raises:
        ValueError: on unparsable lines or empty/mismatched runs.
    """
    path = f'it_results/outputs/{ds_name}_{model_name}_it.txt'
    return _score_output_file(path, count_demonstrations=False, verbose=verbose)

datasets = ['ni', 'alpaca', 'medmcq','finance_sent', 'medqa', 'lawqa']


def collect_icl_results(icl_method, model_name, dataset_names):
    """Score one ICL method/model on several datasets and stack the results.

    Calls ``icl_results`` once per dataset, tags each returned frame with a
    ``dataset`` column and concatenates the frames row-wise (the per-dataset
    index values are kept, so index labels repeat across datasets).

    Args:
        icl_method: method name forwarded to ``icl_results``.
        model_name: model name forwarded to ``icl_results``.
        dataset_names: iterable of dataset names to score.

    Returns:
        One DataFrame holding the rows of all datasets.
    """
    frames = []
    for name in dataset_names:
        frame = icl_results(icl_method, name, model_name)
        frame['dataset'] = name
        frames.append(frame)
    return pd.concat(frames, axis=0)


# `big_icl_df` is displayed by later cells, so it has to be built here;
# otherwise a fresh-kernel run fails with NameError.
big_icl_df = collect_icl_results('random', 'gpt2_small', datasets)
big_icl_df2 = collect_icl_results('similarity', 'mistral', datasets)

In [None]:
# Display up to the first 100 rows of `big_icl_df`.
big_icl_df.head(100)

In [None]:
# Mean of every remaining column for each `num_demonstrations` value; the
# `dataset` column is dropped before grouping.
(
    big_icl_df
    .drop(columns=['dataset'])
    .groupby('num_demonstrations')
    .mean()
    .head(100)
)

In [None]:
# Display up to the first 100 rows of `big_icl_df2`.
big_icl_df2.head(100)

In [None]:
# Mean of every remaining column for each `num_demonstrations` value; the
# `dataset` column is dropped before grouping.
(
    big_icl_df2
    .drop(columns=['dataset'])
    .groupby('num_demonstrations')
    .mean()
    .head(100)
)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Hand-entered BLEU-4 values for the 'finance_sent' rows, keyed by the number
# of demonstrations.
# NOTE(review): these overwrite computed scores inside `big_icl_df2` itself and
# the reason is not recorded in the notebook - confirm before using the figure.
finance_sent_bleu4 = {0: 0.0, 1: 0.006, 2: 0.006}
for n_demos, score in finance_sent_bleu4.items():
    row_mask = (big_icl_df2['dataset'] == 'finance_sent') & (big_icl_df2['num_demonstrations'] == n_demos)
    big_icl_df2.loc[row_mask, 'bleu-4'] = score

# big_icl_df2.loc[(big_icl_df2['dataset'] == 'medqa') & (big_icl_df2['num_demonstrations'] == 0) , 'bert_score'] = 0.0

# Slice only AFTER the overrides above: boolean indexing returns copies, so
# slicing first would plot the un-patched values on the first run of this cell.
zero_shot = big_icl_df2[big_icl_df2['num_demonstrations'] == 0]
one_shot = big_icl_df2[big_icl_df2['num_demonstrations'] == 1]
two_shot = big_icl_df2[big_icl_df2['num_demonstrations'] == 2]

# Metric column to plot.
var = 'bleu-4'

# Instruction-tuning ('IT') scores per dataset; exactly one `it` mapping should
# be active and it should match `var`.
# it = {'ni': 0.878, 'alpaca': 0.883, 'medmcq': 0.859, 'finance_sent':0.949, 'medqa': 0.851, 'lawqa': 0.859} #bert
# # it = {'ni': 0.093, 'alpaca': 0.074, 'medmcq': 0.042, 'finance_sent':0.440, 'medqa': 0.019, 'lawqa': 0.06} #bleu
it =  {'ni': 0.016, 'alpaca': 0.064, 'medmcq': 0.002, 'finance_sent':0.130, 'medqa': 0.005, 'lawqa': 0.018} #bleu gpt2ap

# it =  {'ni': 0.816, 'alpaca': 0.883, 'medmcq': 0.805, 'finance_sent':0.821, 'medqa': 0.830, 'lawqa': 0.831} #bert gpt2ap

# Datasets in order of first appearance among the zero-shot rows; every bar
# series is re-indexed to this order so the groups line up.
datasets = zero_shot['dataset'].unique()
it_scores = pd.Series(it).reindex(datasets)

fig, ax = plt.subplots(figsize=(12, 6))
width = 0.2
x = range(len(datasets))

# One bar series per setting, drawn side by side within each dataset group.
bar_series = [
    ('Zero Shot', zero_shot.groupby('dataset')[var].mean().reindex(datasets)),
    ('One Shot', one_shot.groupby('dataset')[var].mean().reindex(datasets)),
    ('Two Shot', two_shot.groupby('dataset')[var].mean().reindex(datasets)),
    ('IT', it_scores),
]
for slot, (label, heights) in enumerate(bar_series):
    ax.bar([pos + slot * width for pos in x], heights, width, label=label)

ax.set_xlabel('Dataset')
ax.set_ylabel('BLEU')
# NOTE(review): the title and the file name below mention GPT-2, while
# `big_icl_df2` is built from the 'mistral' output files - confirm which model
# this figure is meant to show.
ax.set_title('GPT-2 ICL vs GPT-2 IT (AP)')

# Tick labels are looked up by dataset name instead of by position, so a
# missing or re-ordered dataset cannot shift the labels onto the wrong bars.
display_names = {'ni': 'NI', 'alpaca': 'AP', 'medmcq': 'MEDMCQ',
                 'finance_sent': 'ECSA', 'medqa': 'MEDQA', 'lawqa': 'LAWQA'}
category_labels = [display_names.get(name, name) for name in datasets]
# Centre each tick under its group of four bars.
ax.set_xticks([pos + 1.5*width for pos in x])
ax.set_xticklabels(category_labels, rotation=45)

ax.legend()

fig.tight_layout()
fig.savefig('gpt2_small_ap_bleu.png')
plt.show()
