In [13]:
import re
from nlgeval import NLGEval
from nlgeval import compute_individual_metrics
import json, pickle
import evaluate
from datasets import list_metrics

In [14]:
#!pip install git+https://github.com/Maluuba/nlg-eval.git@master
!pip install pycocoevalcap



In [3]:
#for metric in list_metrics():
#    print(metric)

In [4]:
#rouge = evaluate.load("rouge")
#bleu = evaluate.load("bleu")
#meteor = evaluate.load("meteor")

### Compute metrics BLEU-n and METEOR with NLTK

In [2]:
def extract_scores_from_rtf(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    # Use regular expressions to extract scores
    bleu_scores = re.findall(r'BLEU score -> (\d+\.\d+)', text)
    meteor_scores = re.findall(r'METEOR-score -> (\d+\.\d+)', text)
    # Convert strings to floats
    bleu_scores = [float(score) for score in bleu_scores]
    meteor_scores = [float(score) for score in meteor_scores]
    return bleu_scores, meteor_scores

# Example usage:
bleu3_scores, meteor_scores = extract_scores_from_rtf("C:/Users/admitos/Desktop/ThesisUU/Results/MLP_no_beam_search.rtf")
print("For MLP mapper with no beam search:")
print("     BLEU-3 zero-shot average score: {}".format(round(sum(bleu3_scores)/len(bleu3_scores),3)))
print("     METEOR zero-shot average score: {}".format(round(sum(meteor_scores)/len(meteor_scores),3)))
print("--------------------------------------------")
bleu3_scores, meteor_scores = extract_scores_from_rtf("C:/Users/admitos/Desktop/ThesisUU/Results/MLP_with_beam_search.rtf")
print("For MLP mapper with beam search:")
print("     BLEU-3 zero-shot average score: {}".format(round(sum(bleu3_scores)/len(bleu3_scores),3)))
print("     METEOR zero-shot average score: {}".format(round(sum(meteor_scores)/len(meteor_scores),3)))
print("--------------------------------------------")
bleu3_scores, meteor_scores = extract_scores_from_rtf("C:/Users/admitos/Desktop/ThesisUU/Results/Transformer_no_beam_search.rtf")
print("For Transformer mapper with no beam search:")
print("     BLEU-3 zero-shot average score: {}".format(round(sum(bleu3_scores)/len(bleu3_scores),3)))
print("     METEOR zero-shot average score: {}".format(round(sum(meteor_scores)/len(meteor_scores),3)))
print("--------------------------------------------")
bleu3_scores, meteor_scores = extract_scores_from_rtf("C:/Users/admitos/Desktop/ThesisUU/Results/Transformer_with_beam_search.rtf")
print("For Transformer mapper with beam search:")
print("     BLEU-3 zero-shot average score: {}".format(round(sum(bleu3_scores)/len(bleu3_scores),3)))
print("     METEOR zero-shot average score: {}".format(round(sum(meteor_scores)/len(meteor_scores),3)))

For MLP mapper with no beam search:
     BLEU-3 zero-shot average score: 0.429
     METEOR zero-shot average score: 0.155
--------------------------------------------
For MLP mapper with beam search:
     BLEU-3 zero-shot average score: 0.362
     METEOR zero-shot average score: 0.153
--------------------------------------------
For Transformer mapper with no beam search:
     BLEU-3 zero-shot average score: 0.617
     METEOR zero-shot average score: 0.298
--------------------------------------------
For Transformer mapper with beam search:
     BLEU-3 zero-shot average score: 0.625
     METEOR zero-shot average score: 0.308


### Compute also CIDEr with NLGeval library

In [5]:
# from cidereval import cider, ciderD

# # Candidate caption generated by an image captioning model.
# preds = ["A woman is throwing a frisbee in a park."]

# # Reference captions that are considered ground truth descriptions of the image.
# refs = ["A woman is throwing a frisbee in the park."]

# cider(predictions=preds, references=refs)
# #cider_scores is a dict-like object with "avg_score" and "scores"

In [4]:
# Example candidate captions
candidate_captions = ["A person riding a surf board on top of a wave.", "A woman standing on a beach with a surfboard."]

# Corresponding reference captions
#reference_captions = [ ["A surfer rides on a wave.", "A person is surfing in the sea."], 
#                      ["A woman is standing by the sea with a surfboard.", "A lady at the beach with a surfboard."] ]

reference_captions = [ ["A surfer rides on a wave.", "A person is surfing in the sea.", "A man surfes on the water"], 
                     ["A woman is standing by the sea with a surfboard.", "A lady at the beach with a surfboard.", "A woman is on the beach ready to do surfboard"] ]

reference_captions2 = [ "A surfer rides on a wave.", "A lady at the beach with a surfboard."]  

In [5]:
#metrics_dict = compute_individual_metrics(reference_captions[0][0], candidate_captions[0])

### Zero-shot results

In [9]:
file_path_transformer_no = 'C:/Users/admitos/Desktop/ThesisUU/Results/zero-shot/Transformer/generated_dict_no_beam.json'
file_path_transformer_with = 'C:/Users/admitos/Desktop/ThesisUU/Results/zero-shot/Transformer/generated_dict_with_beam.json'

file_path_mlp_no = 'C:/Users/admitos/Desktop/ThesisUU/Results/zero-shot/MLP/generated_dict_no_beam.json'
file_path_mlp_with = 'C:/Users/admitos/Desktop/ThesisUU/Results/zero-shot/MLP/generated_dict_with_beam.json'

file_path_original = 'C:/Users/admitos/Desktop/ThesisUU/Results/zero-shot/Transformer/original_dict.json'
#file_path_original = 'C:/Users/admitos/Desktop/ThesisUU/Results/zero-shot/MLP/original_dict.json'

with open(file_path_transformer_no, 'r') as file:
    capts_trans_no = json.load(file)

with open(file_path_transformer_with, 'r') as file:
    capts_trans_with = json.load(file)

with open(file_path_mlp_no, 'r') as file:
    capts_mlp_no = json.load(file)

with open(file_path_mlp_with, 'r') as file:
    capts_mlp_with = json.load(file)

with open(file_path_original, 'r') as file:
    capts_original = json.load(file)

In [10]:
reference_captions = list(capts_original.values())

In [11]:
# Initialize NLGEval object (it can be reused across multiple evaluations)
nlg = NLGEval(metrics_to_omit=['SPICE', 'SkipThoughtCS', 'GreedyMatchingScore', 'VectorExtremaCosineSimilarity', 'EmbeddingAverageCosineSimilairty'])

def evaluating_function(original_list, generated_list):
    
    all_scores = nlg.compute_metrics(original_list, generated_list)
    #indiv_scores = nlg.compute_individual_metrics(reference_captions[1], candidate_captions[1])
    print(all_scores)
    #print(indiv_scores)

Will try with limiting Meteor to 1GB of memory but this might cause issues.
If you have problems using Meteor, then you can try to lower the `mem` variable in meteor.py


In [12]:
#Loop through your candidates and references to compute scores
# for candidate, references in zip(candidate_captions, reference_captions):
#     metrics_dict = nlg.compute_individual_metrics(references, candidate)
#     print("Bleu_1 Score:", metrics_dict['Bleu_1'])
#     print("Bleu_2 Score:", metrics_dict['Bleu_2'])
#     print("Bleu_3 Score:", metrics_dict['Bleu_3'])
#     print("Bleu_4 Score:", metrics_dict['Bleu_4'])
#     print("METEOR Score:", metrics_dict['METEOR'])
#     print("CIDEr Score:", metrics_dict['CIDEr'])
#     print()

candidate_captions = list(capts_trans_with.values())
evaluating_function(reference_captions, candidate_captions)

{'Bleu_1': 0.3147882129129884, 'Bleu_2': 0.12355194602085695, 'Bleu_3': 0.05723158958890914, 'Bleu_4': 0.028954815721220162, 'METEOR': 0.0870333508383237, 'ROUGE_L': 0.22679411434816524, 'CIDEr': 0.03762877675970936}


In [5]:
candidate_captions = list(capts_trans_no.values())
evaluating_function(reference_captions, candidate_captions)

{'Bleu_1': 0.3222847948511601, 'Bleu_2': 0.12121009215529327, 'Bleu_3': 0.05239393364311375, 'Bleu_4': 0.025491233031249255, 'METEOR': 0.08460612497190928, 'ROUGE_L': 0.2266402211373565, 'CIDEr': 0.03384571251694265}


In [11]:
candidate_captions = list(capts_mlp_with.values())
evaluating_function(reference_captions, candidate_captions)

{'Bleu_1': 0.30614964479238366, 'Bleu_2': 0.11545630441117481, 'Bleu_3': 0.046207660055967055, 'Bleu_4': 0.02231163618308306, 'METEOR': 0.07832517036996398, 'ROUGE_L': 0.21414408712604802, 'CIDEr': 0.028366967541854853}


In [12]:
candidate_captions = list(capts_mlp_no.values())
evaluating_function(reference_captions, candidate_captions)

{'Bleu_1': 0.32719582773742006, 'Bleu_2': 0.12297632179760183, 'Bleu_3': 0.05081132452866725, 'Bleu_4': 0.02403849166840267, 'METEOR': 0.08682122283421075, 'ROUGE_L': 0.22944874491053605, 'CIDEr': 0.03094926602833083}


### Fine-tuning

In [13]:
#### --------------------------- Load Results from epoch 10/10 (last epoch) -------------------------------
trans_prefix_gpt_e9 = 'C:/Users/admitos/Desktop/ThesisUU/Results/fine-tuning/Transformer/trans_prefix_GPT_e9.pkl'
trans_prefix_only_e9 = 'C:/Users/admitos/Desktop/ThesisUU/Results/fine-tuning/Transformer/trans_prefix_only_e9.pkl'

mlp_prefix_gpt_e9 = 'C:/Users/admitos/Desktop/ThesisUU/Results/fine-tuning/MLP/mlp_prefix_GPT_e9.pkl'
mlp_prefix_only_e9 = 'C:/Users/admitos/Desktop/ThesisUU/Results/fine-tuning/MLP/mlp_prefix_only_e9.pkl'

with open(trans_prefix_gpt_e9, 'rb') as file:
    data_trans_e9_gpt = pickle.load(file)

with open(trans_prefix_only_e9, 'rb') as file:
    data_trans_e9_only = pickle.load(file)

with open(mlp_prefix_gpt_e9, 'rb') as file:
    data_mlp_e9_gpt = pickle.load(file)

with open(mlp_prefix_only_e9, 'rb') as file:
    data_mlp_e9_only = pickle.load(file)

In [14]:
#### --------------------------- Load Results from epoch 6/10 -------------------------------
trans_prefix_gpt_e5 = 'C:/Users/admitos/Desktop/ThesisUU/Results/fine-tuning/Transformer/trans_prefix_GPT_e5.pkl'
trans_prefix_only_e5 = 'C:/Users/admitos/Desktop/ThesisUU/Results/fine-tuning/Transformer/trans_prefix_only_e5.pkl'

mlp_prefix_gpt_e5 = 'C:/Users/admitos/Desktop/ThesisUU/Results/fine-tuning/MLP/mlp_prefix_GPT_e5.pkl'
mlp_prefix_only_e5 = 'C:/Users/admitos/Desktop/ThesisUU/Results/fine-tuning/MLP/mlp_prefix_only_e5.pkl'

with open(trans_prefix_gpt_e5, 'rb') as file:
    data_trans_e5_gpt = pickle.load(file)

with open(trans_prefix_only_e5, 'rb') as file:
    data_trans_e5_only = pickle.load(file)

with open(mlp_prefix_gpt_e5, 'rb') as file:
    data_mlp_e5_gpt = pickle.load(file)

with open(mlp_prefix_only_e5, 'rb') as file:
    data_mlp_e5_only = pickle.load(file)

In [16]:
def correcting_func(mydata):
  new_data = {}
  for key in mydata.keys():
    if key in capts_original.keys():
      new_data[key] = mydata[key]

  return new_data

In [17]:
### ------------ Correction for e9 ----------------
data_trans_e9_gpt = correcting_func(data_trans_e9_gpt)
data_trans_e9_only = correcting_func(data_trans_e9_only)
data_mlp_e9_gpt = correcting_func(data_mlp_e9_gpt)
data_mlp_e9_only = correcting_func(data_mlp_e9_only)

4992


In [18]:
### ------------ Correction for e5 ----------------
data_trans_e5_gpt = correcting_func(data_trans_e5_gpt)
data_trans_e5_only = correcting_func(data_trans_e5_only)
data_mlp_e5_gpt = correcting_func(data_mlp_e5_gpt)
data_mlp_e5_only = correcting_func(data_mlp_e5_only)

In [21]:
##### ---------- Zero-shot ------------ #####
# candidate_captions1 = list(data1.values())
# candidate_captions2 = list(data2.values())
# candidate_captions3 = list(data3.values())
# candidate_captions4 = list(data4.values())
##### ----------- Original ----------- #####
# reference_captions = list(data5.values())


##### --------- Fine-tuning (e9) --------- #####
candidate_captions_e9_tr_gpt = list(data_trans_e9_gpt.values())
candidate_captions_e9_tr_only = list(data_trans_e9_only.values())
candidate_captions_e9_mlp_gpt = list(data_mlp_e9_gpt.values())
candidate_captions_e9_mlp_only = list(data_mlp_e9_only.values())

##### --------- Fine-tuning (e5) --------- #####
candidate_captions_e5_tr_gpt = list(data_trans_e5_gpt.values())
candidate_captions_e5_tr_only = list(data_trans_e5_only.values())
candidate_captions_e5_mlp_gpt = list(data_mlp_e5_gpt.values())
candidate_captions_e5_mlp_only = list(data_mlp_e5_only.values())

### For epoch 10:

In [22]:
evaluating_function(reference_captions, candidate_captions_e9_tr_gpt)

{'Bleu_1': 0.2506777641553375, 'Bleu_2': 0.08593600284915895, 'Bleu_3': 0.03322308385141391, 'Bleu_4': 0.014654140738642157, 'METEOR': 0.07986446368126571, 'ROUGE_L': 0.1854841116454741, 'CIDEr': 0.023879093348784127}


In [23]:
evaluating_function(reference_captions, candidate_captions_e9_tr_only)

{'Bleu_1': 0.23818789116969488, 'Bleu_2': 0.08130207280426228, 'Bleu_3': 0.027632748363143805, 'Bleu_4': 0.011666219835587547, 'METEOR': 0.07812673659736982, 'ROUGE_L': 0.18563350520754726, 'CIDEr': 0.019060848182743333}


In [24]:
evaluating_function(reference_captions, candidate_captions_e9_mlp_gpt)

{'Bleu_1': 0.2509768818962655, 'Bleu_2': 0.08520342019286431, 'Bleu_3': 0.032187505121328557, 'Bleu_4': 0.01402291493831359, 'METEOR': 0.08038749655701444, 'ROUGE_L': 0.1858470497707727, 'CIDEr': 0.023836409882343763}


In [25]:
evaluating_function(reference_captions, candidate_captions_e9_mlp_only)

{'Bleu_1': 0.20266787504601938, 'Bleu_2': 0.06974205964621807, 'Bleu_3': 0.02559184949116316, 'Bleu_4': 0.011076429794661188, 'METEOR': 0.07459261488664136, 'ROUGE_L': 0.18177541226752716, 'CIDEr': 0.020796451208892754}


### For epoch 6:

In [26]:
evaluating_function(reference_captions, candidate_captions_e5_tr_gpt)

{'Bleu_1': 0.24878512753998708, 'Bleu_2': 0.08460214326005787, 'Bleu_3': 0.031604235290285734, 'Bleu_4': 0.013853554754794785, 'METEOR': 0.07943185484179349, 'ROUGE_L': 0.1860154827262495, 'CIDEr': 0.022795289634907803}


In [27]:
evaluating_function(reference_captions, candidate_captions_e5_tr_only)

{'Bleu_1': 0.24472811671087127, 'Bleu_2': 0.08409955158022811, 'Bleu_3': 0.03099841910839339, 'Bleu_4': 0.012887547084310723, 'METEOR': 0.07874180063540699, 'ROUGE_L': 0.1888872697748369, 'CIDEr': 0.020575613869880927}


In [28]:
evaluating_function(reference_captions, candidate_captions_e5_mlp_gpt)

{'Bleu_1': 0.2456343999192449, 'Bleu_2': 0.08234270734142898, 'Bleu_3': 0.03122141934275247, 'Bleu_4': 0.013462877431879055, 'METEOR': 0.07992151276995967, 'ROUGE_L': 0.18456010328124256, 'CIDEr': 0.02364662755438925}


In [29]:
evaluating_function(reference_captions, candidate_captions_e5_mlp_only)

{'Bleu_1': 0.22763526295875552, 'Bleu_2': 0.08126273766109549, 'Bleu_3': 0.031676766021828916, 'Bleu_4': 0.014646608975552188, 'METEOR': 0.07851058946348491, 'ROUGE_L': 0.19419798526844712, 'CIDEr': 0.02454063893209737}


In [13]:
def _strip(s):
    return s.strip()

#ref_list = [list(map(_strip, refs)) for refs in zip(*reference_captions)]
refs = {idx: strippedlines for (idx, strippedlines) in enumerate(reference_captions)}
hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(candidate_captions)}

In [14]:
#print((ref_list))
print((hyps))
print((refs))
for refs in zip(*reference_captions):
    print(refs)

{0: ['A person riding a surf board on top of a wave.'], 1: ['A woman standing on a beach with a surfboard.']}
{0: ['A surfer rides on a wave.', 'A person is surfing in the sea.', 'A man surfes on the water'], 1: ['A woman is standing by the sea with a surfboard.', 'A lady at the beach with a surfboard.', 'A woman is on the beach ready to do surfboard']}


In [15]:
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

In [39]:
annotation_file = 'C:/Users/admitos/Desktop/ThesisUU/DII-annotation/test.description-in-isolation.json'
#annotation_file = 'C:/Users/admitos/Desktop/ThesisUU/Results/captions_val2014.json'

results_file = 'C:/Users/admitos/Desktop/ThesisUU/Results/captions_val2014_fakecap_results.json'

In [36]:
with open(results_file, 'rb') as f:
    my_annot_file = json.load(f)

In [37]:
# dict_keys(['info', 'images', 'licenses', 'type', 'annotations'])
print((my_annot_file))

[{'image_id': 404464, 'caption': 'black and white photo of a man standing in front of a building'}, {'image_id': 380932, 'caption': 'group of people are on the side of a snowy field'}, {'image_id': 565778, 'caption': 'train traveling down a train station'}, {'image_id': 431573, 'caption': 'red fire hydrant sitting on a park bench in front of a road'}, {'image_id': 322226, 'caption': 'black and white cat is sitting on top of a wooden bench'}, {'image_id': 237669, 'caption': 'baseball player swinging a bat at a game'}, {'image_id': 351053, 'caption': 'laptop computer sitting on top of a table'}, {'image_id': 344860, 'caption': 'zebra standing on top of a lush green field'}, {'image_id': 40102, 'caption': 'group of giraffes standing next to each other in a grassy field'}, {'image_id': 95427, 'caption': 'close up of a pile of oranges sitting on a table'}, {'image_id': 510755, 'caption': 'couple of a motorcycle parked in front of a lush green field'}, {'image_id': 399012, 'caption': 'close 

In [40]:
# create coco object and coco_result object
coco = COCO(annotation_file)
coco_result = coco.loadRes(results_file)

loading annotations into memory...
Done (t=0.41s)
creating index...


TypeError: list indices must be integers or slices, not str

In [30]:
coco_eval = COCOEvalCap(coco, coco_result)

# evaluate on a subset of images by setting
# coco_eval.params['image_id'] = coco_result.getImgIds()
# please remove this line when evaluating the full validation set
coco_eval.params['image_id'] = coco_result.getImgIds()

# evaluate results
# SPICE will take a few minutes the first time, but speeds up due to caching
coco_eval.evaluate()

# print output evaluation scores
for metric, score in coco_eval.eval.items():
    print(f'{metric}: {score:.3f}')

tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 9893, 'reflen': 9855, 'guess': [9893, 8893, 7893, 6893], 'correct': [5732, 2510, 1043, 423]}
ratio: 1.003855910705124
Bleu_1: 0.579
Bleu_2: 0.404
Bleu_3: 0.279
Bleu_4: 0.191
computing METEOR score...
METEOR: 0.195
computing Rouge score...
ROUGE_L: 0.396
computing CIDEr score...
CIDEr: 0.600
computing SPICE score...


CalledProcessError: Command '['java', '-jar', '-Xmx2G', 'spice-1.0.jar', 'c:\\Python37\\lib\\site-packages\\pycocoevalcap\\spice\\tmp\\tmpmndwyw7k', '-cache', 'c:\\Python37\\lib\\site-packages\\pycocoevalcap\\spice\\cache', '-out', 'c:\\Python37\\lib\\site-packages\\pycocoevalcap\\spice\\tmp\\tmpdaksa6la', '-subset', '-silent']' returned non-zero exit status 1.