In [13]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.gleu_score import sentence_gleu
import os
from nltk.translate.meteor_score import meteor_score #REQUIRES LATEST NLTK 3.4.5 INSTALLED
"""
DOWNLOAD THIS REPO FROM https://github.com/tylin/coco-caption
PUT coco_caption_master FOLDER INTO WORKING DIR
"""
from coco_caption_master.pycocoevalcap.rouge.rouge import Rouge 
from coco_caption_master.pycocoevalcap.meteor.meteor import Meteor
from coco_caption_master.pycocoevalcap.cider.cider import Cider
from statistics import mean

In [2]:
def bleu_score_comp(ref, pred):
    """Return the best smoothed sentence-level BLEU score among the candidates.

    Args:
        ref: References handed unchanged to ``sentence_bleu`` (in this
            notebook's example: a list of tokenized sentences).
        pred: Iterable of tokenized candidate sentences; each one is scored
            against ``ref`` separately.

    Returns:
        The highest score obtained. The starting value ``0`` is returned when
        ``pred`` is empty or when no candidate scores strictly above it.
    """
    smoothing = SmoothingFunction().method4
    equal_weights = (0.25, 0.25, 0.25, 0.25)

    best = 0
    for candidate in pred:
        candidate_score = sentence_bleu(
            ref,
            candidate,
            weights=equal_weights,
            smoothing_function=smoothing,
        )
        # Only a strictly larger score replaces the current best.
        if candidate_score > best:
            best = candidate_score

    return best

In [3]:
def gleu_score_comp(ref, pred):
    """Return the best sentence-level GLEU score among the candidates.

    Args:
        ref: References handed unchanged to ``sentence_gleu`` (in this
            notebook's example: a list of tokenized sentences).
        pred: Iterable of tokenized candidate sentences; each one is scored
            against ``ref`` separately with ``min_len=1`` and ``max_len=4``.

    Returns:
        The highest score obtained, or ``0`` when ``pred`` is empty.
    """
    best = 0
    for candidate in pred:
        candidate_score = sentence_gleu(ref, candidate, min_len=1, max_len=4)
        # Keep the running best only when it is strictly larger; on a tie the
        # newest score wins.
        best = best if best > candidate_score else candidate_score

    return best

In [4]:
def meteor_score_comp(ref, pred):
    """Return the best METEOR score among the candidate sentences.

    Every reference in ``ref`` is forwarded to ``meteor_score``. Earlier code
    used only ``ref[0]``, which silently dropped any additional references and
    raised ``IndexError`` for an empty ``ref`` even when there was nothing to
    score.

    Args:
        ref: List of tokenized reference sentences (lists of word strings).
        pred: Iterable of tokenized candidate sentences; each one is scored
            against all references separately.

    Returns:
        The highest score obtained, or ``0`` when ``pred`` is empty.
    """
    # meteor_score is called with plain strings here, so tokens are re-joined
    # with single spaces.
    # NOTE(review): newer NLTK releases appear to expect pre-tokenized input
    # instead of strings -- confirm against the installed version.
    references = [" ".join(tokens) for tokens in ref]

    best = 0
    for candidate in pred:
        hypothesis = " ".join(candidate)
        best = max(meteor_score(references, hypothesis), best)

    return best

In [5]:
def rouge_score_comp(ref, pred):
    """Return the best ROUGE score (pycocoevalcap ``Rouge``) among the candidates.

    Every reference in ``ref`` is forwarded to the scorer. Earlier code used
    only ``ref[0]``, which silently dropped any additional references.

    Args:
        ref: List of tokenized reference sentences (lists of word strings).
        pred: Iterable of tokenized candidate sentences; each one is scored
            against all references separately.

    Returns:
        The highest score obtained, or ``0`` when ``pred`` is empty.
    """
    # compute_score takes {id: [sentence, ...]} dicts; a single dummy id (0)
    # is used because one reference set is scored at a time.
    references = {0: [" ".join(tokens) for tokens in ref]}

    # One scorer is reused for every candidate (assumes compute_score keeps no
    # state between calls -- confirm if the local pycocoevalcap copy is patched).
    scorer = Rouge()

    best = 0
    for candidate in pred:
        # A fresh single-sentence hypothesis dict per candidate.
        hypothesis = {0: [" ".join(candidate)]}
        candidate_score, _ = scorer.compute_score(references, hypothesis)
        best = max(best, candidate_score)

    return best

In [14]:
def cider_score_comp(ref, pred):
    """Return the best CIDEr score (pycocoevalcap ``Cider``) among the candidates.

    Every reference in ``ref`` is forwarded to the scorer. Earlier code used
    only ``ref[0]``, which silently dropped any additional references.

    NOTE(review): CIDEr weights n-grams by document frequency over the whole
    corpus. Scoring one reference set at a time (a corpus of size one) may make
    every score collapse to 0 -- verify before relying on this value.

    Args:
        ref: List of tokenized reference sentences (lists of word strings).
        pred: Iterable of tokenized candidate sentences; each one is scored
            against all references separately.

    Returns:
        The highest score obtained, or ``0`` when ``pred`` is empty.
    """
    # compute_score takes {id: [sentence, ...]} dicts; a single dummy id (0)
    # is used because one reference set is scored at a time.
    references = {0: [" ".join(tokens) for tokens in ref]}

    # One scorer is reused for every candidate (assumes compute_score keeps no
    # state between calls -- confirm if the local pycocoevalcap copy is patched).
    scorer = Cider()

    best = 0
    for candidate in pred:
        # A fresh single-sentence hypothesis dict per candidate.
        hypothesis = {0: [" ".join(candidate)]}
        candidate_score, _ = scorer.compute_score(references, hypothesis)
        best = max(best, candidate_score)

    return best

In [17]:
"""
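Input format for total_test_score: a list of (references, candidates) tuples.
Each element of a tuple is a list of tokenized sentences (lists of word strings).
Example: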
"""
# Pair 1: a single reference and a single candidate that matches it exactly.
reference1 = [
    ["this", "is", "a", "test"],
]
candidate1 = [
    ["this", "is", "a", "test"],
]

# Pair 2: a single reference and two candidates of different lengths.
reference2 = [
    ["the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog"],
]
candidate2 = [
    ["the", "quick"],
    ["the", "quick", "brown", "fox"],
]

# One (references, candidates) tuple per test case.
test_list = [
    (reference1, candidate1),
    (reference2, candidate2),
]

def total_test_score(test_list):
    """Average each metric's best-candidate score over all test pairs.

    Args:
        test_list: Iterable of ``(ref, pred)`` pairs; every pair is passed to
            each of the five ``*_score_comp`` helpers.

    Returns:
        A 5-tuple of means in the order
        ``(bleu, gleu, meteor, rouge, cider)``.

    Raises:
        statistics.StatisticsError: if ``test_list`` yields no pairs, because
            ``mean`` is then called on an empty list.
    """
    # Scorers listed in the order they are invoked for each pair.
    # TODO: the CIDEr scorer is reported as not working; it needs debugging /
    # discussion on whether it is required.
    scorers = (
        ("bleu", bleu_score_comp),
        ("gleu", gleu_score_comp),
        ("cider", cider_score_comp),
        ("rouge", rouge_score_comp),
        ("meteor", meteor_score_comp),
    )
    collected = {metric: [] for metric, _ in scorers}

    for ref, pred in test_list:
        for metric, scorer in scorers:
            collected[metric].append(scorer(ref, pred))

    # The return order differs from the invocation order above.
    report_order = ("bleu", "gleu", "meteor", "rouge", "cider")
    return tuple(mean(collected[metric]) for metric in report_order)