In [33]:
# imports
import numpy as np

In [36]:
# Task 0. Unigram BLEU score
def uni_bleu(references, sentence):
    """
    Calculates the unigram BLEU score for a sentence.

    Args:
        references: list of reference translations
            - Each reference translation is a list of the words in the
              translation
        sentence: list containing the model proposed sentence

    Returns:
        the unigram BLEU score, i.e. the clipped unigram precision
        multiplied by the brevity penalty. Returns 0.0 when the sentence or
        the list of references is empty.

    Notes:
        Clipped precision, P = m / c:
            m: for every distinct word in the sentence, the number of times
               it occurs in the sentence, capped at the highest number of
               times it occurs in any single reference, summed over words
            c: length of the machine translated sentence

        Brevity penalty, BP:
            1 if c >= r, else e^(1 - r/c)
            r: length of the reference sentence that is closest to the
               length of the machine translated sentence (the shorter
               reference wins a tie)

        Example:
            machine_translation: "there is a cat here"
            ref1: "the cat is on the mat"
            ref2: "there is a cat on the mat"

            In our candidate, [there, is, a, cat] each appear once in a
            reference, so m = 4. The candidate is 5 long, so P = 4/5.
            ref1 (6 long) is closest in length to the candidate, so r = 6.

            We finally end up with (4/5) * (e^(1-(6/5)))
    """
    candidate_len = len(sentence)

    # Without a candidate or any reference there is nothing to match, and
    # the ratios below would be undefined.
    if candidate_len == 0 or len(references) == 0:
        return 0.0

    # Clipped counts: a repeated word is credited as often as it appears in
    # the sentence, but never more often than in the best single reference.
    clipped_matches = 0
    for word in set(sentence):
        max_ref_count = max(ref.count(word) for ref in references)
        clipped_matches += min(sentence.count(word), max_ref_count)
    precision = clipped_matches / candidate_len

    # Reference length closest to the candidate length; sorting on the
    # (distance, length) pair makes the shorter reference win a tie.
    closest_ref_len = min(
        (len(ref) for ref in references),
        key=lambda ref_len: (abs(ref_len - candidate_len), ref_len),
    )

    if candidate_len >= closest_ref_len:
        brevity_penalty = 1.0
    else:
        brevity_penalty = np.exp(1 - closest_ref_len / candidate_len)

    return precision * brevity_penalty

In [37]:
# 0-main: score the docstring example against its two references
references = [
    "the cat is on the mat".split(),
    "there is a cat on the mat".split(),
]
sentence = "there is a cat here".split()

print(uni_bleu(references, sentence))

0.6549846024623855
