## Q 1.1: Language models

In [7]:
import nltk 
from nltk.probability import ConditionalFreqDist

# Europarl sample: lower-cased and split on whitespace, so punctuation stays
# attached to the neighbouring word.
with open('europarl.fr-en.short.en.txt', 'r', encoding='utf-8') as f:
    europarlText = f.read().lower().split()
# Brown corpus, lower-cased token by token.
# NOTE(review): needs the Brown corpus to be available to NLTK — confirm it is
# downloaded in the environment before running this cell.
brownText = nltk.Text(word.lower() for word in nltk.corpus.brown.words())

europarlBigrams = list(nltk.bigrams(europarlText))
brownBigrams = list(nltk.bigrams(brownText))

# Map each first word to the frequency distribution of the words following it.
europarlCfd = ConditionalFreqDist(europarlBigrams)
brownCfd = ConditionalFreqDist(brownBigrams)


def bigramProbability(cfd, context, word):
    """Return the relative frequency of `word` directly after `context`.

    Args:
        cfd: ConditionalFreqDist built from (first, second) bigram pairs.
        context: the first word of the bigram.
        word: the second word of the bigram.

    Returns:
        count(context, word) / count(context, *). FreqDist.freq yields 0 when
        `context` was never observed, so an unseen context does not raise
        ZeroDivisionError the way a manual division by the total would.
    """
    return cfd[context].freq(word)


probEuroparlAskYou = bigramProbability(europarlCfd, 'ask', 'you')
probEuroparlTheEuropean = bigramProbability(europarlCfd, 'the', 'european')

probBrownAskYou = bigramProbability(brownCfd, 'ask', 'you')
probBrownTheEuropean = bigramProbability(brownCfd, 'the', 'european')

print(f'The probability of occurrence of the word “European” following the word “the” in the Brown text is {probBrownTheEuropean}.')
print(f'The probability of occurrence of the word “European” following the word “the” in the Europarl text is {probEuroparlTheEuropean}.')

print(f'The probability of occurrence of the word “you” following the word “ask” in the Brown text is {probBrownAskYou}.')
print(f'The probability of occurrence of the word “you” following the word “ask” in the Europarl text is {probEuroparlAskYou}.')


The probability of occurrence of the word “European” following the word “the” in the Brown text is 0.0002286661616955596.
The probability of occurrence of the word “European” following the word “the” in the Europarl text is 0.03723404255319149.
The probability of occurrence of the word “you” following the word “ask” in the Brown text is 0.046875.
The probability of occurrence of the word “you” following the word “ask” in the Europarl text is 0.5.


## Q 1.3:  N-gram based MT evaluation

In [None]:
from collections import Counter
import string
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

# Translation to be scored, one string per verse.
translation = [
    'Thus the heavens and the earth were finished, and all the host of them',
    'And on the seventh day God finished his work which he had made, and he rested on the seventh day from all his work which he had made',
    'And he blessed the seventh day, and sanctified it: because that in it he had rested from all his work which God created to make.',
]

# First reference translation, aligned verse by verse with `translation`.
refHolyBible = [
    'Thus the heavens and the earth were finished, and all the host of them.',
    'And on the seventh day God ended his work which he had made; and he rested on the seventh day from all his work which he had made.',
    'And God blessed the seventh day, and sanctified it: because that in it he had rested from all his work which God created and made.',
]

# Second reference translation, aligned verse by verse with `translation`.
refOremusBible = [
    'Thus the heavens and the earth were finished, and all their multitude.',
    'And on the seventh day God finished the work that he had done, and he rested on the seventh day from all the work that he had done.',
    'So God blessed the seventh day and hallowed it, because on it God rested from all the work that he had done in creation.',
]

def getBigrams(sentence):
    """Return the list of adjacent token pairs of a sentence.

    The sentence is lower-cased first and then tokenized with
    `word_tokenize`; each consecutive pair of tokens becomes one tuple.
    """
    lowered = sentence.lower()
    words = word_tokenize(lowered)
    return [pair for pair in ngrams(words, 2)]

def bleuLike(translation,ref1,ref2):
    """Score each translated sentence by bigram overlap with two references.

    For sentence i, a bigram of translation[i] counts as a match when it
    occurs among the bigrams of ref1[i] or of ref2[i]. Every occurrence in the
    translation is counted; matches are not clipped to the number of times the
    bigram occurs in the references.

    Args:
        translation: sequence of translated sentences (strings).
        ref1: first reference, indexable in parallel with `translation`.
        ref2: second reference, indexable in parallel with `translation`.

    Returns:
        A tuple (scorePerSentence, matchesPerSentence, transAllBigrams):
        - scorePerSentence[i] is always [precision, matchCount, bigramCount];
          a sentence without bigrams yields [0.0, 0, 0] so callers can index
          every entry the same way.
        - matchesPerSentence[i] lists the matched bigrams in translation order.
        - transAllBigrams[i] lists all bigrams of translation[i].

    Raises:
        IndexError: if a reference has fewer sentences than `translation`.
    """
    transAllBigrams = []
    matchesPerSentence = []
    scorePerSentence = []
    for i, sentence in enumerate(translation):
        transBigrams = getBigrams(sentence)
        transAllBigrams.append(transBigrams)
        # Pool both references: a bigram found in either one is acceptable.
        refBigrams = set(getBigrams(ref1[i])) | set(getBigrams(ref2[i]))
        sharedBigrams = [bigram for bigram in transBigrams if bigram in refBigrams]
        matchesPerSentence.append(sharedBigrams)
        if len(transBigrams) > 0:
            precision = len(sharedBigrams) / len(transBigrams)
        else:
            # Sentence with fewer than two tokens: nothing to match.
            precision = 0.0
        scorePerSentence.append([precision, len(sharedBigrams), len(transBigrams)])
    return scorePerSentence, matchesPerSentence, transAllBigrams

# Characters LaTeX treats specially in running text, with their escaped forms.
_LATEX_ESCAPES = str.maketrans({
    '\\': r'\textbackslash{}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
})


def _escapeLatex(text):
    """Escape LaTeX special characters in `text` in a single pass."""
    return text.translate(_LATEX_ESCAPES)


def generateLatexTable(bigrams,matches,title):
    """Render a two-column LaTeX table marking which bigrams were matched.

    Args:
        bigrams: iterable of bigrams; each becomes one table row, shown via
            its string form.
        matches: container of matched bigrams; a row reads "yes" when its
            bigram is in `matches`, otherwise "no".
        title: caption text. It is inserted verbatim, so it may contain LaTeX
            markup and must already be valid LaTeX.

    Returns:
        The complete `table` environment as a string (no trailing newline).
    """
    rows = [
        "\\begin{table}[ht]",
        "\\centering",
        "\\begin{tabular}{|c|c|}",
        "\\hline",
        "Bigramm & Match (Yes/No) \\\\ \\hline",
    ]
    for bigram in bigrams:
        verdict = "yes" if bigram in matches else "no"
        # Tokens such as '%' or '&' would otherwise comment out or split the row.
        rows.append(f"{_escapeLatex(str(bigram))} & {verdict} \\\\ \\hline")
    rows.append("\\end{tabular}")
    rows.append(f"\\caption{{{title}}}")
    rows.append("\\end{table}")
    return "\n".join(rows)

def bleuScore(values):
    """Combine per-sentence precisions into one score via the geometric mean.

    Args:
        values: sized sequence of per-sentence entries. Each entry is either a
            sequence whose first element is the precision (for example
            ``[precision, matchCount, bigramCount]``) or a bare number that is
            itself the precision.

    Returns:
        The n-th root of the product of the n precisions, or 0.0 when
        `values` is empty (there is nothing to average).
    """
    count = len(values)
    if count == 0:
        # Guard against 1/0 in the exponent below.
        return 0.0
    product = 1
    for value in values:
        precision = value if isinstance(value, (int, float)) else value[0]
        product *= precision
    return product ** (1 / count)

scores,bigramMatches,transBigrams=bleuLike(translation,refHolyBible,refOremusBible)
# To export one verse's bigram table as LaTeX, uncomment:
# print(generateLatexTable(transBigrams[2],bigramMatches[2],'Verse 3'))
print(f'The overall BLEU Score is {bleuScore(scores)}')
# One report per verse, however many verses were scored.
for verseNumber, (verseScore, verseMatches) in enumerate(zip(scores, bigramMatches), start=1):
    precision, matchCount, bigramCount = verseScore
    print(f'The Verse {verseNumber} has {matchCount} matches out of {bigramCount} bigrams resulting in a score of {precision}')
    print(f'List of shared bigrams for Verse {verseNumber}',verseMatches)


The overall BLEU Score is 0.9112231495571782
The Verse 1 has 14 matches out of 14 bigrams resulting in a score of 1.0
List of shared bigrams for Verse 1 [('thus', 'the'), ('the', 'heavens'), ('heavens', 'and'), ('and', 'the'), ('the', 'earth'), ('earth', 'were'), ('were', 'finished'), ('finished', ','), (',', 'and'), ('and', 'all'), ('all', 'the'), ('the', 'host'), ('host', 'of'), ('of', 'them')]
The Verse 2 has 26 matches out of 28 bigrams resulting in a score of 0.9285714285714286
List of shared bigrams for Verse 2 [('and', 'on'), ('on', 'the'), ('the', 'seventh'), ('seventh', 'day'), ('day', 'god'), ('god', 'finished'), ('his', 'work'), ('work', 'which'), ('which', 'he'), ('he', 'had'), ('had', 'made'), (',', 'and'), ('and', 'he'), ('he', 'rested'), ('rested', 'on'), ('on', 'the'), ('the', 'seventh'), ('seventh', 'day'), ('day', 'from'), ('from', 'all'), ('all', 'his'), ('his', 'work'), ('work', 'which'), ('which', 'he'), ('he', 'had'), ('had', 'made')]
The Verse 3 has 22 matches ou