(a) Warmup. As a warmup, write code to collect statistics about word frequencies in the two languages. Print the 10 most frequent words in each language.

If you're working with Python, using a CounterLinks to an external site. is probably the easiest solution.

Let's assume that we pick a word completely randomly from the European parliament proceedings. According to your estimate, what is the probability that it is speaker? What is the probability that it is zebra?

In [413]:
import pandas as pd
from collections import Counter
import numpy as np
import string

In [414]:
# Function to process file and count occurrences of words, also returns a list of words
def setup(file_path):
    word_counter = Counter()
    to_remove = ['apos', '.']
    # Open file and iterate over lines
    with open(file_path, 'r') as file:
        for line in file:
            # Split line into words
            #line = line.translate(str.maketrans('', '', string.punctuation))
            
            words = line.split()
            words = [word.lower() for word in words if word not in string.punctuation]
            # Update Counter with words
            word_counter.update(words)
  
    with open(file_path, 'r') as file:
        file_to_list = file.read().split('\n')
        for i, sentence in enumerate(file_to_list):
            words = sentence.split()
            words = [word.lower() for word in words if word not in string.punctuation]
            file_to_list[i] = ' '.join(words)
            
    return word_counter, file_to_list

In [415]:
english_de_file_path = 'dat410_europarl/europarl-v7.de-en.lc.en'
german_file_path = 'dat410_europarl/europarl-v7.de-en.lc.de'

english_de_word_counter, english_de_list = setup(english_de_file_path)
german_en_word_counter, german_en_list = setup(german_file_path)


In [416]:
english_sv_file_path = 'dat410_europarl/europarl-v7.sv-en.lc.en'
swedish_file_path = 'dat410_europarl/europarl-v7.sv-en.lc.sv'

english_sv_word_counter, english_sv_list = setup(english_sv_file_path)
swedish_en_word_counter, swedish_en_list = setup(swedish_file_path)

In [417]:
english_fr_file_path = 'dat410_europarl/europarl-v7.fr-en.lc.en'
french_en_file_path = 'dat410_europarl/europarl-v7.fr-en.lc.fr'

english_fr_word_counter, english_fr_list = setup(english_fr_file_path)
french_en_word_counter, french_en_list = setup(french_en_file_path)

In [418]:
#merge the english word counters
english_word_counter = english_de_word_counter + english_sv_word_counter + english_fr_word_counter

print(english_word_counter.most_common(10))
print(german_en_word_counter.most_common(10))
print(swedish_en_word_counter.most_common(10))
print(french_en_word_counter.most_common(10))

[('the', 58790), ('of', 28406), ('to', 26842), ('and', 21459), ('in', 18485), ('is', 13331), ('that', 13219), ('a', 13090), ('we', 9936), ('this', 9916)]
[('die', 10521), ('der', 9374), ('und', 7028), ('in', 4175), ('zu', 3168), ('den', 2976), ('wir', 2863), ('daß', 2738), ('ich', 2670), ('das', 2669)]
[('att', 9181), ('och', 7038), ('i', 5949), ('det', 5687), ('som', 5028), ('för', 4959), ('av', 4013), ('är', 3840), ('en', 3724), ('vi', 3211)]
[('&apos;', 16729), ('de', 14520), ('la', 9746), ('et', 6619), ('l', 6536), ('le', 6174), ('les', 5585), ('à', 5500), ('des', 5232), ('que', 4797)]


In [419]:

# P(word in all languages)
def probability_of_word(word, wordcounters):
    return sum([(word_counter[word]) for word_counter in wordcounters])/ sum([sum(word_counter.values()) for word_counter in wordcounters])
   

In [420]:
# Some examples of word probabilities
print("\"Speaker\" all languages", probability_of_word('speaker', [english_word_counter, swedish_en_word_counter, german_en_word_counter, french_en_word_counter]))
print("\"Speaker\" in English", probability_of_word('speaker', [english_word_counter]))
print("\"The\" all languages",probability_of_word('the', [english_word_counter, swedish_en_word_counter, german_en_word_counter, french_en_word_counter]))
print("\"The\" in English",probability_of_word('the', [english_word_counter]))
print("\"The\" in french",probability_of_word('theatos', [swedish_en_word_counter]))
print("\"The\" in German",probability_of_word('the', [german_en_word_counter]))
print("\"Zebra\" all languages",probability_of_word('zebra', [english_word_counter, swedish_en_word_counter, german_en_word_counter, french_en_word_counter]))
print("\"Zebra\" in English",probability_of_word('zebra', [english_word_counter]))

"Speaker" all languages 2.1349591802274315e-05
"Speaker" in English 4.23327120259538e-05
"The" all languages 0.038048854335616875
"The" in English 0.07541636787896436
"The" in french 4.799909238079862e-05
"The" in German 2.9430065755175486e-05
"Zebra" all languages 0.0
"Zebra" in English 0.0


Implement a bigram model and use it to calculate the probability of a sentence

In [429]:
#implement a bigram model
def count_bigrams(language_list):
    bigram_counter = Counter()

    # Open file and iterate over lines
    for sentence in language_list:
        # Split sentence into words
        words = sentence.split(' ')
        words = [word.lower() for word in words]
        
        # Update Counter with words
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        bigram_counter.update(bigrams)

    return bigram_counter

# nr such bigrams/ all other bigrams starting with the same word
def probability_of_bigram_in_language(bigram, bigram_counter, word_counter):
    return bigram_counter[bigram] / word_counter[bigram[0]] if word_counter[bigram[0]] > 0 else 0

#probability of sentence is the product of the probabilities of the bigrams and the probability of the word itself
def probability_of_sentence(sentence, bigram_counter, word_counter):
    words = sentence.split()
    words = [word.lower() for word in words]
    bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
    probabilities = []
    #probabilities.append(probability_of_word(words[0], word_counter))
    for bigram in bigrams:
        # If the word doesn't exist yet, we assume a very low probability
        prob = probability_of_bigram_in_language(bigram, bigram_counter, word_counter)
        #probability_of_word(bigram[1], [word_counter])
     
        probabilities.append(prob if prob > 0.000000001 else 0.000001)
        #probabilities.append(prob if prob > 0.1 else 0.1)

    return np.prod(probabilities)

In [426]:
#use it to compute the probability of a short sentence
english_de_bigram_counter = count_bigrams(english_de_list)
print("Probability of \"The speaker is speaking.\" in English", probability_of_sentence("The speaker is speaking.", english_de_bigram_counter, english_de_word_counter))
print("Probability of \"The speaker is speaking.\" in German", probability_of_sentence("The speaker is speaking.", count_bigrams(german_en_list), german_en_word_counter))
print("Probability of \"Of the speaker\" in English", probability_of_sentence("Of the speaker", english_de_bigram_counter, english_de_word_counter))

#What happens if you try to compute the probability of a sentence that contains a word that did not appear in the training texts? 
print("Probability of \"The speaker is speaking Gobbledygook\" in English", probability_of_sentence("The speaker is speaking Gobbledygook", english_de_bigram_counter, english_de_word_counter))

# And what happens if your sentence is very long (e.g. 100 words or more)? 
long_sentence = "The case of Alexander Nikitin has garnered significant attention due to its complex legal implications and broader implications for environmental activism Alexander Nikitin a former naval officer turned environmental activist found himself embroiled in a legal battle with the Russian government over allegations of espionage and divulging state secrets Nikitin's involvement in environmental organizations particularly his work with the Bellona Foundation raised concerns among Russian authorities leading to his arrest and subsequent trial The case sparked international outcry with human rights organizations and environmental activists rallying behind Nikitin viewing his prosecution as an attempt to silence dissent and undermine environmental advocacy Despite facing immense pressure and legal challenges Nikitin persevered ultimately vindicated when the charges against him were dropped The case of Alexander Nikitin serves as a poignant reminder of the importance of protecting environmental activists' rights and the need for robust legal safeguards to uphold freedom of speech and environmental advocacy"
print("Probability of long sentence in English", probability_of_sentence(long_sentence, english_de_bigram_counter, english_de_word_counter))


Probability of "The speaker is speaking." in English 4.5804953347655015e-12
Probability of "The speaker is speaking." in German 9.999999999999999e-19
Probability of "Of the speaker" in English 1.6186135072199902e-05
Probability of "The speaker is speaking Gobbledygook" in English 4.580495334765501e-18
Probability of long sentence in English 0.0


We will now estimate the parameters of the translation model P(f|e)

In [369]:
# Give t values only to those f and e which are in the same indexed sentences are needed
# Initialize t(f|e) randomly
def initialize_parameters(source_list, target_list):
    t_parameters = {}
    for i in range(len(source_list)):
        sentence1 = source_list[i].split(' ')
        sentence2 = target_list[i].split(' ')
        for word1 in sentence1:
            if word1 not in t_parameters:
                t_parameters[word1] = {}
            for word2 in sentence2:
                if word2 not in t_parameters[word1]:
                    t_parameters[word1][word2] = np.random.rand()
    return t_parameters

Implement the EM algorithm

In [370]:
#estimation algorithm for IBM model 1
def EM(list_language1, list_language2):
   
    #initialize t(e|f) uniformly
    t = initialize_parameters(list_language1, list_language2)
    print('initialization done')


    # 10 iterations of the EM algorithm
    for i in range(10):
        #set count(e|f) to 0 for all e,f
        count = {}
        for j in range(len(list_language1)):
            sentence1 = list_language1[j].split(' ')
            sentence2 = list_language2[j].split(' ')
            for word1 in sentence1:
                if word1 not in count:
                    count[word1] = {}
                for word2 in sentence2:
                    count[word1][word2] = 0
        
        #set count(f) to 0 for all f
        total = {}
        for sentence2 in list_language2:
            for word2 in sentence2.split(' '):
                total[word2] = 0

        #for all sentence pairs (e_s,f_s)
        for j in range(len(list_language1)):
            sentence1 = list_language1[j].split(' ')
            sentence2 = list_language2[j].split(' ')
            #for all words e in e_s
            for word1 in sentence1:
                denominator = 0
                temp_counts = {}
                #for all words f in f_s
                for word2 in sentence2:
                    denominator += t[word1][word2]
                    temp_counts[word2] = t[word1][word2]

                if denominator > 0:
                    for word2 in sentence2:
                        count[word1][word2] += temp_counts[word2]/denominator
                        total[word2] += temp_counts[word2]/denominator
        #for all f in F
        for j in range(len(list_language1)):
            sentence1 = list_language1[j].split(' ')
            sentence2 = list_language2[j].split(' ')
            #for all e in E
            for word1 in sentence1:
                #t(e|f) = count(e|f)/total(f)
                for word2 in sentence2:
                    if total[word2] > 0:
                        t[word1][word2] = count[word1][word2]/total[word2]

        #print for german the ten words 'european' is most likely to be translated to
        word_to_translate = 'european'
        if word_to_translate in t:
            # Sort the target dictionary items based on values in descending order
            sorted_target_items = sorted(t[word_to_translate].items(), key=lambda x: x[1], reverse=True)
    
            # Print the top 10 words
            print(f"{word_to_translate}: {sorted_target_items[:10]}")
        else:
            print(f"{word_to_translate} not found in the dictionary.")
        print('iteration', i, 'done')
    return t

In [397]:
t_english_ger = EM(english_de_list, german_en_list)

initialization done
european: [('zwei-klassen-gesellschaft', 0.17768634714976164), ('fbi', 0.16916896958119834), ('flugsicherung', 0.1607652312477656), ('grenzenlos', 0.15215617209540824), ('csu-europaabgeordneten', 0.15078337929486102), ('aufrechterhalten', 0.147624136070217), ('funktionsfähigkeit', 0.1372640944380807), ('zentraler', 0.13503498832778854), ('kulturraumes', 0.1318015910046665), ('botschaften', 0.12939807660288574)]
iteration 0 done
european: [('europäischer', 0.17794985173252004), ('mitteleuropas', 0.17768655741629086), ('funktionsfähigkeit', 0.15640529098798506), ('nordeuropäischer', 0.15428607375227596), ('fbi', 0.15411094527079552), ('zentraler', 0.14445579068280814), ('aufrechterhalten', 0.13730133215826335), ('grenzenlos', 0.13529870938773148), ('rechtshängig', 0.13462078724479248), ('flugsicherung', 0.13291707468698552)]
iteration 1 done
european: [('europäischer', 0.3958234864454802), ('europäisches', 0.24108895437649797), ('europäischen', 0.23486666979191687), (

In [386]:
#t = EM(english_de_list, german_en_list)
t_ger = EM(german_en_list, english_de_list)

initialization done
european not found in the dictionary.
iteration 0 done
european not found in the dictionary.
iteration 1 done
european not found in the dictionary.
iteration 2 done
european not found in the dictionary.
iteration 3 done
european not found in the dictionary.
iteration 4 done
european not found in the dictionary.
iteration 5 done
european not found in the dictionary.
iteration 6 done
european not found in the dictionary.
iteration 7 done
european not found in the dictionary.
iteration 8 done
european not found in the dictionary.
iteration 9 done


Define and implement an algorithm to find a translation, given a sentence in the source language. That is, you should try to find
E* = argmaxE P(E|F)

In [406]:
#translation of a sentence is calculated by product of sentence probability p(t) and the translation probability p(e|t)
def translate_sentence(sentence, t, word_counter_bigrams, word_counter):
    words = sentence.split(' ')
    words = [word.lower() for word in words]

    translated_sentence = []
    translated_sentence.append('NULL')

    for word in words:
        probabilities = []
        top_ten_translations = sorted(
            [(key, inner_dict[word]) for key, inner_dict in t.items() if word in inner_dict],
            key=lambda x: x[1],  # Sorting key based on the value (second element of the tuple)
            reverse=True
            )[:10]  # Selecting the top ten
        #top_ten_translations = sorted(t[word].items(), key=lambda x: x[1], reverse=True)
        
        for word1, t_value in top_ten_translations:
            temp_sentence = ' '.join(translated_sentence)
            probabilities.append(probability_of_sentence(temp_sentence + ' ' + word1, word_counter_bigrams, word_counter)*t_value)
        
            #probabilities.append(value)
            
        translated_sentence.append(top_ten_translations[np.argmax(probabilities)][0])

    return translated_sentence[1:]


In [430]:
#translating sentence with english as source language and german as target language
german_en_word_counter_bigrams = count_bigrams(german_en_list)
translation = translate_sentence('I am a european speaking here', t_ger, german_en_word_counter_bigrams, german_en_word_counter)
translation = ' '.join(translation)
print('I am a european speaking here')
print(translation)
translation = translate_sentence('A cat was here today and spoke about some things', t_ger, german_en_word_counter_bigrams, german_en_word_counter)
translation = ' '.join(translation)
print('A cat was here today and spoke about some things')
print(translation)

I am a european speaking here
ich ich eine europäischen spreche hier
A cat was here today and spoke about some things
eine ausgeht wurde hier heute und gesprochen über einige dinge
