In [2]:
# Extract the German-English corpus archive into the working directory
import requests

filename = 'de-en.tgz'
# ...extract the gzipped tar archive
import tarfile

# NOTE(review): extractall() writes whatever member paths the archive contains;
# only run this on an archive from a trusted source.
# The context manager closes the archive even if extraction fails part-way.
with tarfile.open(filename, mode='r') as tar:
    tar.extractall()

In [3]:
# Extract the French-English corpus archive into the working directory
import requests

filename = 'fr-en.tgz'
# ...extract the gzipped tar archive
import tarfile

# NOTE(review): extractall() writes whatever member paths the archive contains;
# only run this on an archive from a trusted source.
# The context manager closes the archive even if extraction fails part-way.
with tarfile.open(filename, mode='r') as tar:
    tar.extractall()

### Problem 1 - choosing French to English

In [23]:
# Load Data
# load doc into memory
def load_doc(filename):
    """Read an entire UTF-8 text file into memory.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete file contents as one string.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decoding error); the manual close() did not.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [24]:
# split a loaded document into sentences
def to_sentences(doc):
    """Split a document into lines after trimming surrounding whitespace.

    Args:
        doc: The full document text.

    Returns:
        A list with one string per newline-separated line.
    """
    trimmed = doc.strip()
    return trimmed.split('\n')

In [25]:
# shortest and longest sentence lengths
def sentence_lengths(sentences):
    """Report the smallest and largest token count among the sentences.

    Tokens are obtained by splitting each sentence on whitespace.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A (shortest, longest) tuple of token counts.
    """
    token_counts = []
    for sentence in sentences:
        token_counts.append(len(sentence.split()))
    shortest = min(token_counts)
    longest = max(token_counts)
    return shortest, longest

In [46]:
# Load the English side of the French-English corpus and report how many
# sentences it holds plus the shortest/longest sentence length in tokens.
# The results are stored in the notebook-level names filename, doc, sentences,
# minlen and maxlen.
filename = 'fr-en.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=1000, min=1, max=79


In [47]:
# Load the French side of the French-English corpus and report how many
# sentences it holds plus the shortest/longest sentence length in tokens.
# This rebinds the notebook-level names filename, doc, sentences, minlen and
# maxlen, so any values they held before this cell are replaced.
filename = 'fr-en.fr'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('French data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

French data: sentences=1000, min=2, max=80


In [1]:
# Remove punctuation and convert words to lower case
def clean_lines(lines):
    """Normalise sentences to lower-case, ASCII-only, purely alphabetic tokens.

    Each line is decomposed with Unicode NFD and stripped of non-ASCII
    characters, split on whitespace, lower-cased, stripped of punctuation and
    non-printable characters, and finally filtered so that only tokens made
    entirely of letters remain.

    Args:
        lines: Iterable of sentence strings.

    Returns:
        A list of cleaned sentences (tokens re-joined with single spaces), in
        the same order as the input. A sentence with no surviving token
        becomes the empty string.
    """
    # Imported here because the notebook's import cell does not provide these
    # names; without them the function fails with NameError on a fresh kernel.
    import re
    import string
    from unicodedata import normalize

    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for line in lines:
        # decompose accented characters, then drop everything outside ASCII
        line = normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        # tokenize on white space
        line = line.split()
        # convert to lower case
        line = [word.lower() for word in line]
        # remove punctuation from each token
        line = [word.translate(table) for word in line]
        # remove non-printable chars from each token
        line = [re_print.sub('', w) for w in line]
        # keep only purely alphabetic tokens; this drops tokens containing
        # digits as well as tokens left empty by the steps above
        line = [word for word in line if word.isalpha()]
        # store as string
        cleaned.append(' '.join(line))
    return cleaned

In [2]:
# save a list of clean sentences to file
def save_clean_sentences(sentences, filename):
    """Pickle a list of sentences to disk and print a confirmation line.

    Args:
        sentences: The object to serialise (a list of cleaned sentences).
        filename: Path of the file to create or overwrite.
    """
    # Imported here because the notebook's import cell does not provide dump.
    from pickle import dump

    # The context manager flushes and closes the file; the original passed an
    # anonymous open() handle to dump() and never closed it.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

In [14]:
from __future__ import print_function
import sys
from collections import defaultdict
import time
import collections
import logging
import pandas as pd


Problem 1: Initialize with uniform probability

In [4]:
#Initialize t(f|e) values with uniform distribution
def init_t(source_corpus, foreign_corpus):
    """Build a uniform translation table t(f|e) from a sentence-aligned corpus.

    For every sentence pair, each distinct source token (plus the artificial
    "NULL" token) is paired with each distinct foreign token of the same pair.
    Every such pair receives the same value: 1 / (V + 1), where V is the number
    of distinct lower-cased whitespace-separated tokens in the foreign file.

    Args:
        source_corpus: Path of the source-language file, one sentence per line.
        foreign_corpus: Path of the foreign-language file, line-aligned with
            source_corpus.

    Returns:
        A collections.defaultdict(dict) indexed as t[e][f].
    """
    # NOTE: files are opened with the platform default encoding, exactly as
    # before; any other code that reads the same corpus must decode it the same
    # way or its tokens will not match the keys stored here.
    with open(foreign_corpus) as foreign:
        unique_f = len(set(w.lower() for w in foreign.read().split()))
    # t(f|e) is given by 1 over unique foreign words + 1 (NULL)
    uniform = 1 / float(unique_f + 1)

    t = collections.defaultdict(dict)
    # Context managers close both files; the original opened five handles
    # (two of them never used) and closed none.
    with open(source_corpus) as source, open(foreign_corpus) as foreign:
        for e_line, f_line in zip(source, foreign):
            e_set = set(e_line.strip().split(" "))
            e_set.add("NULL")
            f_set = set(f_line.strip().split(" "))
            for e in e_set:
                for f in f_set:
                    t[e][f] = uniform
    return t

In [None]:
 # Build the uniform t(f|e) table for the French-English corpus and keep it in
 # the notebook-level name t.
 t = init_t("fr-en.en","fr-en.fr")

In [16]:
# View the nested t[e][f] dictionary as a table: the outer keys of t become the
# columns and the inner keys become the row labels.
# NOTE(review): the saved output shows non-uniform values, so t was presumably
# modified after init_t and before this cell ran - confirm the execution order.
Prob = pd.DataFrame(t)
Prob

Unnamed: 0,of,the,Resumption,NULL,session,happy,year,festive,resumed,European,...,Bruno,anti-competition,Leoni,warned,heed,Eieck,users,hindrances,choice,interference
!,0.000000e+00,0.000000e+00,,0.000000e+00,,,,,,0.000000e+00,...,,,,,,,,,,
"""",0.000000e+00,0.000000e+00,,0.000000e+00,,,,,,0.000000e+00,...,,,,,,,,,,
%,0.000000e+00,0.000000e+00,,0.000000e+00,,,0.000000e+00,,,0.000000e+00,...,,,,,,,,,,
',1.169329e-70,4.271191e-130,,4.657360e-144,,,0.000000e+00,,,0.000000e+00,...,,,,,,,,,,
(,0.000000e+00,0.000000e+00,,0.000000e+00,,,,,,0.000000e+00,...,,,,,,,,,,
),0.000000e+00,0.000000e+00,,0.000000e+00,,,,,,0.000000e+00,...,,,,,,,,,,
",",3.367194e-02,1.057705e-01,,1.122712e-03,,0.0,2.565763e-54,,,6.560602e-85,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000
-,0.000000e+00,0.000000e+00,,0.000000e+00,,,0.000000e+00,,,0.000000e+00,...,,,,,,,,,,
-1999,0.000000e+00,0.000000e+00,,0.000000e+00,,,,,,,...,,,,,,,,,,
-20,0.000000e+00,,,0.000000e+00,,,,,,,...,,,,,,,,,,


Problem 2: Initialize IBM model 1

In [5]:
# IBM model 1
def model1(source_corpus, foreign_corpus, t, iterations=1000):
    """Train IBM Model 1 translation probabilities with EM, updating t in place.

    Each iteration collects expected counts over every sentence pair (E-step)
    and then re-estimates t[e][f] = count(f, e) / count(e) (M-step).

    Args:
        source_corpus: Path of the source-language file, one sentence per line.
        foreign_corpus: Path of the foreign-language file, line-aligned with
            source_corpus.
        t: Nested mapping t[e][f] holding the current probabilities. It must
            contain an entry for every source/foreign token pair that co-occurs
            in a sentence pair, including the "NULL" source token.
        iterations: Number of EM passes. Defaults to 1000, the value that was
            previously hard-coded.

    Returns:
        The same t object, after training.
    """
    # Read and tokenise the corpus once instead of reopening both files on
    # every iteration; the handles are closed as soon as the pairs are built.
    # Files use the platform default encoding, as before.
    with open(source_corpus) as source, open(foreign_corpus) as foreign:
        sentence_pairs = [
            # include NULL for possible source alignments
            (e_line.strip().split(" ") + ['NULL'], f_line.strip().split(" "))
            for e_line, f_line in zip(source, foreign)
        ]

    for _ in range(iterations):
        count = defaultdict(int)
        count_e = defaultdict(int)
        # E-step: distribute each foreign token over the source tokens
        for e_list, f_list in sentence_pairs:
            for f in f_list:
                # Calculate the normaliser first
                sum_e = 0
                for e in e_list:
                    sum_e += t[e][f]
                if sum_e == 0:
                    # Every candidate has zero probability: nothing to share
                    # out, and dividing would raise ZeroDivisionError.
                    continue
                for e in e_list:
                    # Update rule
                    delta = t[e][f] / float(sum_e)
                    count[(f, e)] += delta
                    count_e[e] += delta
        # M-step: update t values
        for (f, e) in count.keys():
            t[e][f] = float(count[(f, e)]) / float(count_e[e])
    return t

In [6]:
#Find alignments for the first 20 sentence pairs in the training data, save to output file
def model_alignments_fr_en(t, source_path="fr-en.en", foreign_path="fr-en.fr",
                           output_path="Align", num_sentences=20):
    """Write the most probable word alignments for the first sentence pairs.

    For each foreign token the 1-based position of the source token with the
    highest t[e][f] is chosen (the first one wins on ties). Position 0 means
    the token is aligned to "NULL", which happens when t["NULL"][f] is strictly
    larger than every source token's probability, or when no probability is
    greater than zero.

    For every sentence pair four lines are written to the output file: the
    source sentence, the foreign sentence, the list of alignments, and a blank
    line.

    Args:
        t: Nested mapping t[e][f] of translation probabilities; it must have
            an entry for each source/foreign token pair looked up, including
            the "NULL" source token.
        source_path: Source-language corpus file. Defaults to "fr-en.en".
        foreign_path: Foreign-language corpus file. Defaults to "fr-en.fr".
        output_path: File to write. Defaults to "Align".
        num_sentences: Maximum number of sentence pairs to align. Defaults
            to 20.
    """
    # Files use the platform default encoding, matching how the probability
    # table's keys were read. The context managers close all three handles
    # even if a lookup fails part-way.
    with open(source_path, "r") as source, \
            open(foreign_path, "r") as foreign, \
            open(output_path, "w") as Align:
        # range() comes first so zip stops after num_sentences pairs; zip also
        # stops at end of file instead of aligning empty "sentences".
        for _, raw_e, raw_f in zip(range(num_sentences), source, foreign):
            line_e = raw_e.strip()
            line_f = raw_f.strip()
            # Tokenise once per sentence rather than once per lookup
            e_words = line_e.split(" ")
            f_words = line_f.split(" ")
            alignments = []
            for f_word in f_words:
                alignment = 0
                probability = 0
                for position, e_word in enumerate(e_words, start=1):
                    p = t[e_word][f_word]
                    if probability < p:
                        probability = p
                        alignment = position
                # Check NULL alignment
                if probability < t["NULL"][f_word]:
                    alignment = 0
                alignments.append(alignment)
            print(line_e, file=Align)
            print(line_f, file=Align)
            print(alignments, file=Align)
            print("", file=Align)

In [45]:
def main():
    """Run the French-English pipeline: initialise, train, align, log the time.

    The alignments are written by model_alignments_fr_en; the elapsed wall-clock
    time is reported through logging.warning.
    """
    english_path, french_path = "fr-en.en", "fr-en.fr"
    started = time.time()
    # Uniform t(f|e) for every co-occurring English/foreign word pair
    t = init_t(english_path, french_path)
    # EM training for IBM Model 1; model1 updates t in place and decides the
    # number of passes itself
    model1(english_path, french_path, t)
    # Align the first sentence pairs of the training data and save them
    model_alignments_fr_en(t)
    elapsed = time.time() - started
    logging.warning("--- %s seconds ---" % elapsed)

if __name__ == '__main__':  # only run the pipeline when executed as the main module
    main()



In [21]:
# Print the saved French-English alignments; the context manager closes the
# file afterwards (the original handle was never closed).
with open('Align', 'r') as file:
    print(file.read())

Resumption of the session
Reprise de la session
[1, 2, 3, 4]

I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .
Je dÃ©clare reprise la session du Parlement europÃ©en qui avait Ã©tÃ© interrompue le vendredi 17 dÃ©cembre dernier et je vous renouvelle tous mes vux en espÃ©rant que vous avez passÃ© de bonnes vacances .
[1, 2, 2, 4, 5, 11, 9, 8, 22, 2, 13, 2, 4, 12, 13, 14, 12, 17, 1, 25, 2, 2, 2, 27, 24, 2, 33, 25, 2, 2, 6, 2, 2, 40]

Although , as you will have seen , the dreaded ' millennium bug ' failed to materialise , still the people in a number of countries suffered a series of natural disasters that truly were dreadful .
Comme vous avez pu le constater , le grand " bogue de l' an 2000 " ne s' est pas produit . En revanche , les citoyens d' un certain nombre de nos pays ont Ã©tÃ© victimes de catastrophes naturelles qui ont vr

Problem 3: German to English model

In [22]:
# Build the uniform t(f|e) table for the German-English corpus (this rebinds the
# notebook-level name t) and view it as a table: the outer keys of t become the
# columns and the inner keys become the row labels.
t = init_t("de-en.en","de-en.de")
Prob1 = pd.DataFrame(t)
Prob1

Unnamed: 0,of,the,Resumption,NULL,session,happy,year,festive,resumed,European,...,judiciary,Directorate-General,load,plea,decrease,judges,Does,pressure,reactions,authors
!,0.000218,0.000218,,0.000218,,0.000218,0.000218,,,0.000218,...,,,,,,,,,,
"""",0.000218,0.000218,,0.000218,,,,,,0.000218,...,,,,,,,,,,
%,0.000218,0.000218,,0.000218,,,0.000218,,,0.000218,...,,,,,,,,,,
(,0.000218,0.000218,,0.000218,,,,,,0.000218,...,,,,,,,,,,
),0.000218,0.000218,,0.000218,,,,,,0.000218,...,,,,,,,,,,
",",0.000218,0.000218,,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,...,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,,
-,0.000218,0.000218,,0.000218,,,0.000218,,,0.000218,...,,,,,,,,,,
-20,,0.000218,,0.000218,,,,,,,...,,,,,,,,,,
-40,,0.000218,,0.000218,,,,,,,...,,,,,,,,,,
-garantien,0.000218,,,0.000218,,,,,,,...,,,,,,,,,,


In [23]:
# Define the alignment step for the German-English corpus and save the result to a file
def model_alignments_Ge_en(t, source_path="de-en.en", foreign_path="de-en.de",
                           output_path="Align-German", num_sentences=20):
    """Write the most probable word alignments for the first sentence pairs.

    For each foreign token the 1-based position of the source token with the
    highest t[e][f] is chosen (the first one wins on ties). Position 0 means
    the token is aligned to "NULL", which happens when t["NULL"][f] is strictly
    larger than every source token's probability, or when no probability is
    greater than zero.

    For every sentence pair four lines are written to the output file: the
    source sentence, the foreign sentence, the list of alignments, and a blank
    line.

    Args:
        t: Nested mapping t[e][f] of translation probabilities; it must have
            an entry for each source/foreign token pair looked up, including
            the "NULL" source token.
        source_path: Source-language corpus file. Defaults to "de-en.en".
        foreign_path: Foreign-language corpus file. Defaults to "de-en.de".
        output_path: File to write. Defaults to "Align-German".
        num_sentences: Maximum number of sentence pairs to align. Defaults
            to 20.
    """
    # Files use the platform default encoding, matching how the probability
    # table's keys were read. The context managers close all three handles
    # even if a lookup fails part-way.
    with open(source_path, "r") as source, \
            open(foreign_path, "r") as foreign, \
            open(output_path, "w") as Align_G:
        # range() comes first so zip stops after num_sentences pairs; zip also
        # stops at end of file instead of aligning empty "sentences".
        for _, raw_e, raw_f in zip(range(num_sentences), source, foreign):
            line_e = raw_e.strip()
            line_f = raw_f.strip()
            # Tokenise once per sentence rather than once per lookup
            e_words = line_e.split(" ")
            f_words = line_f.split(" ")
            alignments = []
            for f_word in f_words:
                alignment = 0
                probability = 0
                for position, e_word in enumerate(e_words, start=1):
                    p = t[e_word][f_word]
                    if probability < p:
                        probability = p
                        alignment = position
                # Check NULL alignment
                if probability < t["NULL"][f_word]:
                    alignment = 0
                alignments.append(alignment)
            print(line_e, file=Align_G)
            print(line_f, file=Align_G)
            print(alignments, file=Align_G)
            print("", file=Align_G)

In [24]:
def main():
    """Run the German-English pipeline: initialise, train, align, log the time.

    The alignments are written by model_alignments_Ge_en; the elapsed wall-clock
    time is reported through logging.warning.
    """
    english_path, german_path = "de-en.en", "de-en.de"
    started = time.time()
    # Uniform t(f|e) for every co-occurring English/foreign word pair
    t = init_t(english_path, german_path)
    # EM training for IBM Model 1; model1 updates t in place and decides the
    # number of passes itself
    model1(english_path, german_path, t)
    # Align the first sentence pairs of the training data and save them
    model_alignments_Ge_en(t)
    elapsed = time.time() - started
    logging.warning("--- %s seconds ---" % elapsed)

if __name__ == '__main__':  # only run the pipeline when executed as the main module
    main()




In [25]:
# Print the saved German-English alignments; the context manager closes the
# file afterwards (the original handle was never closed).
with open('Align-German', 'r') as file_G:
    print(file_G.read())

Resumption of the session
Wiederaufnahme der Sitzungsperiode
[1, 2, 4]

I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .
Ich erklÃ¤re die am Freitag , dem 17. Dezember unterbrochene Sitzungsperiode des EuropÃ¤ischen Parlaments fÃ¼r wiederaufgenommen , wÃ¼nsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe , daÃŸ Sie schÃ¶ne Ferien hatten .
[1, 2, 4, 2, 12, 16, 24, 2, 14, 2, 5, 39, 8, 2, 14, 2, 16, 2, 25, 2, 2, 2, 2, 2, 17, 32, 16, 33, 25, 2, 2, 2, 0]

Although , as you will have seen , the dreaded ' millennium bug ' failed to materialise , still the people in a number of countries suffered a series of natural disasters that truly were dreadful .
Wie Sie feststellen konnten , ist der gefÃ¼rchtete " Millenium-Bug " nicht eingetreten . Doch sind BÃ¼rger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrop