# IBM Model 1 and 2 Implementations

In [56]:
# import libraries
import numpy as np
import re
import pickle
import os
import pandas as pd
import math
import time

#### Define paths

In [57]:
dutch_path = "./training_data_dutch.txt"
english_path = "./training_data_english.txt"
result_path = "./results"

#### Helper functions for storing and loading 

In [58]:
def save_dict(path , word , dic):
    pth = os.path.join(path , word + ".pickle")
    with open(pth , 'wb') as handle:
        pickle.dump(dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_dict(path , word):
    pth = os.path.join(path , word + ".pickle")
    with open(pth, 'rb') as handle:
        b = pickle.load(handle)
    return b

#### Reading Data from files

In [59]:
df_english = pd.read_fwf(english_path , header = None)
df_dutch = pd.read_fwf(dutch_path , header = None)
english_sent = df_english[0].values.tolist() #reading and storing the 1st 50000 lines of the dataset
dutch_sent = df_dutch[0].values.tolist()
del df_english
del df_dutch

#### Code to clean one sentence, punctutations are removed here

In [60]:
def clean(s):
    regex = re.compile('[^a-zA-Z ]')
    s =  regex.sub("" , s).strip().lower()
    arr = s.split(" ")
    new_arr = []
    for w in arr:
        if(w == "" or w == " "):
            continue
        else:
            new_arr.append(w)
    s = " ".join(new_arr)
    return s

#### Cleaning all the lines in the given file

In [61]:
def clean_corpus(sent_list1 , sent_list2):
    new_sent1 = []
    new_sent2 = []
    for sent1 , sent2 in zip(sent_list1 , sent_list2):
        s1 = clean(sent1)
        s2 = clean(sent2)
        if(len(s1) > 0 and len(s2) > 0):
            new_sent1.append(s1)
            new_sent2.append(s2)
    del sent_list1
    del sent_list2
    return new_sent1 , new_sent2

In [62]:
english_sent, dutch_sent = clean_corpus(english_sent, dutch_sent)

#### Class defining a language

In [63]:
class Lang:
    def __init__(self , name , word2index , index2word):
        self.name = name
        self.word2index = word2index
        self.index2word = index2word

#### Extracting the words from each line and mapping word to index as well as index to word 

In [64]:
def word_extractor(sent_list): 
    word2index = {}
    index2word = {}
    index = 0
    for sent in  sent_list:
        for word in sent.split(" "):
            if(word in word2index or word == " " or word == ""):
                pass
            else:
                index2word[index] = word
                word2index[word] = index
                index = index + 1
                
    return word2index , index2word

In [65]:
# e_word2index is the mapping from word to index for english language
e_word2index , e_index2word = word_extractor(english_sent)
d_word2index , d_index2word = word_extractor(dutch_sent)

# Here we instatiate the languages which are dutch and english 
english_lang = Lang("English" , e_word2index , e_index2word)
dutch_lang = Lang("Dutch" , d_word2index , d_index2word)


#### Initialize the probability matrix which stores the probabilities that words at rows and columns are matched to each other

In [66]:
def probablity_init(lang1 , lang2):  # lang1 to lang2
    total_words2 = len(lang2.word2index)
    prob = 1.0/total_words2
    prob_arr = np.full((len(lang1.word2index), len(lang2.word2index)), prob, dtype=float)
    return prob_arr
    

In [67]:
def get_word_arr(S):
    arr = S.split(" ")
    return arr



# Use to set which way to train and translate

In [68]:
#lang_1, lang_2 = dutch_lang, english_lang   #use this to train dutch to english
#sents_1, sents_2 = dutch_sent, english_sent

lang_1, lang_2 = english_lang, dutch_lang   #use this to train english to dutch
sents_1, sents_2 = english_sent, dutch_sent


### IBM Model 1 training - 1 iteration
#### It takes as input the language lang1 which is to be translated into language lang2, which is the language to be translated in.

In [69]:
def train_IBM1(lang1, lang2 , sents1 , sents2, prob_arr):   #lang1 to lang2

    count_arr = np.zeros((len(lang1.word2index), len(lang2.word2index)), dtype=float)
    total = np.zeros(len(lang1.word2index) , dtype = np.float32)
    
    for sent_index in range(len(sents1)):
        s_total = np.zeros(len(lang2.word2index) , dtype = np.float32)
        sents1_words = get_word_arr(sents1[sent_index])
        sents2_words = get_word_arr(sents2[sent_index])
        
        for w2 in sents2_words:
            w2_index = lang2.word2index[w2]
            s_total[w2_index] = 0
            for w1 in sents1_words:
                w1_index = lang1.word2index[w1]
                s_total[w2_index] += prob_arr[w1_index][w2_index]
                
        for w2 in sents2_words:
            w2_index = lang2.word2index[w2]
            for w1 in sents1_words:
                w1_index = lang1.word2index[w1]
                temp = prob_arr[w1_index][w2_index]/s_total[w2_index]
                count_arr[w1_index][w2_index] += temp
                total[w1_index] += temp
                
    break_loop = 1
    result = 0
    for w1 in lang1.word2index:
        w1_index = lang1.word2index[w1]
        for w2 in lang2.word2index:
            w2_index = lang2.word2index[w2]
            prev = prob_arr[w1_index][w2_index]
            prob_arr[w1_index][w2_index] = count_arr[w1_index][w2_index]/total[w1_index]
            new = prob_arr[w1_index][w2_index]
            delta = (new - prev)**2
            result  = result + delta
            
    if(result**0.5 < 0.5):
        break_loop = 0 
    print(result**0.5)
    if(break_loop == 1):
        redo = True
    else:
        redo = False
        
    return redo    

#### Initialize the probability array and call the train IBM1 function

In [72]:
redo = True
cnt = 0

prob_arr = probablity_init(lang_1 , lang_2)    #use this to train english to dutch

for i in range(10):     # while redo
    start_time = time.time()
    redo = train_IBM1(lang_1,lang_2, sents_1,sents_2, prob_arr)
    cnt = cnt + 1
    print("Iteration "+str(cnt))
    print("--- %s seconds ---" % (time.time() - start_time))

KeyboardInterrupt: 

In [18]:
def make_result_map(lang1, lang2, prob_arr):   #makes a map between the lang1 word and the correspondingly predicted lang2 word
    result ={}
    for i in range(prob_arr.shape[0]):
        maxi=-1
        word1=lang1.index2word[i]
        for j in range(prob_arr.shape[1]):
            if(prob_arr[i][j]>maxi):
                maxi = prob_arr[i][j]
                word2 = lang2.index2word[j]
        result[word1] = [word2, maxi]
    return result

####  Enter name of the model you want to save as IBM Model 1

In [73]:
model_name_IBM1 = input()  #Enter name of the IBM1 model mapping to save
result = make_result_map(lang_1, lang_2, prob_arr)
save_dict(result_path, model_name_IBM1, result)

tt1



### IBM Model 2 training -1 iteration
 It takes as input the language lang1 which is to be translated into language lang2, which is the language to be translated in.
 Also, the maximum length of a sentence in lang1 and lang2 is given as an input

In [21]:
def train_IBM2(lang1, lang2 , sents1 , sents2, prob_arr, max_len1, max_len2):   #lang1 to lang2

    count_arr = np.zeros((len(lang1.word2index), len(lang2.word2index)), dtype=float)
    total = np.zeros(len(lang1.word2index) , dtype = np.float32)
   
    count_arr_a = np.zeros((max_len1, max_len2, max_len1, max_len2), dtype=float)
    total_a = np.zeros((max_len2, max_len2, max_len1), dtype=float)
   
   
    for sent_index in range(len(sents1)):
        s_total = np.zeros(len(lang2.word2index) , dtype = np.float32)
        sents1_words = get_word_arr(sents1[sent_index])
        sents2_words = get_word_arr(sents2[sent_index])
        l1 = len(sents1_words)
        l2 = len(sents2_words)
        j = 0
        for w2 in sents2_words:
            w2_index = lang2.word2index[w2]
            s_total[w2_index] = 0
            i = 0
            for w1 in sents1_words:
                w1_index = lang1.word2index[w1]
                s_total[w2_index] += prob_arr[w1_index][w2_index] * a_mat[i][j][l1-1][l2-1]
                i = i + 1
            j = j + 1
           
        j = 0    
        for w2 in sents2_words:
            w2_index = lang2.word2index[w2]
            i = 0
            for w1 in sents1_words:
                w1_index = lang1.word2index[w1]
                temp = (prob_arr[w1_index][w2_index] * a_mat[i][j][l1-1][l2-1])/ s_total[w2_index]
                count_arr[w1_index][w2_index] += temp
                total[w1_index] += temp
                count_arr_a[i][j][l1-1][l2-1] += temp
                total_a[j][l2-1][l1-1] += temp
                i = i + 1
            j = j + 1
               
    break_loop = 1
    result = 0
    for w1 in lang1.word2index:
        w1_index = lang1.word2index[w1]
        for w2 in lang2.word2index:
            w2_index = lang2.word2index[w2]
            prev = prob_arr[w1_index][w2_index]
            prob_arr[w1_index][w2_index] = count_arr[w1_index][w2_index]/total[w1_index]
            new = prob_arr[w1_index][w2_index]
            delta = (new - prev)**2
            result  = result + delta
   
    for i in range(max_len1):
        for  j in range(max_len2):
            for l2 in range(max_len2):
                for l1 in range(max_len1):
                    if count_arr_a[i][j][l1][l2] == 0 :
                        continue
                    a_mat[i][j][l1][l2] = count_arr_a[i][j][l1][l2] / total_a[j][l2][l1]    
   
    if(result**0.5 < 0.5):
        break_loop = 0
        
    print(result**0.5)

    if(break_loop == 1):
        redo = True
    else:
        redo = False
       
    return redo    


#### Initialize matrix a_mat

In [22]:
def initialize_mat(lang1, lang2, max_len1, max_len2):
    a_mat = np.zeros((max_len1, max_len2, max_len1, max_len2), dtype=float)
    for l in range(max_len1):
        prob = 1/(l+1)
        a_mat[:,:,l,:] = prob
    return a_mat

#### Initialize the probability array and call the train IBM2 function

In [74]:
start_time = time.time()

max_l_1 = 0    #calculate maximum number of tokens in a sentence of foreign lang
for sent in sents_1:
    max_l_1=max(max_l_1, len(sent.split()))

max_l_2 = 0   #calculate maximum number of tokens in a sentence of known lang
for sent in sents_2:
    max_l_2=max(max_l_2, len(sent.split()))

redo = True
cnt = 0

a_mat = initialize_mat(lang_1, lang_2, max_l_1, max_l_2)
prob_arr = probablity_init(lang_1, lang_2)

for i in range(10):   # while redo
    start_time = time.time()
    redo = train_IBM2(lang_1, lang_2 , sents_1 , sents_2, prob_arr, max_l_1, max_l_2)
    cnt = cnt + 1
    print("Iteration "+str(cnt))
    print("--- %s seconds ---" % (time.time() - start_time))

KeyboardInterrupt: 

#### Enter name of this model that you want to save as IBM Model 2

In [55]:
model_name_IBM2 = input()   # Enter name of the IBM2 model mapping to save
result = make_result_map(lang_1, lang_2, prob_arr)
save_dict(result_path, model_name_IBM2, result)

e2d_2


# Testing Starts

In [45]:
result_name = input()  #Enter the mapping name you want to load

e2d_2


In [46]:
translations = load_dict(result_path, result_name)

In [47]:
def translate(sentence):  # lang1 to lang2
    temp = sentence.lower().split(" ")
    out = []
    for w in temp:
        w = str(w)
        regex = re.compile('[^1-9a-zA-Z ]')
        w =  regex.sub("" , w).strip().lower()
        if(w == " " or w == ""):
            continue
        if(w in translations):
            out.append(translations[w][0])
        else:
            out.append(w)
    out = " ".join(out)
    return out

# Cosine Similarity

In [48]:
def Intersection(lst1, lst2):  #intersection of 2 lists
    return len(list(set(lst1) & set(lst2))) 

def Union(lst1, lst2):   #union of 2 lists
    final_list = list(set().union(lst1, lst2)) 
    return final_list

def clean_score(S):   #clean the sentence
    S = S.split(" ")
    new_S = []
    for w in S:
        w = str(w)
        regex = re.compile('[^1-9a-zA-Z ]')
        w =  regex.sub("" , w).strip().lower()
        new_S.append(w)
    S = " ".join(new_S)
    return S

def cosine(doc1, doc2):
    
    words1 = clean_score(doc1.lower()).split()
    words2 = clean_score(doc2.lower()).split()
    allwords = Union(words1, words2)
    freq1={}
    for word in words1:
        freq1[word] = 0

    for word in words1:
        freq1[word] += 1

    freq2={}
    for word in words2:
        freq2[word] = 0

    for word in words2:
        freq2[word] += 1

    wt1={}
    for f in freq1:
        wt1[f] = 1 + math.log10(freq1[f])

    wt2={}
    for f in freq2:
        wt2[f] = 1 + math.log10(freq2[f])

    vec1=[]
    vec2=[]

    vec1norm = 0
    for word in allwords:
        if word in wt1:
            vec1.append(wt1[word])
            vec1norm += wt1[word]*wt1[word]
        else:
            vec1.append(0)
    vec1norm = math.sqrt(vec1norm)  
    vec2norm = 0
    for word in allwords:
        
        if word in wt2:
            vec2.append(wt2[word])
            vec2norm += wt2[word]*wt2[word]
        else:
            vec2.append(0)

    vec2norm = math.sqrt(vec2norm)
    cos = 0
    for i in range(0,len(vec1)):
        cos += vec1[i]*vec2[i]
        
    cos = cos / (vec1norm * vec2norm)
    return cos

# Jaccard Coefficent

In [49]:
def jaccard(doc1, doc2):
    words1 = clean_score(doc1.lower()).split()
    words2 = clean_score(doc2.lower()).split()
    
    allwords = Union(words1, words2)
    jacardCoeff = Intersection(words1,words2) / len(allwords)
    return jacardCoeff

### Give input the path of the document to be tested

In [75]:
test_data_path = input()
correct_trans_path = input()

./training_data_english.txt
./training_data_dutch.txt


### Translate the whole doc

In [51]:
def translate_training(lang1_path , lang2_path):   # lang1 to lang2
    df_lang1 = pd.read_fwf(lang1_path , header = None)
    df_lang2 = pd.read_fwf(lang2_path , header = None)
    lang1_sents = df_lang1[0].values.tolist()
    lang2_sents = df_lang2[0].values.tolist()
    lang1_out = []
    for lang1_sent in lang1_sents  :
        lang1_out.append(translate(lang1_sent))
    lang2_doc = " ".join(lang2_sents)
    lang1_doc = " ".join(lang1_out)
    
    cos = cosine(lang1_doc,lang2_doc)
    print("Cosine similarity of docs is " + str(cos))
    
    jac = jaccard(lang1_doc, lang2_doc)
    print("Jaccard coefficient of docs is " + str(jac))
    
    arr = [cos, jac]
    with open('resultdoc.txt', 'w', encoding="utf-8") as fileh:
        fileh.writelines("%s\n" % place for place in arr)
    
    return lang1_out

In [52]:
def translate_test(lang1_path):   # lang1 to lang2
    df_lang1 = pd.read_fwf(lang1_path , header = None)
    lang1_sents = df_lang1[0].values.tolist()
    lang1_out = []
    for lang1_sent in lang1_sents  :
        lang1_out.append(translate(lang1_sent))
    return lang1_out

In [53]:
output = translate_training(test_data_path, correct_trans_path)

Cosine similarity of docs is 0.7139667190750636
Jaccard coefficient of docs is 0.31899531668153436


### Save translated lines to 'output.txt'

In [15]:
with open('output.txt', 'w', encoding="utf-8") as filehandle:
    filehandle.writelines("%s\n" % place for place in output)