# Implementation of Cross Language Information Retrieval

Importing the necessary libraries for preprocessing the data

In [1]:
from nltk.tokenize import word_tokenize
import nltk
import string
import numpy as np
import math
from datetime import datetime
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Running the whole EM-algorithm for maximun of 10 iterations

In [0]:
max_num_of_iterations = 20 

def converge_limit(mat,mat_old,num_of_iterations):
    epsilon = 0.0000001
    if num_of_iterations > max_num_of_iterations :
        return True

    for i in range(len(mat)) :
        for j in range(len(mat[0])) :
            if math.fabs(mat[i][j] - mat_old[i][j]) > epsilon:
                return False
    return True

This is the EM step required in IBM Model 1.
EM Algorithm consists of two steps:
1. Expectation-Step: Apply model to the data
      using the model, assign probabilities to possible values
2. Maximization-Step: Estimate model from data
      (a) take assign values as fact
      (b) collect counts (weighted by probabilities)
      (c) estimate model from counts

We have also incorporated laplace smoothing as a way to improve accuracy of IBM model 1.

In [0]:
def ibm_model_1(dutch_sentences,english_sentences,dutch_word_dict,english_word_dict):
    print("\nIBMModel 1 Training")
    num_of_dut_word = len(dutch_word_dict)
    num_of_eng_word = len(english_word_dict)
    # EM algorithm
    t_e_f_mat = np.full((len(dutch_word_dict), len(english_word_dict)), 1 / len(english_word_dict), dtype=float)
    t_e_f_mat_temp = np.full((len(dutch_word_dict), len(english_word_dict)), 1, dtype=float)

    cnt_iter = 0
    while not converge_limit(t_e_f_mat,t_e_f_mat_temp,cnt_iter) :
        cnt_iter += 1
        print("Iteration : ", cnt_iter)
        t_e_f_mat_temp = t_e_f_mat.copy()
        count_e_f = np.full((len(dutch_word_dict), len(english_word_dict)), 0, dtype=float)
        total_f = np.full((len(english_word_dict)),0, dtype=float)

        print(str(datetime.now()))
        for idx_dut, dut_sen in enumerate(dutch_sentences): 
            # Compute Normalization
            dut_sen_words = dut_sen.split(" ")
            s_total = np.full((len(dut_sen_words)),0,dtype=float)
            for idx_word in range(len(dut_sen_words)): 
                dut_word = dut_sen_words[idx_word]
                if dut_word == '':
                    continue 
                s_total[idx_word] = 0
                eng_sen_words = english_sentences[idx_dut].split(" ")
                for eng_word in eng_sen_words: 
                    if eng_word == '' :
                        continue 
                    idx_dut_in_dict =dutch_word_dict[dut_word]
                    idx_eng_in_dict = english_word_dict[eng_word]
                    s_total[idx_word] += t_e_f_mat[idx_dut_in_dict][idx_eng_in_dict]
                

            # Collect Counts
            dut_sen_words = dut_sen.split(" ")
            for idx_word in range(len(dut_sen_words)):
                if dut_word == '':
                    continue 
                dut_word = dut_sen_words[idx_word]
                eng_sen_words = english_sentences[idx_dut].split(" ")
                for eng_word in eng_sen_words:
                    if eng_word == '' :
                        continue
                    idx_dut_in_dict =dutch_word_dict[dut_word]
                    idx_eng_in_dict = english_word_dict[eng_word]
                    count_e_f[idx_dut_in_dict][idx_eng_in_dict] += t_e_f_mat[idx_dut_in_dict][idx_eng_in_dict] / s_total[idx_word]
                    total_f[idx_eng_in_dict] += t_e_f_mat[idx_dut_in_dict][idx_eng_in_dict] / s_total[idx_word]
                
        # Estimate Probabilities without Laplace Smoothing
#         for eng_idx in  range(num_of_eng_word): 
#             for dut_idx in range(num_of_dut_word): 
#                 if count_e_f[dut_idx][eng_idx] != 0 :
#                     t_e_f_mat[dut_idx][eng_idx] = (count_e_f[dut_idx][eng_idx]) / (total_f[eng_idx]) 
        
        # Estimate Probabilities with Laplace Smoothing
        for eng_idx in  range(num_of_eng_word): #for all foreign words f do
            for dut_idx in range(num_of_dut_word): #for all English words e do
                if count_e_f[dut_idx][eng_idx] != 0 :
                    t_e_f_mat[dut_idx][eng_idx] = (count_e_f[dut_idx][eng_idx] + 1) / (total_f[eng_idx] + num_of_eng_word) 
            
        print(str(datetime.now()))
    
    print("IBMModel1 Training Complete !")
    return t_e_f_mat


The IBM Model 2 has an additional model for alignment that is not present in Model 1.The IBM Model 2 addressed this issue by modeling the translation of a foreign input word in position i to a native language word in position j using an alignment probability distribution defined as:

a(i or j, l_e, l_f)

We have also incorporated laplace smoothing as a way to improve accuracy of IBM model 2.

In [0]:
def ibm_model_2(t_e_f_mat,dutch_sentences,english_sentences,dutch_word_dict,english_word_dict,max_le,max_lf):
    print("\nIBMModel 2 Training ")
    a_i_le_lf_mat = np.zeros((max_lf, max_le, max_lf,max_le), dtype=float)

    for lf in range(max_lf):
        a_i_le_lf_mat[:,:,lf,:] = 1/(lf+1)

    num_of_e_word = len(dutch_word_dict)
    num_of_f_word = len(english_word_dict)

    t_e_f_mat_prev = np.full((num_of_e_word, num_of_f_word), 1,dtype=float)
    cnt_iter = 0

    while not converge_limit(t_e_f_mat,t_e_f_mat_prev,cnt_iter) :
        cnt_iter += 1
        print("Iteration : ", cnt_iter)
        t_e_f_mat_prev = t_e_f_mat.copy()
        count_e_f = np.full((num_of_e_word, num_of_f_word), 0, dtype=float)
        total_f = np.full((num_of_f_word),0, dtype=float)
        count_a_i_le_lf = np.zeros((max_lf, max_le, max_lf,max_le), dtype=float)
        total_a_j_le_lf = np.zeros((max_le,max_le,max_lf),dtype=float)

        print(str(datetime.now()))
        for idx_e, e_sen in enumerate(dutch_sentences):
            e_sen_words = e_sen.split(" ")
            f_sen_words = english_sentences[idx_e].split(" ")
            l_e = len(e_sen_words)
            l_f = len(f_sen_words)

            # Compute Normalization
            s_total = np.full((l_e),0,dtype=float)
            for j in range(l_e): 
                s_total[j] = 0 
                e_word = e_sen_words[j]
                for i in range(l_f): 
                    f_word = f_sen_words[i]
                    if e_word == '' :
                        continue
                    if f_word == '' :
                        continue
                    e_j = dutch_word_dict[e_word]
                    f_i = english_word_dict[f_word]
                    s_total[j] += t_e_f_mat[e_j][f_i] * a_i_le_lf_mat[i][j][l_f-1][l_e-1] 
                
            # Collect Counts
            for j in range(l_e): 
                e_word = e_sen_words[j]
                for i in range(l_f):
                    f_word = f_sen_words[i]
                    if e_word == '' :
                        continue
                    if f_word == '' :
                        continue
                    e_j = dutch_word_dict[e_word]
                    f_i = english_word_dict[f_word]

                    c = t_e_f_mat[e_j][f_i] * a_i_le_lf_mat[i][j][l_f-1][l_e-1] / s_total[j] 
                    count_e_f[e_j][f_i] += c
                    total_f[f_i] += c 
                    count_a_i_le_lf[i][j][l_f-1][l_e-1] += c 
                    total_a_j_le_lf[j][l_e-1][l_f-1] += c 

        # Estimate Probabilities without Smoothing
#         t_e_f_mat = np.full((num_of_e_word, num_of_f_word), 0,dtype=float) 
#         a_i_le_lf_mat = np.zeros((max_lf, max_le, max_lf,max_le), dtype=float) 
#         for f_idx in  range(num_of_f_word): 
#             for e_idx in range(num_of_e_word): 
#                 if count_e_f[e_idx][f_idx] != 0 :
#                     t_e_f_mat[e_idx][f_idx] = (count_e_f[e_idx][f_idx]) / (total_f[f_idx])
    
#         print(str(datetime.now()))
#         for i in range(max_lf):
#             for  j in range(max_le):
#                 for le in range(max_le):
#                     for lf in range(max_lf):
#                         if count_a_i_le_lf[i][j][lf][le] != 0 :
#                             a_i_le_lf_mat[i][j][lf][le] = (count_a_i_le_lf[i][j][lf][le]) / (total_a_j_le_lf[j][le][lf])
            
        # Estimate Probabilities with Smoothing
        t_e_f_mat = np.full((num_of_e_word, num_of_f_word), 0,dtype=float) 
        a_i_le_lf_mat = np.zeros((max_lf, max_le, max_lf,max_le), dtype=float) 
        for f_idx in  range(num_of_f_word): 
            for e_idx in range(num_of_e_word): 
                if count_e_f[e_idx][f_idx] != 0 :
                    t_e_f_mat[e_idx][f_idx] = (count_e_f[e_idx][f_idx]+1) / (total_f[f_idx]+num_of_e_word)
    
        print(str(datetime.now()))
        for i in range(max_lf):
            for  j in range(max_le):
                for le in range(max_le):
                    for lf in range(max_lf):
                        if count_a_i_le_lf[i][j][lf][le] != 0 :
                            a_i_le_lf_mat[i][j][lf][le] = (count_a_i_le_lf[i][j][lf][le]+1) / (total_a_j_le_lf[j][le][lf]+num_of_e_word)

    print(t_e_f_mat)
    print("IBMModel2 Training Complete !")
    return t_e_f_mat, a_i_le_lf_mat


This method tokenizes the sentences into lists after converting the sentence into lowercase and removing the punctuation marks.

In [0]:
def sentences_tokenized(max_index, sentences):
    token_sentences = list()
    word_dictionary = {} 
    rev_dictionary = {}
    lang_order = 0
    cnt = 0
    max_len_sentence = 0
    translate_table = dict((ord(char), None) for char in string.punctuation)
    for row in sentences[:max_index]:
        row = row.translate(translate_table)
        tokens = word_tokenize(row.lower())

        if len(tokens) > max_len_sentence :
            max_len_sentence = len(tokens)

        produced_sentence = ""
        for token in tokens:
            if token not in word_dictionary:
                word_dictionary[token] = lang_order
                rev_dictionary[lang_order] = token
                lang_order += 1
            produced_sentence = produced_sentence + token + " "
        produced_sentence = produced_sentence[:(len(produced_sentence) - 1)] 

        token_sentences.append(produced_sentence)


    return token_sentences, word_dictionary, rev_dictionary, max_len_sentence


This method trains our model by taking a portion of the given dataset( say 10000 sentences ).  

In [0]:
def train_models(train_model2):
    with open("Dutch_Updated.txt", encoding="utf8") as f:
            sentences_dut = f.readlines()
    with open("English_Updated.txt", encoding="utf8") as f:
            sentences_eng = f.readlines()

    sentences_dut_red = list()
    sentences_eng_red = list()
    for sen_idx in range(len(sentences_eng)):
        if sen_idx > 500000 :
            break
        cur_eng_sen = sentences_eng[sen_idx].split()
        if len(cur_eng_sen) < 11:
            sentences_dut_red.append(sentences_dut[sen_idx])
            sentences_eng_red.append(sentences_eng[sen_idx])
    
    sentences_eng = sentences_eng_red.copy()
    sentences_dut = sentences_dut_red.copy()

    num_of_train_samples = 10000

    # Tokenizing the Dutch Training Samples
    dutch_sentences, dutch_word_dict, opp_dutch_word_dict, max_le = sentences_tokenized(num_of_train_samples, sentences_dut)

    # Tokenizing the English Training Samples
    english_sentences, english_word_dict, opp_english_word_dict, max_lf = sentences_tokenized(num_of_train_samples, sentences_eng)

    np.save("models/dut_word_dict",dutch_word_dict)
    np.save("models/eng_word_dict",english_word_dict)
    
    t_e_f  = ibm_model_1(dutch_sentences,english_sentences,dutch_word_dict,english_word_dict)

    print(t_e_f.shape)
    
    num_of_dut_word = t_e_f.shape[0]
    num_of_eng_word = t_e_f.shape[1]
    Dict_eng = {}
    Dict_dut = {}
    
    for eng_idx in range(num_of_eng_word): 
        maximum = -1
        i = 0
        for dut_idx in range(num_of_dut_word): 
            if t_e_f[dut_idx][eng_idx] > maximum : 
                maximum = t_e_f[dut_idx][eng_idx]
                i = dut_idx

        Dict_eng[opp_english_word_dict[eng_idx]] = opp_dutch_word_dict[i]
        
    for dut_idx in range(num_of_dut_word): 
        maximum = -1
        i = 0
        for eng_idx in range(num_of_eng_word):
            if t_e_f[dut_idx][eng_idx] > maximum : 
                maximum = t_e_f[dut_idx][eng_idx]
                i = eng_idx

        Dict_dut[opp_dutch_word_dict[dut_idx]] = opp_english_word_dict[i]
        
    np.save("models/t_e_f_model1",t_e_f)
    np.save("models/dut_max_word_dict_1",Dict_dut)
    np.save("models/eng_max_word_dict_1",Dict_eng)
    
    if train_model2 == True :
        t_e_f, a_i_le_lf_mat = ibm_model_2(t_e_f,dutch_sentences,english_sentences,dutch_word_dict,english_word_dict,max_le,max_lf)
        np.save("models/t_e_f_model2",t_e_f)
        np.save("models/a_i_le_lf_model2",a_i_le_lf_mat)
        num_of_dut_word = t_e_f.shape[0]
        num_of_eng_word = t_e_f.shape[1]
        Dict_eng = {}
        Dict_dut = {}

        for eng_idx in range(num_of_eng_word): 
            maximum = -1
            i = 0
            for dut_idx in range(num_of_dut_word): 
                if t_e_f[dut_idx][eng_idx] > maximum : 
                    maximum = t_e_f[dut_idx][eng_idx]
                    i = dut_idx

            Dict_eng[opp_english_word_dict[eng_idx]] = opp_dutch_word_dict[i]

        for dut_idx in range(num_of_dut_word): 
            maximum = -1
            i = 0
            for eng_idx in range(num_of_eng_word):
                if t_e_f[dut_idx][eng_idx] > maximum : 
                    maximum = t_e_f[dut_idx][eng_idx]
                    i = eng_idx

            Dict_dut[opp_dutch_word_dict[dut_idx]] = opp_english_word_dict[i]
        np.save("models/dut_max_word_dict_2",Dict_dut)
        np.save("models/eng_max_word_dict_2",Dict_eng)


This segment of code returns the translated sentence. 


In [0]:
def get_tokens_of_sentence(sentence):
    translate_table = dict((ord(char), None) for char in string.punctuation)
    sentence = sentence.translate(translate_table)
    tokens = word_tokenize(sentence.lower())
    
    return tokens

def testing_sentence(sentence_to_translate,max_word_dict):

    f_sentence = get_tokens_of_sentence(sentence_to_translate)
    e_sentence = ""
    for word in f_sentence :
        if word in max_word_dict:
            e_sentence = e_sentence + max_word_dict[word] + " "
        else:
            print("word '"+ word +"' is not found in target language dictionary")
            continue

    return e_sentence


The method below calculates the cosine similarity and jaccard coefficient between any two sentences.

In [0]:
def vector_similarity(string1, string2,len_dict):  

    translate_table = dict((ord(char), None) for char in string.punctuation)
    string2 = string2.translate(translate_table)
    
    X_list = word_tokenize(string1)  
    Y_list = word_tokenize(string2.lower())
       
    X =[];Y =[]
     
    X_set = {w for w in X_list }  
    Y_set = {w for w in Y_list }
     
    rvector = X_set.union(Y_set)  
    for w in rvector:
        if w in X_set: 
            X.append(1) 
        else: 
            X.append(0)
        if w in Y_set:
            Y.append(1)
        else: 
            Y.append(0)
    
    len_dict = len(rvector)
    n = len(X)
    sum_X = 0
    sum_Y = 0
    sum_XY = 0
    squareSum_X = 0
    squareSum_Y = 0
     
    i = 0
    while i < n :
        sum_X = sum_X + X[i]         
        sum_Y = sum_Y + Y[i]
        sum_XY = sum_XY + X[i] * Y[i]
         
        squareSum_X = squareSum_X + X[i] * X[i]
        squareSum_Y = squareSum_Y + Y[i] * Y[i]
         
        i = i + 1
    
    c = 0
#     if (len_dict * squareSum_X - sum_X * sum_X) == 0 or (len_dict * squareSum_Y - sum_Y * sum_Y) == 0 :
#         corr = 1 
#     else :    
#         corr = (float)(len_dict * sum_XY - sum_X * sum_Y)/(float)(math.sqrt((len_dict * squareSum_X - sum_X * sum_X) * (len_dict * squareSum_Y - sum_Y * sum_Y)))        
    
    jac = float(len(X_set & Y_set)) / len(X_set | Y_set)
    
    for i in range(len(rvector)):
        c += X[i]*Y[i]
    if sum(X) == 0 or sum(Y) == 0 :
        cosine = 0
        print("test")
    else :    
        cosine = c / float((sum(X)*sum(Y))**0.5)
    
    print("Cosine Similarity: ", cosine)
    print("Jaccard Coefficient: ", jac)
    
    return cosine , jac


The method below is used for testing our model on training and validation dataset. It takes into account the direction of translation as specified by the user(dutch to english or english to dutch). 

In [0]:
def test_model(lang1_to_lang2, type_set, model_no):

    e_word_dict = np.load("models/dut_word_dict.npy",allow_pickle = True).item()
    f_word_dict = np.load("models/eng_word_dict.npy",allow_pickle = True).item()
    
    len_eng_dict = len(e_word_dict)
    len_dut_dict = len(f_word_dict)
    
    if model_no == 1 :
        e_max_word_dict = np.load("models/dut_max_word_dict_1.npy",allow_pickle = True).item()
        f_max_word_dict = np.load("models/eng_max_word_dict_1.npy",allow_pickle = True).item()
    elif model_no == 2 :
        e_max_word_dict = np.load("models/dut_max_word_dict_2.npy",allow_pickle = True).item()
        f_max_word_dict = np.load("models/eng_max_word_dict_2.npy",allow_pickle = True).item()
        
    no_of_train_samples = 10000

    if lang1_to_lang2 == "eng_to_dut" and type_set == 2 :
        with open("Dutch_Updated.txt", encoding="utf8") as f:
                sentences_dut = f.readlines()
        with open("English_Updated.txt", encoding="utf8") as f:
                sentences_eng = f.readlines()
        
        sentences_dut_red = list()
        sentences_eng_red = list()
        for sen_idx in range(len(sentences_eng)):
            if sen_idx > 500000 :
                break
            cur_eng_sen = sentences_eng[sen_idx].split()
            if len(cur_eng_sen) < 11:
                sentences_dut_red.append(sentences_dut[sen_idx])
                sentences_eng_red.append(sentences_eng[sen_idx])

        cos_sim = 0 
        jac_sim = 0
        i = no_of_train_samples - 1
        no_of_samples = 100
        for sentence in sentences_eng[no_of_train_samples:(no_of_train_samples + no_of_samples)]:
            i = i+1
            sen_temp = testing_sentence(sentence,f_max_word_dict)
            cos_temp,jac_temp = vector_similarity(sen_temp,sentences_dut[i],len_dut_dict)
            if cos_temp == 0 :
                print(sentences_dut[i])
                print(sen_temp)
            cos_sim = cos_sim + cos_temp
            jac_sim = jac_sim + jac_temp
        
        cos_sim = cos_sim/no_of_samples
        jac_sim = jac_sim/no_of_samples
        print("Average Cosine Similiarity for Validation Set" ,cos_sim)
        print("Average Jaccard Coefficient for Validation Set" ,jac_sim)
        
    elif lang1_to_lang2 == "dut_to_eng" and type_set == 2 :
        with open("Dutch_Updated.txt", encoding="utf8") as f:
                sentences_dut = f.readlines()
        with open("English_Updated.txt", encoding="utf8") as f:
                sentences_eng = f.readlines()
        
        sentences_dut_red = list()
        sentences_eng_red = list()
        for sen_idx in range(len(sentences_dut)):
            if sen_idx > 500000 :
                break
            cur_dut_sen = sentences_dut[sen_idx].split()
            if len(cur_dut_sen) < 11:
                sentences_dut_red.append(sentences_dut[sen_idx])
                sentences_eng_red.append(sentences_eng[sen_idx])

        cos_sim = 0 
        jac_sim = 0
        i = no_of_train_samples - 1
        no_of_samples = 100
        for sentence in sentences_dut[no_of_train_samples:(no_of_train_samples + no_of_samples)]:
            i = i+1
            sen_temp = testing_sentence(sentence,e_max_word_dict)
            cos_temp,jac_temp = vector_similarity(sen_temp,sentences_eng[i],len_eng_dict)
            if cos_temp == 0 :
                print(sentences_eng[i])
                print(sen_temp)
            cos_sim = cos_sim + cos_temp
            jac_sim = jac_sim + jac_temp
        
        cos_sim = cos_sim/no_of_samples
        jac_sim = jac_sim/no_of_samples
        print("Average Cosine Similiarity for Validation Set" ,cos_sim)
        print("Average Jaccard Coefficient for Validation Set" ,jac_sim)
        
    elif lang1_to_lang2 == "eng_to_dut" and type_set == 1 :
        with open("Dutch_Updated.txt", encoding="utf8") as f:
                sentences_dut = f.readlines()
        with open("English_Updated.txt", encoding="utf8") as f:
                sentences_eng = f.readlines()
        
        sentences_dut_red = list()
        sentences_eng_red = list()
        for sen_idx in range(len(sentences_eng)):
            if sen_idx > 500000 :
                break
            cur_eng_sen = sentences_eng[sen_idx].split()
            if len(cur_eng_sen) < 11:
                sentences_dut_red.append(sentences_dut[sen_idx])
                sentences_eng_red.append(sentences_eng[sen_idx])

        cos_sim = 0 
        jac_sim = 0
        i = -1
        no_of_samples = 100
        for sentence in sentences_eng[ 0 : no_of_train_samples ]:
            i = i+1
            sen_temp = testing_sentence(sentence,f_max_word_dict)
            cos_temp,jac_temp = vector_similarity(sen_temp,sentences_dut[i],len_dut_dict)
            if cos_temp == 0 :
                print(sentences_dut[i])
                print(sen_temp)
            cos_sim = cos_sim + cos_temp
            jac_sim = jac_sim + jac_temp
        
        cos_sim = cos_sim/no_of_train_samples
        jac_sim = jac_sim/no_of_train_samples
        print("Average Cosine Similiarity for Training Set" ,cos_sim)
        print("Average Jaccard Coefficient for Training Set" ,jac_sim)
    
    elif lang1_to_lang2 == "dut_to_eng" and type_set == 1 :
        with open("Dutch_Updated.txt", encoding="utf8") as f:
                sentences_dut = f.readlines()
        with open("English_Updated.txt", encoding="utf8") as f:
                sentences_eng = f.readlines()
        
        sentences_dut_red = list()
        sentences_eng_red = list()
        for sen_idx in range(len(sentences_dut)):
            if sen_idx > 500000 :
                break
            cur_dut_sen = sentences_dut[sen_idx].split()
            if len(cur_dut_sen) < 11:
                sentences_dut_red.append(sentences_dut[sen_idx])
                sentences_eng_red.append(sentences_eng[sen_idx])

        cos_sim = 0 
        jac_sim = 0
        i = -1
        no_of_samples = 100
        for sentence in sentences_dut[ 0 : no_of_train_samples ]:
            i = i+1
            sen_temp = testing_sentence(sentence,e_max_word_dict)
            cos_temp,jac_temp = vector_similarity(sen_temp,sentences_eng[i],len_eng_dict)
            if cos_temp == 0 :
                print(sentences_eng[i])
                print(sen_temp)
            cos_sim = cos_sim + cos_temp
            jac_sim = jac_sim + jac_temp
        
        cos_sim = cos_sim/no_of_train_samples
        jac_sim = jac_sim/no_of_train_samples
        print("Average Cosine Similiarity for Training Set" ,cos_sim)
        print("Average Jaccard Coefficient for Training Set" ,jac_sim)



The method below is used for testing our model on unknown dataset. It takes into account the direction of translation as specified by the user(dutch to english or english to dutch).

In [0]:
def test_model_new(lang1_to_lang2,dutch_file,english_file,model_no):

    e_word_dict = np.load("models/dut_word_dict.npy",allow_pickle = True).item()
    f_word_dict = np.load("models/eng_word_dict.npy",allow_pickle = True).item()
   
    len_eng_dict = len(e_word_dict)
    len_dut_dict = len(f_word_dict)
   
    if model_no == 1 :
        e_max_word_dict = np.load("models/dut_max_word_dict_1.npy",allow_pickle = True).item()
        f_max_word_dict = np.load("models/eng_max_word_dict_1.npy",allow_pickle = True).item()
    elif model_no == 2 :
        e_max_word_dict = np.load("models/dut_max_word_dict_2.npy",allow_pickle = True).item()
        f_max_word_dict = np.load("models/eng_max_word_dict_2.npy",allow_pickle = True).item()
       
    if lang1_to_lang2 == "eng_to_dut" :
        with open(dutch_file, encoding="utf8") as f:
                sentences_dut = f.readlines()
        with open(english_file, encoding="utf8") as f:
                sentences_eng = f.readlines()
       
        cos_sim = 0
        jac_sim = 0
        i = 0
        for sentence in sentences_eng :
            i = i+1
            sen_temp = sentence_tester(sentence,f_max_word_dict)
            cos_temp,jac_temp = vector_acc(sen_temp,sentences_dut[i],len_dut_dict)
            if cos_temp == 0 :
                print(sentences_dut[i])
                print(sen_temp)
            cos_sim = cos_sim + cos_temp
            jac_sim = jac_sim + jac_temp
       
        cos_sim = cos_sim/i
        jac_sim = jac_sim/i
        print("Average Cosine Similiarity for Test Set" ,cos_sim)
        print("Average Jaccard Coefficient for Test Set" ,jac_sim)
    elif lang1_to_lang2 == "eng_to_dut" :
        with open(dutch_file, encoding="utf8") as f:
                sentences_dut = f.readlines()
        with open(english_file, encoding="utf8") as f:
                sentences_eng = f.readlines()

        cos_sim = 0
        jac_sim = 0
        i = 0
        for sentence in sentences_dut :
            i = i+1
            sen_temp = sentence_tester(sentence,e_max_word_dict)
            cos_temp,jac_temp = vector_acc(sen_temp,sentences_eng[i],len_eng_dict)
            if cos_temp == 0 :
                print(sentences_eng[i])
                print(sen_temp)
            cos_sim = cos_sim + cos_temp
            jac_sim = jac_sim + jac_temp

        cos_sim = cos_sim/i
        jac_sim = jac_sim/i
        print("Average Cosine Similiarity for Test Set" ,cos_sim)
        print("Average Jaccard Coefficient for Test Set" ,jac_sim)  


This is the program interface as visible to the user. Once the user runs the code, he/she is prompted to enter a choice between 1 to 4. Choice 1 allows the user to train the data, choice 2 allows for testing From English to Dutch, choice 3 allows for testing From Dutch to English. Additionally, by choosing option 4, user can exit the program.
Upon the selection of either option 2 or 3, the user is allowed to choose a model of his/her choice - IBM model 1 or IBM model 2.


In [22]:
while True:
    try:
        mode = int(input('\nEnter your choice: \n\t1: Training \n\t2: Testing From English to Dutch \n\t3: Testing From Dutch to English \n\t4: Testing for Test Set \n\t5:Exit\n'))
    except ValueError:
        print ("Not a number")

    if mode == 1:
        train_model2 = int(input("\nEnter choice of training: \n\t1:Both Model 1 and 2 \n\t2:Only Model1 \n"))
        if train_model2 == 1 :
            train_models(True)
        elif train_model2 == 2 :
            train_models(False)
    elif mode == 2:
        type_set = int(input("\nEnter the type of dataset to be tested: \n\t1: Training \n\t2: Validation \n"))
        lang1_to_lang2 = "eng_to_dut"
        model_no = int(input("\nEnter the Model to be tested: \n\t1:IBM Model 1 \n\t2:IBM Model 2 \n"))
        test_model(lang1_to_lang2,type_set,model_no)
    elif mode == 3:
        type_set = int(input("\nEnter the type of dataset to be tested: \n\t1: Training \n\t2: Validation \n"))
        lang1_to_lang2 = "dut_to_eng"
        model_no = int(input("\nEnter the Model to be tested: \n\t1:IBM Model 1 \n\t2:IBM Model 2 \n"))        
        test_model(lang1_to_lang2,type_set,model_no)
    elif mode == 4:
        dutch_file = input("\nEnter the name of Dutch file")
        english_file = input("\nEnter the name of English file")
        lang = input("\nEnter the language to be translated from \n\tEnglish - eng \n\tDutch - dut")
        if lang == "eng" :
            lang1_to_lang2 = "eng_to_dut"
        elif lang == "dut" :
            lang1_to_lang2 = "dut_to_eng"
        model_no = int(input("\nEnter the Model to be tested: \n\t1:IBM Model 1 \n\t2:IBM Model 2 \n"))
        test_model_new(lang1_to_lang2,dutch_file,english_file,model_no)  
    elif mode == 5:
        break
    else:
        print("Invalid Choice")

print("End.")


Enter your choice: 
	1: Training 
	2: Testing From English to Dutch 
	3: Testing From Dutch to English 
	4: Testing for Test Set 
	5:Exit
2

Enter the type of dataset to be tested: 
	1: Training 
	2: Validation 
2

Enter the Model to be tested: 
	1:IBM Model 1 
	2:IBM Model 2 
1
Cosine Similarity:  0.0
Jaccard Coefficient:  0.0
In de allerlaatste dagen heeft de landbouwsector op vele punten zijn bezorgdheid geuit.

u kan alleen bord als u hebben een adequate vraag van schoon water 
Cosine Similarity:  0.44177063089637436
Jaccard Coefficient:  0.275
Cosine Similarity:  0.4334607234315054
Jaccard Coefficient:  0.275
Cosine Similarity:  0.5809475019311126
Jaccard Coefficient:  0.4090909090909091
Cosine Similarity:  0.4364357804719847
Jaccard Coefficient:  0.27906976744186046
Cosine Similarity:  0.5270462766947299
Jaccard Coefficient:  0.35714285714285715
Cosine Similarity:  0.45907808504876707
Jaccard Coefficient:  0.2978723404255319
word 'overcoming' is not found in target language dict