In [1]:
%matplotlib inline

import numpy as np
import sys
import os.path
import heapq
import random
import math
import theanolm

sys.path.insert(0, '/home/ec2-user/kklab/Projects/lrlp/scripts/oov_translate')
from utils import *

In [2]:
### directory that receives the per-sentence temporary files of the
### word-level run; it is created only when it does not exist yet
tmp_dir = "".join((exp_dir, "oov_trans_wordlevel/"))
if os.path.exists(tmp_dir) is False:
    os.makedirs(tmp_dir)

In [3]:
def train_lm(lm_model,
             train_data,
             dev_data,
             vocab,
             vocab_format,
             arch,
             learning_rate,
             optimization_method,
             stopping_condition,
             validation_frequency,
             patience):
    '''
    build a "theanolm train" command line and run it through sh_realtime;
    every argument is spliced into the command as-is, so all of them must be strings
    param:
        lm_model: path to lm file
        train_data: path to training data (--training-set)
        dev_data: path to validation data (--validation-file)
        vocab: value for --vocabulary
        vocab_format: value for --vocabulary-format
        arch: value for --architecture
        learning_rate: value for --learning-rate
        optimization_method: value for --optimization-method
        stopping_condition: value for --stopping-condition
        validation_frequency: value for --validation-frequency
        patience: value for --patience
    '''
    ### flag/value pairs; the original command emitted "--optimization-method"
    ### without its value, so the next flag was left where the value belongs
    command = " ".join([
        "theanolm train", lm_model,
        "--training-set", train_data,
        "--validation-file", dev_data,
        "--vocabulary", vocab,
        "--vocabulary-format", vocab_format,
        "--architecture", arch,
        "--learning-rate", learning_rate,
        "--optimization-method", optimization_method,
        "--stopping-condition", stopping_condition,
        "--validation-frequency", validation_frequency,
        "--patience", patience,
    ])
    sh_realtime(command)

def find_prob(res_list):
    '''
    param:
        res_list: list of output lines for one scored sentence
    return:
        the number after the last ' = ' on the end-of-sentence line, i.e. the
        last line containing 'log(p(</s> |' when searching backwards from the
        second-to-last entry; IndexError is raised when no such line exists
    '''
    idx = -2
    while True:
        line = res_list[idx]
        if 'log(p(</s> |' in line:
            return float(line.rpartition(' = ')[2])
        idx -= 1

def score_sent(lm_model, all_sent):
    '''
    run "theanolm score" with word-level output and collect one value per sentence
    param:
        lm_model: path to lm file
        all_sent: path to file of all sentences to be scored
    return:
        list of floats, one per blank-line-separated block of the command's
        stdout, each extracted by find_prob
    '''
    command = " ".join(["theanolm score", lm_model, all_sent,
                        "--output word-scores --log-base 10"])
    output, _ = sh(command)
    ### blocks are separated by an empty line; the piece after the final
    ### separator is dropped
    blocks = output.split("\n\n")[:-1]
    return [find_prob(block.split('\n')) for block in blocks]

def get_best_hyp_bynlm(lm_model, all_sentences, sent_tmp):
    '''
    write the hypotheses to sent_tmp (one per line), score that file with
    score_sent and pick the top-scoring hypothesis
    param:
        lm_model: path to lm file
        all_sentences: list of sentences
        sent_tmp: path of the file the sentences are written to; it is
                  overwritten and left on disk
    return:
        (best_sent, best_index): the first sentence of all_sentences with the
        highest score and its position in the list
    '''
    with open(sent_tmp, "w") as out:
        for hyp in all_sentences:
            out.write(hyp)
            out.write('\n')

    lm_scores = score_sent(lm_model, sent_tmp)
    top_score = max(lm_scores)
    winner = lm_scores.index(top_score)
    return all_sentences[winner], winner

def get_best_long_trans_bynlm(lm_model, tra_tok, oov_candidates, sent_tmp):
    '''
    for sentences with lots of oovs, we translate oovs one by one,
    for ug_dict
    each oov is replaced, left to right, by the candidate whose sentence
    (with the replacements chosen so far) scores best; tokens that are not a
    key of oov_candidates, that are listed among their own candidates, or
    that have no candidates at all are kept unchanged
    param:
        lm_model: path to lm file
        tra_tok: list of tokens
        oov_candidates: {oov:{candidate:score}}
        sent_tmp: path of the temporary file handed to get_best_hyp_bynlm
    return:
        best translation in string format
    '''
    tra_tok_new = list(tra_tok)
    for i, token in enumerate(tra_tok):
        if token not in oov_candidates:
            continue
        options = oov_candidates[token]
        if token in options:
            continue

        candidates = list(options.keys())
        ### nothing to choose from: scoring an empty hypothesis list would
        ### fail, so the token stays as it is
        if not candidates:
            continue

        all_hyp = []
        for candidate in candidates:
            sent = list(tra_tok_new)
            sent[i] = candidate
            all_hyp.append(' '.join(sent))

        _, best_index = get_best_hyp_bynlm(lm_model, all_hyp, sent_tmp)
        tra_tok_new[i] = candidates[best_index]

    return ' '.join(tra_tok_new)

def get_best_long_trans_bynlm_eng_vocab(lm_model, tra_tok, oov_words_set, oov_candidates, sent_tmp):
    '''
    for sentences with lots of oovs, we translate oovs one by one,
    for eng_vocab
    every oov position draws from the same candidate list: all entries of
    oov_candidates that do not already occur in tra_tok; when that list is
    empty the oovs are kept unchanged
    param:
        lm_model: path to lm file
        tra_tok: list of tokens
        oov_words_set: set of oov words
        oov_candidates: {candidate:score}
        sent_tmp: path of the temporary file handed to get_best_hyp_bynlm
    return:
        best translation in string format
    '''
    tra_tok_new = list(tra_tok)
    ### the candidate list depends only on the inputs, so it is built once,
    ### and only when the first oov is actually met
    candidates = None
    for i, token in enumerate(tra_tok):
        if token not in oov_words_set:
            continue

        if candidates is None:
            ### candidate translation, excluding candidates that appear in context
            context_tokens = set(tra_tok)
            candidates = [candidate for candidate in oov_candidates
                          if candidate not in context_tokens]
        if not candidates:
            ### scoring an empty hypothesis list would fail; keep the oovs
            break

        all_hyp = []
        for candidate in candidates:
            sent = list(tra_tok_new)
            sent[i] = candidate
            all_hyp.append(' '.join(sent))

        _, best_index = get_best_hyp_bynlm(lm_model, all_hyp, sent_tmp)
        tra_tok_new[i] = candidates[best_index]

    return ' '.join(tra_tok_new)

def oov_trans_wordlevel(candidate_source, add_aligned_oov, res_file,
                        lm_final_path="/home/ec2-user/kklab/Projects/lrlp/experiment_elisa.il3-eng.y1r1.v2/scripts/oov_translate/lm_neural/modelfun.h5"):
    '''
    load the resource that matches candidate_source and hand the work to
    multithread_routine with the wordlevel thread class
    param:
        candidate_source: "ug_dict" loads get_ug_dict(oov_candidates_file, 0),
                          "eng_vocab" loads get_eng_vocab(eng_vocab_file);
                          any other value loads nothing
        add_aligned_oov: forwarded unchanged together with candidate_source
        res_file: path of the result file, forwarded to multithread_routine
        lm_final_path: path to lm file; defaults to the model path that was
                       previously fixed inside this function
    '''
    ug_dict = None
    eng_vocab = None
    if candidate_source == "ug_dict":
        ug_dict = get_ug_dict(oov_candidates_file, 0)
    elif candidate_source == "eng_vocab":
        eng_vocab = get_eng_vocab(eng_vocab_file)

    method_params = [candidate_source, add_aligned_oov]
    ### order matches the trailing parameters of wordlevel.__init__
    shared_resources = [eng_vocab, ug_dict, lm_final_path]
    multithread_routine(method_params,
                        res_file,
                        shared_resources,
                        tmp_dir,
                        wordlevel)


class wordlevel (threading.Thread):
    '''
    worker thread that translates the oovs of the sentences whose index lies
    in [ctr_lo, ctr_up] and writes one best translation per line to res_file

    reads the notebook-level names tra_file, oov_file, tmp_dir and (for
    "ug_dict" only) dataset at run time
    '''

    ### sentence indices that are super long, per dataset; these are
    ### translated oov by oov instead of scoring all sentence hypotheses
    _LONG_SENTENCES = {"dev": (305, 452), "test": (316,)}

    def __init__ (self,
                  candidate_source,
                  add_aligned_oov,
                  res_file,
                  ctr_lo,
                  ctr_up,
                  eng_vocab,
                  ug_dict,
                  lm_final_path):
        '''
        param:
            candidate_source: "ug_dict" or "eng_vocab"
            add_aligned_oov: forwarded to get_oov_candidates_all
            res_file: path of the file this thread writes to
            ctr_lo, ctr_up: inclusive bounds of the sentence indices to handle
            eng_vocab, ug_dict: preloaded resources, forwarded to get_oov_candidates_all
            lm_final_path: path to lm file
        '''
        threading.Thread.__init__(self)

        ### method params
        self.candidate_source = candidate_source
        self.add_aligned_oov = add_aligned_oov

        ### one thread writes to one temporary file, later to be merged
        self.res_file = res_file

        ### lower and upper bounds of instance indices
        self.ctr_lo = ctr_lo
        self.ctr_up = ctr_up

        ### established, cached resources, passed as arguments from outside
        self.eng_vocab = eng_vocab
        self.ug_dict = ug_dict
        self.lm_final_path = lm_final_path

    def run(self):
        '''
        walk tra_file and oov_file in parallel and translate the sentences of
        this thread's index range
        '''
        with open(tra_file) as ft, \
        open(oov_file) as fo, \
        open(self.res_file, 'w') as fres:
            for ctr, l_tra in enumerate(ft):
                ### read the matching oov line for every translation line so
                ### that both files stay aligned
                l_oov = fo.readline()

                ### nothing past the upper bound belongs to this thread
                if ctr > self.ctr_up:
                    break
                if ctr < self.ctr_lo:
                    continue

                ###
                # tra_tok: tokenized translation with oov, with html unescaped
                # oov_pos: oov word posistions
                # context: context word positions
                ###
                tra_tok, oov_pos, context = get_context_oov_pos(l_tra, l_oov)
                oov_words_set = set([tra_tok[i] for i in oov_pos])
                context_words_set = set([tra_tok[i] for i in context])

                ### get oov candidates
                oov_candidates = get_oov_candidates_all(self.candidate_source,
                                                        self.add_aligned_oov,
                                                        self.ug_dict,
                                                        self.eng_vocab,
                                                        oov_words_set,
                                                        context_words_set)

                ### translate; the source is taken from this instance, not
                ### from a notebook variable that happens to share the name
                sent_tmp = tmp_dir+threading.current_thread().name+str(ctr)
                if self.candidate_source == "ug_dict":
                    ### a dataset without known long sentences has none
                    long_sentences = self._LONG_SENTENCES.get(dataset, ())

                    if ctr not in long_sentences:
                        all_sentences = get_all_sentences(tra_tok, oov_candidates)
                        best_trans, _ = get_best_hyp_bynlm(self.lm_final_path, all_sentences, sent_tmp)
                    else:
                        best_trans = get_best_long_trans_bynlm(self.lm_final_path, tra_tok, oov_candidates, sent_tmp)
                elif self.candidate_source == "eng_vocab":
                    best_trans = get_best_long_trans_bynlm_eng_vocab(self.lm_final_path, tra_tok, oov_words_set, oov_candidates, sent_tmp)
                else:
                    raise ValueError("unknown candidate_source: "+repr(self.candidate_source))

                print(ctr)
                print(best_trans)
                fres.write(best_trans+'\n')

In [4]:
### 4gram or wordlevel or charlevel
language_model = "wordlevel"

### common prefix of every result file; the dataset name is used for both
### the directory and the file name (the ug_dict file names previously had
### "test" fixed in them regardless of dataset)
res_prefix = "/home/ec2-user/kklab/Projects/lrlp/experiment_elisa.il3-eng.y1r1.v2/translation/"\
+dataset+"/elisa.il3-eng."+dataset+".y1r1.v2.translated.eng.oovtranslated."

### ug_dict or eng_vocab
for candidate_source in ["ug_dict", "eng_vocab"]:
    ### True or False, only meaningful when using ug_dict
    if candidate_source == "ug_dict":
        aligned_options = [False, True]
    else:
        aligned_options = [False]

    for add_aligned_oov in aligned_options:
        # -------- write --------
        if candidate_source == "ug_dict":
            if add_aligned_oov:
                res_name = "ug_dict_withAlignedOov"
            else:
                res_name = "ug_dict_withoutAlignedOov"
        else:
            res_name = "eng_vocab"
        res_file = res_prefix+res_name+".lm"
        res_file += "_"+language_model

        # -------- translate --------
        oov_trans_wordlevel(candidate_source, add_aligned_oov, res_file)

Number of threads: 36
Number of sentences: 347
200
بارماسلىقىڭىز .
257
He gave eight foundation _ his rectitude .
10
WARN
248
He returned from the office .
239
there are some which compose ?
180
all the drip جازىسىدىكىلار .
120
and he has quchan .
40
ئىشلەشكە guinea big good .
284
the creation of sin .
30
and will wryneck and good .
275
what I ?
311
چىقىۋاتىدۇ wind .
140
wonder good .
170
ۋوگزالغا diary should give for ?
220
where are my watches ?
90
there is a beautiful s daughter .
338
Frets On Fire
190
if bench , it would have been better for a few .
80
_ storeroom كولۇمبۇسنى all of a son .
150
the two there that a car .
293
create .
0
and climb يورۇتۇلمىنىڭ colour ,
210
ئوقۇۋاتىدۇ a son .
329
my my food is cheap .
320
a group .
70
give ساتىراشخانىغا .
302
search !
230
landowner .
20
and beware lest سېۋەتتە penetration water .
60
upon the time when he saw it wriggling ئۇچقاندەك .
160
what did you go there for ?
50
ياشقا favor ten in six days .
110
He dirt ئىشلەيدۇ o’clock .
100
this