In [1]:
%matplotlib inline

import numpy as np
import kenlm
import os
import os.path
import heapq
import random
import math
import sys
import re

sys.path.insert(0, '/home/ec2-user/kklab/Projects/lrlp/scripts/oov_translate')
from config import *
from utils import *

In [2]:
### language model directory
tmp_dir = exp_dir+"oov_trans_lm/"
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

In [3]:
def train_lm(n_gram, train_data, lm_model):
    '''
    param:
        ngram: length of "markovity"
        train_data: path to training data
        lm_model: path to lm file
    return:
        language model in kenLM format
    '''
#     if os.path.exists(lm_model):
#         print("Language model exists at: "+lm_model)
#     else:
    stdout, stderr = sh(lm_builder+\
                        " -o "+str(ngram)+\
                        " < "+train_data+\
                        " > "+lm_model)
    print("Language model generated at: "+lm_model)
    return kenlm.Model(lm_model)


def get_perplexity(lm_model, test_data):
    '''
    param:
        lm_model: path to lm file
        test_data: path to test data
    return:
        perp_w_oov: perplexity including OOVs
        perp_wo_oov: perplexity excluding OOVs
        oov_num: nubmer of OOVs
        token_num: number of tokens
    '''    
    stdout, _ = sh(query_perplexity+' '+lm_model+\
                   ' < '+test_data+\
                   ' | tail -n 4')
    stdout = stdout.strip().split('\n')
    perp_w_oov = float(stdout[0].split(':')[1].strip())
    perp_wo_oov = float(stdout[1].split(':')[1].strip())
    oov_num = int(stdout[2].split(':')[1].strip())
    token_num = int(stdout[3].split(':')[1].strip())
    return perp_w_oov, perp_wo_oov, oov_num, token_num


def get_cross_entropy(lm, sent):
    '''
    param:
        lm: language model in kenLM format
        sent: stripped sentence
    return:
        cross entropy estimation based on
    https://courses.engr.illinois.edu/cs498jh/Slides/Lecture04.pdf
    '''
    return -lm.score(sent)*1.0/len(sent.split(' '))


def get_best_hyp_lm(lm, \
                    pre_hyp, \
                    tra_tok, \
                    oov_candidates, \
                    best_score, \
                    best_hyp):
    '''
    score sentences in a dfs fashion, such that space complexity is O(1)
    for ug_dict only
    params:
        lm: language model in kenLM format
        pre_hyp: tokenized hyp with oov translated preceding tra_tok
        tra_tok: list of tokens with oov un-translated
        oov_candidates: {oov:{candidate:score}}
        best_score: highest log probability
        best_hyp: hypothesis with highest log probability, in string
    return:
        best_score: highest log probability updated
        hyp: oov translated hyp in string format
    '''
    if len(tra_tok) == 1:
        tok = tra_tok[0]
        if tok in oov_candidates and tok not in oov_candidates[tok]:
            candidates = list(oov_candidates[tok].keys())
            for candidate in candidates:
                hyp = ' '.join(pre_hyp+[candidate])
                score = lm.score(hyp)
                if score > best_score:
                    best_score = score
                    best_hyp = hyp
                #print(score)
                #print(hyp)
            return best_score, best_hyp
        else:
            hyp = ' '.join(pre_hyp+tra_tok)
            score = lm.score(hyp)
            if score > best_score:
                best_score = score
                best_hyp = hyp
            #print(score)
            #print(hyp)
            return best_score, best_hyp
    else:
        tok = tra_tok[0]
        if tok in oov_candidates and tok not in oov_candidates[tok]:
            candidates = list(oov_candidates[tok].keys())
            for candidate in candidates:
                score, hyp = get_best_hyp_lm(lm, pre_hyp+[candidate], tra_tok[1:], oov_candidates, best_score, best_hyp)
                if score > best_score:
                    best_score = score
                    best_hyp = hyp
            return best_score, best_hyp
        else:
            return get_best_hyp_lm(lm, pre_hyp+[tok], tra_tok[1:], oov_candidates, best_score, best_hyp)


# DEPRECATED
def get_best_hyp_bylm(lm, all_sentences):
    '''
    pick the translation with the highest probability
    param:
        lm: language model in kenLM format
        all_sentences: list of sentences
    return:
        best_sent: a sentence from all_sentences that has the highest score by lm
    '''
    best_score = -math.inf
    best_sent = None
    for sent in all_sentences:
        scr = lm.score(sent)
        if scr > best_score:
            best_score = scr
            best_sent = sent
        #print(scr)
        #print(sent)
    return best_score, best_sent


# DEPRECATED
def get_best_hyp_est_lm(lm, tra_tok, oov_words_set, oov_candidates, candidate_source):
    '''
    for sentences with lots of oovs, we translate oovs one by one,
    for eng_vocab and ug_dict
    param:
        lm: language model in kenLM format
        tra_tok: list of tokens
        oov_words_set: set of oov words
        oov_candidates: {candidate:score}
    return:
        best translation in string format 
    '''
    tra_tok_new = list(tra_tok)
    for i in range(len(tra_tok)):
        candidates = None
        if (candidate_source == "eng_vocab") and (tra_tok[i] in oov_words_set):
            candidates = [candidate for candidate in oov_candidates if candidate not in tra_tok]
        elif (candidate_source == "ug_dict") and (tra_tok[i] in oov_candidates) and (tra_tok[i] not in oov_candidates[tra_tok[i]]):
            candidates = list(oov_candidates[tra_tok[i]].keys())
        
        if candidates != None:
            highest_score = -math.inf
            best_candidate = None
            for candidate in candidates:
                sent = list(tra_tok_new)
                sent[i] = candidate
                scr = lm.score(' '.join(sent))
                if scr > highest_score:
                    highest_score = scr
                    best_candidate = candidate
            tra_tok_new[i] = best_candidate
    return ' '.join(tra_tok_new)


def get_best_hyp_lattice_lm(lm_final_path, tra_tok, oov_pos, oov_candidates, candidate_source, lazy_dir):
    '''
    translate oov words by forming a word lattice and decoding using beam search
    params:
        lm_final_path: path to lm in kenLM format, not binary
        tra_tok: list of tokens
        oov_pos: list of oov positions in tra_tok
        oov_candidates: {oov:{candidate:score}}
        lazy_dir: direcotry in which to put all hypergraph files
    return:
        best translation in string format
    '''
    lazy_file = lazy_dir + '0'

    total_vertex_count = 2
    total_edge_count = 2

    lazy_text = []
    lazy_text.append('1')
    lazy_text.append('<s> |||')
    vertex_idx = 0
    for pos in range(len(tra_tok)):
        if pos not in oov_pos:
            lazy_text.append('1')
            lazy_text.append('['+str(vertex_idx)+'] '+tra_tok[pos]+' |||')
            total_vertex_count += 1
            total_edge_count += 1
            vertex_idx += 1
        else:
            if candidate_source == "ug_dict":
                candidates = oov_candidates[tra_tok[pos]]
            elif candidate_source == "eng_vocab":
                candidates = oov_candidates.keys()
            candidate_num = len(candidates)
            lazy_text.append(str(candidate_num))
            for candidate in candidates:         
                lazy_text.append('['+str(vertex_idx)+'] '+candidate+' |||')     
            total_vertex_count += 1
            total_edge_count += candidate_num
            vertex_idx += 1
    lazy_text.append('1')
    lazy_text.append('['+str(vertex_idx)+'] </s> |||')

    ### the tokens <s> and </s> should appear explicitly
    with open(lazy_file, 'w') as flazy:
        flazy.write('{} {}'.format(total_vertex_count, total_edge_count)+'\n')
        for line in lazy_text:
            flazy.write(line+'\n')

    beam_size = 100

    #sh("vim -c wq "+lazy_file)
    stdout, stderr = sh(hypergraph_dec+\
                        " -i "+lazy_dir+\
                        " -l "+lm_final_path+".binary"+\
                        " -K "+str(beam_size)+\
                        " -W LanguageModel=1.0 LanguageModel_OOV=0 WordPenalty=0")

    return stdout.split('\n')[0].split('|||')[1].strip()



# def get_best_long_trans_bylm(lm, tra_tok, oov_candidates):
#     '''
#     for sentences with lots of oovs, we translate oovs one by one,
#     for ug_dict
#     param:
#         lm: language model in kenLM format
#         tra_tok: list of tokens
#         oov_candidates: {oov:{candidate:score}}
#     return:
#         best translation in string format
#     '''
#     tra_tok_new = list(tra_tok)
#     for i in range(len(tra_tok)):
#         if tra_tok[i] in oov_candidates and tra_tok[i] not in oov_candidates[tra_tok[i]]:
#             candidates = list(oov_candidates[tra_tok[i]].keys())
            
#             all_hyp = []
#             for candidate in candidates:
#                 sent = list(tra_tok_new)
#                 sent[i] = candidate
#                 all_hyp.append(' '.join(sent))
            
#             best_candidate = get_best_hyp_bylm(lm, all_hyp)
#             tra_tok_new[i] = candidates[all_hyp.index(best_candidate)]
    
#     return ' '.join(tra_tok_new)



# def get_best_long_trans_bylm_eng_vocab(lm, tra_tok, oov_words_set, oov_candidates):
#     '''
#     for sentences with lots of oovs, we translate oovs one by one,
#     for eng_vocab
#     param:
#         lm: language model in kenLM format
#         tra_tok: list of tokens
#         oov_words_set: set of oov words
#         oov_candidates: {candidate:score}
#     return:
#         best translation in string format 
#     '''
#     tra_tok_new = list(tra_tok)
#     for i in range(len(tra_tok)):
#         if tra_tok[i] in oov_words_set:
#             ### candidate translation, excluding candidates that appear in context
#             candidates = [candidate for candidate in oov_candidates if candidate not in tra_tok]
            
#             highest_score = -math.inf
#             best_candidate = None
#             for candidate in candidates:
#                 sent = list(tra_tok_new)
#                 sent[i] = candidate
#                 scr = lm.score(' '.join(sent))
#                 if scr > highest_score:
#                     highest_score = scr
#                     best_candidate = candidate
#             tra_tok_new[i] = best_candidate
    
#     return ' '.join(tra_tok_new)


def get_lm_final(train_in_domain0, \
                 dev_in_domain0, \
                 train_non_domain_all0, \
                 language_model, \
                 lm_final_path):
    '''
    params:
        train_in_domain0: path to in domain training data
        dev_in_domain0: path to in domain dev data
        train_non_domain_all0: path to non domain training data
        language_model: <4>gram or neural
        lm_final_path: path to lm trained on {in domain training, non domain subset}
    return:
        None
    '''
    
    if "gram" in language_model:
        ngram = language_model.strip("gram")

    # -------- train language model on in-domain training data --------
    ### 1. in-domain training data
    #train_in_domain0 = train_in_domain_dir+"elisa."+st+".train."+yrv+".true."+t
    ### 2. in-domain language model path
    lm_in_domain_path = tmp_dir+"lm_"+language_model+"_in_domain"
    ### 3. training in-domain language model
    lm_in_domain = train_lm(ngram, train_in_domain0, lm_in_domain_path)
    print("----------------")
    
    # -------- train language model on non-domain training data --------
    ### 1. non-domain training data (subset)
    train_non_domain_sub = tmp_dir+train_non_domain_all0.split('/')[-1]+".subset"
    ### 2. non-domain language model path (subset)
    lm_non_domain_subset_path = tmp_dir+"lm_"+language_model+"_non_domain_subset_1"
    ### 3. non-domain language model training (subset)
    total_num_non_domain_sub = 500000
    #if not os.path.exists(train_non_domain_sub):
    random_sample(train_non_domain_all0, \
                  total_num_non_domain_sub, \
                  train_non_domain_sub)
    #else:
    #    print("Sample file exists at: "+train_non_domain_sub)
    lm_non_domain_subset = train_lm(ngram, train_non_domain_sub, lm_non_domain_subset_path)
    print("----------------")
    
    # -------- train language model on non-domain training sample --------
    ### 1. non-domain training data (random sample)
    train_non_domain = train_non_domain_sub+".sample"
    ### 2. non-domain language model path (random sample)
    lm_non_domain_path = tmp_dir+"lm_"+language_model+"_non_domain"
    ### 3. non-domain language model training (random sample)
    total_num_in_domain = get_file_length(train_in_domain0)
    #if not os.path.exists(train_non_domain):
    total_num_non_domain = random_sample(train_non_domain_sub, \
                                         total_num_in_domain, \
                                         train_non_domain)
    #else:
    #    print("Sample file exists at: "+train_non_domain)
    #    total_num_non_domain = get_file_length(train_non_domain)
    print("total_num_in_domain: "+str(total_num_in_domain))
    print("total_num_non_domain: "+str(total_num_non_domain))
    lm_non_domain = train_lm(ngram, train_non_domain, lm_non_domain_path)
    print("----------------")
    
    # -------- score each sentence in non-domain training --------
    ### by the difference in cross entropy,
    ### sort the sentences by their scores
    ### stored in a heap of score and sentence tuples

    denominator = 2.0
    cutoff_num_non_domain = total_num_non_domain_sub/denominator

    ### heap
    score_sent = []
    with open(train_non_domain_sub) as f:
        for n, line in enumerate(f):
            sent = line.rstrip()
            cross_entropy_in_domain = get_cross_entropy(lm_in_domain, sent)
            cross_entropy_non_domain = get_cross_entropy(lm_non_domain, sent)
            cross_entropy_diff = -(cross_entropy_in_domain - cross_entropy_non_domain)
            if len(score_sent) < cutoff_num_non_domain:
                heapq.heappush(score_sent, (cross_entropy_diff, sent))
            else:
                spilled = heapq.heappushpop(score_sent, (cross_entropy_diff, sent))
    print("1/"+str(denominator)+" of the of non-domain training data has been loaded to heap.")
    print("----------------")
    
    # -------- test language models trained on subsets of non-domain training on in-domain dev --------
    #dev_in_domain0 = dev_in_domain_dir+"elisa."+st+".train."+yrv+".true."+t
    perp = math.inf
    perp_subset, _, _, _ = get_perplexity(lm_non_domain_subset_path, dev_in_domain0)
    print("perplexity of the language model on in-domain dev data trained on "\
          +str(total_num_non_domain_sub)+" sentences selected from non-domain training data: "\
          +str(perp_subset))

    while (perp_subset <= perp) and (denominator < total_num_non_domain_sub/200):
        ### 1. non-domain training data (subset)
        train_non_domain_subset = train_non_domain_sub+"."+str(int(denominator))
        ### 2. non-domain language model path (subset)
        lm_non_domain_subset_path = tmp_dir+"lm_"+language_model+"_non_domain_subset_"+str(denominator)
        ### 3. non-domain language model training (subset)
        with open(train_non_domain_subset, 'w') as fw:
            for pair in score_sent:
                fw.write(pair[1]+'\n')
        print("1/"+str(denominator)+" of the non-domain training data fetched.")
        lm_non_domain_subset = train_lm(ngram, train_non_domain_subset, lm_non_domain_subset_path)

        print("----------------")

        ### perplexity on in-domain dev data
        perp_w_oov, _, _, _ = get_perplexity(lm_non_domain_subset_path, dev_in_domain0)
        perp = perp_subset
        perp_subset = perp_w_oov
        print("perplexity of in-domain dev data from the language model trained on "\
              +str(cutoff_num_non_domain)+" sentences selected from non-domain training data: "\
              +str(perp_w_oov))

        print("----------------")

        ### fetch top half of the non-domain training data sorted by cross-entropy diff
        ctr = 0
        cutoff_num_non_domain = cutoff_num_non_domain/2
        while ctr < cutoff_num_non_domain:
            spilled = heapq.heappop(score_sent)
            ctr += 1
        print(str(ctr)+" sentences sampled as a subset.")

        denominator *= 2


    denominator_final = int(denominator/4)
    train_non_domain_subset_final = train_non_domain_sub+"."+str(denominator_final)
    print("----------------")
    
    # -------- merge non-domain subset and in-domain training --------
    ###  1. in-domain + non-domain training data
    train_final = train_in_domain0+".final_"+language_model
    ### 2. in-domain + non-domain language model path
    #lm_final_path = tmp_dir+"lm_"+language_model+"_final"
    ### 3. in-domain + non-domain language model training
    merge_files([train_non_domain_subset_final, train_in_domain0], train_final)
    train_lm(ngram, train_final, lm_final_path)
    sh(build_binary+" "+lm_final_path+" "+lm_final_path+".binary")
    print("----------------")
    
    #return lm_final_path, train_non_domain_subset_final, train_final

        
def oov_trans_lm(candidate_source, \
                 add_aligned_oov, \
                 language_model, \
                 res_file, \
                 restrict_vocab):
    '''
    params:
        candidate_source: ug_dict or eng_vocab
        add_aligned_oov: True or False
        language_model: <4>gram, neural
        res_file: path to oov translation result
        restrict_vocab: True or False
    return:
        None
    '''
    #train_in_domain = train_in_domain_dir+"elisa."+st+".train."+yrv+".true."+t
    #dev_in_domain = dev_in_domain_dir+"elisa."+st+".dev."+yrv+".true."+t
    lm_final_path = tmp_dir+"lm_"+language_model+"_final"
    if restrict_vocab:
        lm_final_path = lm_final_path + "_restrict_vocab"
    
    if os.path.exists(lm_final_path+".binary"):
        print(language_model+" language model already exists at: "+lm_final_path+".binary")
    else:
        if not restrict_vocab:
            get_lm_final(train_in_domain, \
                         dev_in_domain, \
                         train_non_domain_all, \
                         language_model, \
                         lm_final_path)
        else:
            train_non_domain_all_restrict_vocab = tmp_dir+train_non_domain_all.split('/')[-1]+"_restrict_vocab"
            if not os.path.exists(train_non_domain_all_restrict_vocab):
                vocab_set = build_vocab()
                set_unk(train_non_domain_all, train_non_domain_all_restrict_vocab, vocab_set)
            else:
                print("UNK has been set in: "+train_non_domain_all_restrict_vocab)
            get_lm_final(train_in_domain, \
                         dev_in_domain, \
                         train_non_domain_all_restrict_vocab, \
                         language_model, \
                         lm_final_path)
            
    lm_final = kenlm.Model(lm_final_path+".binary")
    
    eng_vocab = None
    ug_dict = None
    if candidate_source == "ug_dict":
        ug_dict = get_ug_dict(oov_candidates_file, 0)
    elif candidate_source == "eng_vocab":
        eng_vocab = get_eng_vocab(eng_vocab_file)
    
    ### multithreading below
    multithread_routine([candidate_source, add_aligned_oov, language_model], \
                        res_file, \
                        [eng_vocab, ug_dict, lm_final, lm_final_path], \
                        tmp_dir, \
                        LM)

    
class LM (threading.Thread):
    def __init__(self, \
                 candidate_source, \
                 add_aligned_oov, \
                 language_model, \
                 res_file, \
                 ctr_lo, \
                 ctr_up, \
                 eng_vocab, \
                 ug_dict, \
                 lm_final, \
                 lm_final_path):
        '''
        params:
            method params: list
            res_file
            ctr_lo
            ctr_up
            cached resource: list
        return:
            an LM instance
        '''
        
        threading.Thread.__init__(self)
        
        ### method params
        self.candidate_source = candidate_source
        self.add_aligned_oov = add_aligned_oov
        self.language_model = language_model
        
        ### one thread writes to one temporary file, later to be merged
        self.res_file = res_file 
        
        ### lower and upper bounds of instance indices
        self.ctr_lo = ctr_lo
        self.ctr_up = ctr_up
        
        ### established, cached resources, passed as arguments from outside
        self.eng_vocab = eng_vocab
        self.ug_dict = ug_dict
        self.lm_final = lm_final
        self.lm_final_path = lm_final_path
        
    def run(self):
        ctr = 0
        with open(tra_file) as ft, \
        open(oov_file) as fo, \
        open(self.res_file, 'w') as fres:
            for l_tra in ft:
                l_oov = fo.readline()

                if ctr >= self.ctr_lo and ctr <= self.ctr_up:
                    ###
                    # tra_tok: tokenized translation with oov, with html unescaped
                    # oov_pos: oov word posistions
                    # context: context word positions
                    ###
                    tra_tok, oov_pos, context = get_context_oov_pos(l_tra, l_oov)
                    oov_words_set = set([tra_tok[i] for i in oov_pos])
                    context_words_set = set([tra_tok[i] for i in context])

                    ### get oov candidates
                    oov_candidates = get_oov_candidates_all(self.candidate_source, \
                                                            self.add_aligned_oov, \
                                                            self.ug_dict, \
                                                            self.eng_vocab, \
                                                            oov_words_set, \
                                                            context_words_set)

                    ### translate
                    if self.candidate_source == "ug_dict":
                        num_hyp = get_num_hyp(oov_candidates, tra_tok, oov_pos)
                        
                        if num_hyp <= 50**2.8:
                            _, best_trans = get_best_hyp_lm(self.lm_final, \
                                                            [], \
                                                            tra_tok, \
                                                            oov_candidates, \
                                                            -math.inf, \
                                                            ' '.join(tra_tok))
                            #all_sentences = get_all_sentences(tra_tok, oov_candidates)
                            #best_score, best_trans = get_best_hyp_bylm(lm_final, all_sentences)
                        else:
                            lazy_dir = tmp_dir+candidate_source+"_"+str(add_aligned_oov)+"_"+language_model+"/"
                            if not os.path.exists(lazy_dir):
                                os.makedirs(lazy_dir)
                            best_trans = get_best_hyp_lattice_lm(self.lm_final_path, \
                                                                 tra_tok, \
                                                                 oov_pos, \
                                                                 oov_candidates, \
                                                                 self.candidate_source, \
                                                                 lazy_dir)
#                             best_trans = get_best_hyp_est_lm(self.lm_final, \
#                                                              tra_tok, \
#                                                              oov_words_set, \
#                                                              oov_candidates, \
#                                                              self.candidate_source)

                    elif candidate_source == "eng_vocab":
                        lazy_dir = tmp_dir+candidate_source+"_"+str(add_aligned_oov)+"_"+language_model+"_"+str(ctr)+"/"
                        if not os.path.exists(lazy_dir):
                            os.makedirs(lazy_dir)
                        best_trans = get_best_hyp_lattice_lm(self.lm_final_path, \
                                                             tra_tok, \
                                                             oov_pos, \
                                                             oov_candidates, \
                                                             self.candidate_source, \
                                                             lazy_dir)
#                         num_hyp = len(oov_candidates)*len(oov_pos)
#                         best_trans = get_best_hyp_est_lm(self.lm_final, \
#                                                          tra_tok, \
#                                                          oov_words_set, \
#                                                          oov_candidates, \
#                                                          self.candidate_source)
                    print(ctr)
                    print(best_trans)
                    fres.write(best_trans+'\n')

                ctr += 1

In [4]:
# -------- hyperparameters specific to this method --------
### ug_dict or eng_vocab
candidate_source = "ug_dict"
### True or False, only meaningful when candidate_source == ug_dict
add_aligned_oov = False
### definition of constants and directories
ngram = 4
### 4gram or wordlevel or charlevel
language_model = str(ngram)+"gram"
### if restrict vocabulary (with train_ref, dev_ref, test_hyp, eng_vocab, and ug_dict)
restrict_vocab = True


# -------- write --------
res_file = ".".join([tra_file,\
                     "oovtranslated",\
                     candidate_source,\
                     "lm_"+language_model])
if candidate_source is "ug_dict":
    if add_aligned_oov:
        res_file = ".".join([tra_file,\
                             "oovtranslated",\
                             candidate_source+"_withAlignedOov",\
                             "lm_"+language_model])
    else:
        res_file = ".".join([tra_file,\
                             "oovtranslated",\
                             candidate_source+"_withoutAlignedOov",\
                             "lm_"+language_model])
if restrict_vocab:
    res_file += "_restrict_vocab"
        
# -------- translate --------
#oov_trans_lm(candidate_source, add_aligned_oov, language_model, res_file, restrict_vocab)

In [5]:
for tag in {"", "_restrict_vocab"}:
    lm=kenlm.Model(tmp_dir+"lm_"+language_model+"_final"+tag+".binary")
    print(tmp_dir+"lm_"+language_model+"_final"+tag+".binary")

    prpl_res = tra_file+".nbest."+language_model+tag

    ctr = 0
    ctr_res = 0
    num_pre="0"
    num_cur="0"
    best_prob = -math.inf
    best_sent = ""
    with open(n_best_file) as f, open(prpl_res, 'w') as fw:
        for line in f:
            l = line.split(" ||| ")
            num_cur = l[0]
            sent_str = l[1]
            prob = lm.score(sent_str)

            if num_cur == num_pre:
                if prob > best_prob:
                    best_prob = prob
                    best_sent = sent_str
            else:
                num_pre = num_cur
                ctr_res += 1
                print(best_sent)
                fw.write(best_sent+"\n")

                best_prob = -math.inf
                best_sent = ""
            ctr += 1
        print(best_sent)
        fw.write(best_sent+"\n")
        ctr_res += 1
    print("--------")
    print(str(ctr)+" sentences processed.")
    print(str(ctr_res)+" sentences selected.")

/home/ec2-user/kklab/Projects/lrlp/experiment_elisa.il3-eng.y1r1.v2/oov_trans_lm/lm_4gram_final.binary
a يورۇتۇلما 
plots . 
Interactive manipulating currently selected accessible for Console 
_ يوقىتىش L تاللانغاننى . 

Plugin with various methods of selecting accessibles quickly . 

Script ئۆچۈرۈلىدۇ current . 
_ Schema , M 
waste 




then ئاريۇق فوكۇسلانغان WIDGET 
% s brothers , or by label 
% ( rolename ) s index % ( parent num1 d does not match row and column ( num2 index % d 

. do not turn away from it . 
Japanese known . 
something مادرىدلىقلار . 
indeed they all a ? 
سۆزلەلەمسەن ئىتاليانچە ? 
these are nothing but a few ! &quot; 
ئاڭلىسا it would be happy . 
ئۇسسىدىم ) . 
&quot; thank you . &quot; &quot; you &apos; re welcome . &quot; 
I am I don &apos; t know him . 
تاماشا yesterday . 
ئۇنۇتماڭ بىلەتنى . 
and who recite it ? 
يازامسەن Diary ? 
يازامسىلى Diary ? 
كۈتۈۋاتىسەن ? - 
but سورىشىڭىز . 
but سورىشىڭلار . 
why do you ئىشخانىغا بارمىدىڭىز ? 
you are not ئىشخانىغا بارم

In [10]:
print(s)

yor
