In [1]:
%matplotlib inline
import sys
import os

sys.path.insert(0, '../')
from config import *
from utils import *
from oov_candidates_preprocessing import *
# Shared OOV-candidate preprocessing helper; later cells call its methods
# (get_oov_candidates_from_extracted, get_eng_vocab, init, ...).
ocp = oov_candidates_preprocessing()

# Sanity check on the configured working directory before anything is
# created inside it. Written as a statement (not a call-like `assert(...)`)
# and given a message so a misconfiguration is diagnosable.
assert "dclm" in tmp_dir, 'expected "dclm" in tmp_dir, got: %r' % (tmp_dir,)
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

In [2]:
from data_preprocessing import *

# Stage: build the seed used for data selection.
#   input : dev_ref_file, test_nbest_file
#   output: proc_seed_in_domain (no stopwords, no punctuation)
proc_seed_in_domain = tmp_dir + ".".join(["stemmed_seed", t, "dev_test", yrv])
get_seed_for_data_selection2(dev_ref_file, test_nbest_file, proc_seed_in_domain)
print('--------')
# Stage: build the target (out-of-domain pool) for data selection.
#   input : wiki_dump_train
#   output: doc_non_domain      -> "<wiki_dump_train>.nonproc"
#           proc_doc_non_domain -> "<wiki_dump_train>.proc" (no stopwords, no punctuation)
doc_non_domain, proc_doc_non_domain = (
    wiki_dump_train + suffix for suffix in (".nonproc", ".proc")
)
get_target_for_data_selection2(wiki_dump_train, doc_non_domain, proc_doc_non_domain)
print('--------')
# Stage: select out-of-domain documents with a high Jaccard index w.r.t. the seed.
#   input : proc_seed_in_domain, doc_non_domain, proc_doc_non_domain,
#           selection threshold (num_of_doc_to_select), dclm vocab sources
#   output: selected_non_domain_doc_file          (for char-lm)
#           selected_non_domain_doc_with_unk_file (for dclm)
num_of_doc_to_select = 20000
selected_non_domain_doc_file = tmp_dir + "selected_non_domain_doc_file_" + str(num_of_doc_to_select)  # for char-lm
selected_non_domain_doc_with_unk_file = tmp_dir + "selected_non_domain_doc_with_unk_file_" + str(num_of_doc_to_select)  # for dclm

# This step produces two files, and both are consumed by the merge stage.
# Re-run the selection when EITHER output is missing; checking only the
# "_with_unk" file would leave a missing char-lm file unrecoverable.
if not (os.path.exists(selected_non_domain_doc_file)
        and os.path.exists(selected_non_domain_doc_with_unk_file)):
    oov_candidates_all = ocp.get_oov_candidates_from_extracted(oov_candidates_dir, ocp.get_lexicon_xml_path())
    eng_vocab = ocp.get_eng_vocab(eng_vocab_file)
    select_doc_with_high_jaccard_idx2(
        proc_seed_in_domain,
        doc_non_domain,
        proc_doc_non_domain,
        num_of_doc_to_select,
        train_ref_file, dev_ref_file, unseq_ref_file, test_1best_file, test_oov_file, oov_candidates_all, eng_vocab,
        selected_non_domain_doc_file,
        selected_non_domain_doc_with_unk_file)
print('--------')
# Stage: merge in-domain references with the selected out-of-domain documents.
#   input : train_ref_file, (unseq_ref_file, only if it exists on disk),
#           selected_non_domain_doc_file / selected_non_domain_doc_with_unk_file
#   output: combined_selected_non_domain_and_in_domain_doc /
#           combined_selected_non_domain_with_unk_and_in_domain_doc
combined_selected_non_domain_and_in_domain_doc = tmp_dir + "combined_selected_non_domain_" + str(num_of_doc_to_select) + "_and_in_domain_doc"
combined_selected_non_domain_with_unk_and_in_domain_doc = tmp_dir + "combined_selected_non_domain_with_unk_" + str(num_of_doc_to_select) + "_and_in_domain_doc"

# unseq_ref_file is optional: it is merged in only when the file is present.
in_domain_ref_files = [train_ref_file]
if os.path.exists(unseq_ref_file):
    in_domain_ref_files.append(unseq_ref_file)

# Both combined files are needed downstream (char-lm and dclm training), so
# redo the merge when EITHER is missing rather than checking only one of them.
if not (os.path.exists(combined_selected_non_domain_and_in_domain_doc)
        and os.path.exists(combined_selected_non_domain_with_unk_and_in_domain_doc)):
    merge_files_with_boundary(
        in_domain_ref_files + [selected_non_domain_doc_file],
        combined_selected_non_domain_and_in_domain_doc, True)
    merge_files_with_boundary(
        in_domain_ref_files + [selected_non_domain_doc_with_unk_file],
        combined_selected_non_domain_with_unk_and_in_domain_doc, True)
print('--------')
# Stage: train the character-level LM.
#   input : combined_selected_non_domain_and_in_domain_doc, dev_ref_file,
#           charlm_num_layer, charlm_input_dim, charlm_hidden_dim
#   output: charlm_model_file, charlm_dict_file, charlm_ppl_file, charlm_log_file
charlm_num_layer = 2
charlm_input_dim = 48
charlm_hidden_dim = 48

# All four artefacts share one stem: models/charlm_<t>_<layers>_<input>_<hidden>
charlm_file_stem = tmp_dir + "models/" + "_".join(
    ["charlm", t] + list(map(str, (charlm_num_layer, charlm_input_dim, charlm_hidden_dim))))
charlm_model_file = charlm_file_stem + ".model"
charlm_dict_file = charlm_file_stem + ".dict"
charlm_ppl_file = charlm_file_stem + ".ppl"
charlm_log_file = charlm_file_stem + ".log"

train_charlm(
    combined_selected_non_domain_and_in_domain_doc,
    dev_ref_file,
    charlm_num_layer,
    charlm_input_dim,
    charlm_hidden_dim,
    charlm_model_file,
    charlm_dict_file,
    charlm_ppl_file,
    charlm_log_file)
print('--------')
# Stage: train the document-context LMs, one model per model_type.
#   input : combined_selected_non_domain_with_unk_and_in_domain_doc, dev_ref_file,
#           model_type, dclm_num_layer, dclm_input_dim, dclm_hidden_dim,
#           dclm_align_dim, dclm_len_thresh
#   output: dclm_model_file, dclm_dict_file, dclm_ppl_file, dclm_log_file
dclm_num_layer = 2
dclm_input_dim = 48
dclm_hidden_dim = 48
dclm_align_dim = 48  # only for adclm (however for convenience, this param is attached to every model name)
dclm_len_thresh = 4

# Iterate an ordered tuple, not a set literal: set iteration order over strings
# is arbitrary, which made the training order (and the values the dclm_*_file
# names hold after the loop) differ from run to run.
for model_type in ("rnnlm", "adclm", "ccdclm", "codclm"):
    # Shared stem: models/<type>_<t>_<layers>_<input>_<hidden>_<align>_<len_thresh>
    dclm_file_stem = tmp_dir + "models/" + "_".join(
        [model_type, t] + list(map(str, (
            dclm_num_layer, dclm_input_dim, dclm_hidden_dim, dclm_align_dim, dclm_len_thresh))))
    dclm_model_file = dclm_file_stem + ".model"
    dclm_dict_file = dclm_file_stem + ".dict"
    dclm_ppl_file = dclm_file_stem + ".ppl"
    dclm_log_file = dclm_file_stem + ".log"
    train_dclm(
        combined_selected_non_domain_with_unk_and_in_domain_doc,
        dev_ref_file,
        model_type,
        dclm_num_layer,
        dclm_input_dim,
        dclm_hidden_dim,
        dclm_align_dim,
        dclm_len_thresh,
        dclm_model_file,
        dclm_dict_file,
        dclm_ppl_file,
        dclm_log_file)
print('--------')

test_nbest_text exists at: /home/ec2-user/kklab/Projects/lrlp/experiment_2017.06.05.il3-eng.y1r1.v2/oov_trans_dclm/text_nbest.eng.test.y1r1.v2

seed_for_lm_training_selection exists at: /home/ec2-user/kklab/Projects/lrlp/experiment_2017.06.05.il3-eng.y1r1.v2/oov_trans_dclm/seed.eng.dev_test.y1r1.v2

proc_seed_for_lm_training_selection exists at: /home/ec2-user/kklab/Projects/lrlp/experiment_2017.06.05.il3-eng.y1r1.v2/oov_trans_dclm/stemmed_seed.eng.dev_test.y1r1.v2

--------
out_file exists at: /home/ec2-user/kklab/data/wiki_dump/wikitext-103/wiki.train.tokens.nonproc

out_file_proc exists at: /home/ec2-user/kklab/data/wiki_dump/wikitext-103/wiki.train.tokens.proc

--------
--------
--------
cd /home/ec2-user/kklab/Projects/lrlp/scripts/oov_translate/method_dclm2; export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ec2-user/kklab/src/boost_1_61_0/lib; export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ec2-user/kklab/src/dynet/build/dynet; ./charlm --dynet-gpu-ids 3 --dynet-mem 9192,1024,512 train

In [4]:
from lattice_rescoring import *

# Stage: rescore the OOV lattice with a trained DCLM and write the result as XML.
#
# NOTE(review): this cell reads names assigned by the training cell
# (dclm_num_layer, dclm_input_dim, dclm_hidden_dim, dclm_align_dim,
# dclm_len_thresh, charlm_model_file, charlm_dict_file, charlm_num_layer,
# charlm_input_dim, charlm_hidden_dim), so that cell must be run first.
#
# Every sweep below uses an ordered tuple rather than a set literal so that
# datasets are processed in the same order on every run.

beam_size = 4  # loop-invariant; passed straight through to rescore_lattice

# datasets: dev, test, syscomb, eval
for dataset in ("dev", "test", "syscomb", "eval"):
    # candidate sources: extracted, extracted_aligned, eng_vocab, extracted_eng_vocab
    for candidate_source in ("extracted_eng_vocab",):
        data_in_domain_xml, _, onebest_file, _, _, candidate_list_file = ocp.init(dataset, candidate_source)
        # model types: adclm, ccdclm, codclm, rnnlm (only adclm is enabled)
        for model_type in ("adclm",):
            # Must match the file names produced when the model was trained.
            dclm_file_stem = tmp_dir + "models/" + "_".join(
                [model_type, t] + list(map(str, (
                    dclm_num_layer, dclm_input_dim, dclm_hidden_dim, dclm_align_dim, dclm_len_thresh))))
            dclm_model_file = dclm_file_stem + ".model"
            dclm_dict_file = dclm_file_stem + ".dict"
            # decoder types: beam (beam search), context (comparing contexts),
            # embed (add historical embeddings)
            for decoder_type in ("context",):
                # include_charlm: True, False
                for include_charlm in (False,):
                    # rescore lattice
                    # input: ...
                    # output: res_file
                    res_attr = "_".join([candidate_source, model_type, decoder_type, str(include_charlm)])
                    res_file = exp_dir + "translation/" + dataset + "/" + ".".join([res_attr, t, dataset, yrv])
                    print('--------')
                    rescore_lattice(
                        onebest_file,
                        candidate_list_file,
                        model_type,
                        dclm_num_layer,
                        dclm_input_dim,
                        dclm_hidden_dim,
                        dclm_align_dim,
                        dclm_model_file,
                        dclm_dict_file,
                        decoder_type,
                        beam_size,
                        include_charlm,
                        charlm_model_file,
                        charlm_dict_file,
                        charlm_num_layer,
                        charlm_input_dim,
                        charlm_hidden_dim,
                        res_file)
                    print('--------')
                    if os.path.exists(res_file):
                        # NOTE(review): the XML name carries only candidate_source and
                        # dataset, not model_type / decoder_type / include_charlm, so
                        # enabling more than one of those would overwrite this file.
                        res_file_xml = data_in_domain_dir + ".".join([exp_handle + "-uw-oov", candidate_source, st, dataset, yrv, "xml"])
                        write_translation_to_xml(data_in_domain_xml, res_file, res_file_xml)
                        print('--------\n')
                    else:
                        print("res_file is not ready!")
                        print('--------\n')

building oov candidate dictionary...
oov candidate dictionary is built!
2 oov words have hypotheses.
2 unique oov words have hypotheses
candidate_list_file created at: /home/ec2-user/kklab/Projects/lrlp/experiment_2017.06.05.il3-eng.y1r1.v2/translation/test/oov/extracted.eng.test.y1r1.v2
--------
cd /home/ec2-user/kklab/Projects/lrlp/scripts/oov_translate/method_dclm2; export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ec2-user/kklab/src/boost_1_61_0/lib; export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/ec2-user/kklab/src/dynet/build/dynet; ./latticedec1 --dynet-mem 9192 2 48 48 48 4 /home/ec2-user/kklab/Projects/lrlp/experiment_2017.06.05.il3-eng.y1r1.v2/oov_trans_dclm/models/adclm_eng_2_48_48_48_4.model /home/ec2-user/kklab/Projects/lrlp/experiment_2017.06.05.il3-eng.y1r1.v2/oov_trans_dclm/models/adclm_eng_2_48_48_48_4.dict /home/ec2-user/kklab/Projects/lrlp/experiment_2017.06.05.il3-eng.y1r1.v2/translation/test/onebest.eng.test.y1r1.v2 /home/ec2-user/kklab/Projects/lrlp/experiment_2017.06.0