In [1]:
import string
import re
import os
import sys
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

from utils import *
from models import *
import yaml
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [89]:
class Options:
    def __init__(self):
        self.model_name = 'alp'

options = Options()
options.output_file = 'test_output/reapos_test_text.txt'
options.params_file = 'params-v1.yaml'
options.use_cuda = False
options.use_validation = True
options.gpu2cpu = True

In [90]:
#LOAD CONFIGURATIONS AND LANGUAGES
USE_CUDA = options.use_cuda
print("Use cuda: %s" %USE_CUDA)

try:
    with open(options.params_file, 'r') as ymlfile:
        config = yaml.load(ymlfile)
except:
    sys.exit("Parameters file missing")

#Setup languages
INPUT_LANG_CODE = config['INPUT_LANG']
OUTPUT_LANG_CODE = config['OUTPUT_LANG']

INPUT_LANG_PUNC_LEVEL = config["INPUT_LANG_PUNC_LEVEL"]
OUTPUT_LANG_PUNC_LEVEL = config["OUTPUT_LANG_PUNC_LEVEL"]

if INPUT_LANG_CODE == 'en' and OUTPUT_LANG_CODE == 'es':
    lang_en = input_lang = Lang(INPUT_LANG_CODE, config["W2V_EN_PATH"], config["DICT_EN_PATH"], punctuation_level=INPUT_LANG_PUNC_LEVEL)
    lang_es = output_lang = Lang(OUTPUT_LANG_CODE, config["W2V_ES_PATH"], config["DICT_ES_PATH"], punctuation_level=OUTPUT_LANG_PUNC_LEVEL)
elif INPUT_LANG_CODE == 'es' and OUTPUT_LANG_CODE == 'en':
    lang_es = input_lang = Lang(INPUT_LANG_CODE, config["W2V_ES_PATH"], config["DICT_ES_PATH"], punctuation_level=INPUT_LANG_PUNC_LEVEL)
    lang_en = output_lang = Lang(OUTPUT_LANG_CODE, config["W2V_EN_PATH"], config["DICT_EN_PATH"], punctuation_level=OUTPUT_LANG_PUNC_LEVEL)

input_prosody_params = config['INPUT_PROSODY']
if input_prosody_params == None:
    input_prosody_params = []
output_prosody_params = config['OUTPUT_PROSODY']
if output_prosody_params == None:
    output_prosody_params = []    
    
#NETWORK CONFIG
max_seq_length = int(config['MAX_SEQ_LENGTH'])
n_prosody_params = int(config['N_PROSODY_PARAMS'])
input_prosody_params = config['INPUT_PROSODY']
encoder_type = config['ENCODER_TYPE']
attn_model = config['ATTN_MODEL']
hidden_size = int(config['HIDDEN_SIZE'])
n_layers = int(config['N_LAYERS'])

2018-09-19 11:35:59,695 : INFO : loading Word2Vec object from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_en_heroes.model


Use cuda: False


2018-09-19 11:36:02,160 : INFO : loading wv recursively from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_en_heroes.model.wv.* with mmap=None
2018-09-19 11:36:02,161 : INFO : loading vectors from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_en_heroes.model.wv.vectors.npy with mmap=None
2018-09-19 11:36:02,354 : INFO : setting ignored attribute vectors_norm to None
2018-09-19 11:36:02,357 : INFO : loading vocabulary recursively from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_en_heroes.model.vocabulary.* with mmap=None
2018-09-19 11:36:02,362 : INFO : loading trainables recursively from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_en_heroes.model.trainables.* with mmap=None
2018-09-19 11:36:02,369 : INFO : loading syn1neg from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_en_heroes.model.trainables.syn1neg.npy with mmap=None
2018-09-19 11:36:02

en Vocabulary size: 30000


2018-09-19 11:36:04,601 : INFO : loading wv recursively from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_es_heroes.model.wv.* with mmap=None
2018-09-19 11:36:04,602 : INFO : loading vectors from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_es_heroes.model.wv.vectors.npy with mmap=None
2018-09-19 11:36:04,902 : INFO : setting ignored attribute vectors_norm to None
2018-09-19 11:36:04,907 : INFO : loading vocabulary recursively from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_es_heroes.model.vocabulary.* with mmap=None
2018-09-19 11:36:04,914 : INFO : loading trainables recursively from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_es_heroes.model.trainables.* with mmap=None
2018-09-19 11:36:04,917 : INFO : loading syn1neg from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_es_heroes.model.trainables.syn1neg.npy with mmap=None
2018-09-19 11:36:05

es Vocabulary size: 30000


In [91]:
#LOAD DATASETS PATHS   
AUDIO_TEST_DATA_PATH = config["AUDIO_TEST_DATA_FILE"]
AUDIO_TRAIN_DATA_PATH = config["AUDIO_TRAIN_DATA_FILE"]
AUDIO_VALIDATION_DATA_PATH = config["AUDIO_VALIDATION_DATA_FILE"]
TEXT_TEST_DATA_PATH = config['TEXT_TEST_DATA_PATH']
AUDIO_ALL_DATA_PATH = "/Users/alp/Movies/heroes/transProse_data/audiodata-v1/transProse_audiodata.txt"

In [55]:
#Initialize text models
text_encoder_path = 'models/5mm_unpuncdinput_encoder.model'
text_decoder_path = 'models/5mm_unpuncdinput_decoder.model'

print('Input punc lvl: ', INPUT_LANG_PUNC_LEVEL)
print('Output punc lvl: ', OUTPUT_LANG_PUNC_LEVEL)

text_encoder = GenericEncoder(input_lang.vocabulary_size, hidden_size, input_lang.get_weights_matrix(), n_layers)
text_decoder = LuongAttnDecoderRNN(attn_model, hidden_size, output_lang.get_weights_matrix(), output_lang.vocabulary_size, n_layers, input_feed=config['DECODER_INPUT_FEED'])
load_model(text_encoder, text_decoder, text_encoder_path, text_decoder_path, options.gpu2cpu)

Input punc lvl:  0
Output punc lvl:  2
gpu2cpu: True


In [82]:
#Initialize prosodic models
AUDIO_ENCODE_ONLY = False
model_name = "audio_continousin"
prosodic_encoder_path = 'models/' + model_name + '_encoder.model'
prosodic_decoder_path = 'models/' + model_name + '_decoder.model'

if encoder_type == 'sum':
    prosodic_encoder = EncoderRNN_sum_ver(input_lang.vocabulary_size, n_prosody_params, hidden_size, input_lang.get_weights_matrix(), n_layers)
elif encoder_type == 'parallel':
    prosodic_encoder = EncoderRNN_parallel(input_lang.vocabulary_size, n_prosody_params, hidden_size, input_lang.get_weights_matrix(), n_layers)
else:
    sys.exit("Unrecognized encoder type. Check params file. Exiting...")
if AUDIO_ENCODE_ONLY:
    prosodic_decoder = LuongAttnDecoderRNN(attn_model, hidden_size, output_lang.get_weights_matrix(), output_lang.vocabulary_size, n_layers, USE_CUDA=USE_CUDA)
else:
    prosodic_decoder = ProsodicDecoderRNN(attn_model, hidden_size, output_lang.get_weights_matrix(), output_lang.vocabulary_size, n_layers, USE_CUDA=USE_CUDA)
    
load_model(prosodic_encoder, prosodic_decoder, prosodic_encoder_path, prosodic_decoder_path, gpu_to_cpu=options.gpu2cpu)

gpu2cpu: True


In [15]:
#DATA GENERATORS
#generates data from tab separated file
def text_data_generator(data_path, input_lang, output_lang, stop_at=-1):
    count = 0
    with open(data_path,'r') as inputfile:
        for line in inputfile:
            if not stop_at == -1 and count >= stop_at:
                break
            pair = [sentence.strip() for sentence in line.split('\t')]
            if input_lang.lang_code == 'en':
                in_sentence = pair[0]
                out_sentence = pair[1]
            elif input_lang.lang_code == 'es':
                in_sentence = pair[1]
                out_sentence = pair[0]

            in_sentence_tokens = in_sentence.lower().split()
            out_sentence_tokens = out_sentence.lower().split()

            if input_lang.punctuation_level == 0:
                in_sentence_tokens = remove_punc_tokens(in_sentence_tokens)
            elif input_lang.punctuation_level == 1:
                in_sentence_tokens = remove_punc_tokens(in_sentence_tokens, keep_main_puncs=True)
            if output_lang.punctuation_level == 0:
                out_sentence_tokens = remove_punc_tokens(out_sentence_tokens)
            elif output_lang.punctuation_level == 1:
                out_sentence_tokens = remove_punc_tokens(out_sentence_tokens, keep_main_puncs=True)

            count += 1

            yield in_sentence_tokens, out_sentence_tokens

def audio_data_generator(data_path, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params, dummyfy_input_prosody=False, dummyfy_output_prosody=False, stop_at = -1):
    assert not input_lang == output_lang
    audio_data = read_audio_dataset_file(data_path, shuffle=False)

    #start generating samples from the proscript links in the data file
    count = 0
    for segment_data in audio_data:
        if not stop_at == -1 and count >= stop_at:
            break

        es_txt = segment_data[0]
        es_csv = segment_data[1]
        en_txt = segment_data[2]
        en_csv = segment_data[3]
        
        #print(en_csv)
        #print(es_csv)

        if input_lang.lang_code == 'en' and output_lang.lang_code == 'es':
            input_proscript = en_csv
            output_proscript = es_csv
            #input_transcript = read_text_file(en_txt)
            #output_transcript = read_text_file(es_txt)
        elif input_lang.lang_code == 'es' and output_lang.lang_code == 'en':
            input_proscript = es_csv
            output_proscript = en_csv
            #input_transcript = read_text_file(es_txt)
            #output_transcript = read_text_file(en_txt)
            
        if input_lang.punctuation_level == 0:
            input_punc = False
            input_only_main_punc = False
        elif input_lang.punctuation_level == 1:
            input_punc = True
            input_only_main_punc = True
        elif input_lang.punctuation_level == 2:
            input_punc = True
            input_only_main_punc = False
            
        if output_lang.punctuation_level == 0:
            output_punc = False
            output_only_main_punc = False
        elif output_lang.punctuation_level == 1:
            output_punc = True
            output_only_main_punc = True
        elif output_lang.punctuation_level == 2:
            output_punc = True
            output_only_main_punc = False

        in_sentence_tokens, in_prosody_tokens = read_data_from_proscript(input_proscript, input_lang, n_prosody_params, input_prosody_params, punctuation_as_tokens = input_punc, keep_only_main_puncs = input_only_main_punc)
        out_sentence_tokens, out_prosody_tokens = read_data_from_proscript(output_proscript, output_lang, n_prosody_params, output_prosody_params, punctuation_as_tokens = output_punc, keep_only_main_puncs = output_only_main_punc)
    
        if dummyfy_input_prosody:
            in_prosody_tokens = np.zeros_like(in_prosody_tokens)
        if dummyfy_output_prosody:
            out_prosody_tokens = np.zeros_like(out_prosody_tokens)
            
        count += 1
        yield in_sentence_tokens, in_prosody_tokens, out_sentence_tokens, out_prosody_tokens, en_csv, es_csv

In [76]:
#various utilities
def print_prosody(prosody):
    print(np.array(prosody).transpose())
    
def print_tokens_with_pause(tokens, pausevals = [], pauseflags=[]):
    if pauseflags == [] and not pausevals == []:
        pauseflags = flags_from_value(pausevals)
        
    to_print = ""
    for i, token in enumerate(tokens):
        if not token == "END":
            to_print += token + " "
            if not pauseflags == [] and pauseflags[i]:
                to_print += "[P]"
                if not pausevals == []:
                    to_print += "<%0.3f>"%pausevals[i]
                to_print += " "
            
    #print(to_print)
    return to_print
            
def flags_from_value(prosody_seq):
    return [1 if not feature_value == 0.0 else 0 for feature_value in prosody_seq]

def show_attention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') + ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    plt.close()

In [None]:
#SEE TEXT DATA
for in_sent, out_sent in text_data_generator(TEXT_TEST_DATA_PATH, input_lang, output_lang):
    print(in_sent)
    print(out_sent)
    exit = input('...')
    if exit == 'q':
        break

In [137]:
#SEE AUDIO DATA
in_pros = []
for in_sent, in_pros, out_sent, out_pros, in_csv, out_csv in audio_data_generator(AUDIO_TEST_DATA_PATH, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params, dummyfy_input_prosody=False, dummyfy_output_prosody=False):
    print(in_sent)
    print(indexes_from_tokens(input_lang, in_sent))
    in_pros = finalize_prosody_sequence(in_pros)
    print_prosody(in_pros)
    print(out_sent)
    print_prosody(out_pros)
    exit = input('...')
    if exit == 'q':
        break

['i', 'could', 'move', 'objects', 'with', 'my', 'mind', '.', 'i', 'could', '...']
[3, 102, 317, 5067, 35, 25, 277, 0, 3, 102, 18, 29998]
[[0.   0.   0.   0.   0.   0.   0.13 0.   0.   0.   0.   0.  ]]
['podía', 'mover', 'objetos', 'con', 'mi', 'mente', ',', 'podía', '...']
[[0.   0.   0.   0.   0.   0.15 0.   0.   0.  ]]
...
['what', 'makes', 'you', 'so', 'sure', 'it', "'s", 'a', 'boy', '?']
[23, 411, 2, 45, 154, 10, 9, 8, 279, 5, 29998]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
['¿', 'y', 'por', 'qué', 'crees', 'que', 'es', 'un', 'chico', '?']
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
...Q
['you', 'want', 'payback', ',', 'i', 'understand', 'that', ',', 'but', 'not', 'all', 'of', 'these', 'individuals', 'out', 'there', 'are', 'dangerous', '.', 'some', 'of', 'them', 'are', 'just', 'plain', 'scared', '.', 'and', 'now', 'that', 'they', 'know', 'they', "'re", 'being', 'pursued', ',', 'they', "'re", 'going', 'to', 'be', 'even', 'that', 'much', 'harder', 'to', 'catch', '.']
[2, 68, 8036, 1, 3, 276, 12, 1

In [9]:
#EVALUATORS
#text -> text
def evaluate_text(input_seq_tokens, input_lang, output_lang, encoder, decoder, max_length, USE_CUDA=False):
    input_word_seqs = [indexes_from_tokens(input_lang, input_seq_tokens)]
    
    #make sure sequences are below max_length. 
    input_word_seqs = limit_seqs_to_max(input_word_seqs, max_length)

    input_lengths = [len(input_word_seqs[0])]
    input_word_batch = Variable(torch.LongTensor(input_word_seqs)).transpose(0, 1)

    if USE_CUDA:
        input_word_batch = input_batch.cuda()
        input_prosody_batch = input_prosody_batch.cuda()

    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)

    # Run through encoder
    encoder_outputs, encoder_hidden = encoder(input_word_batch, input_lengths, None)

    # Create starting vectors for decoder
    decoder_input = Variable(torch.LongTensor([output_lang.token2index(SWT_TOKEN)])) # SOS
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))

    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        decoder_context = decoder_context.cuda()

    # Store output words and attention states
    decoded_words = []
    decoder_attentions = torch.zeros(max_length + 1, max_length + 1)

    # Run through decoder
    for di in range(max_length):
        decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder(
                decoder_input, decoder_context, decoder_hidden, encoder_outputs)
        
        decoder_attentions[di,:decoder_attn.size(2)] += decoder_attn.squeeze(0).squeeze(0).cpu().data

        # Choose top word from output
        topv, topi = decoder_output.data.topk(1)
        #ni = topi[0][0]  #old code
        ni = topi.item()
        if ni == output_lang.token2index(EOS_TOKEN):
            decoded_words.append(EOS_TOKEN)
            break
        else:
            decoded_words.append(output_lang.index2token(ni))

        # Next input is chosen word
        decoder_input = Variable(torch.LongTensor([ni]))
        if USE_CUDA: decoder_input = decoder_input.cuda()

    # Set back to training mode
    encoder.train(True)
    decoder.train(True)

    return decoded_words, decoder_attentions[:di+1, :len(encoder_outputs)]

#text+audio -> text+audio
#use prosodic encoder/decoder
def evaluate_audio(input_seq_tokens, input_prosody_tokens, input_lang, output_lang, encoder, decoder, max_length, audio_encode_only, USE_CUDA=False):
    input_word_seqs = [indexes_from_tokens(input_lang, input_seq_tokens)]
    input_prosody_seqs = [finalize_prosody_sequence(input_prosody_tokens)] #put the end token
    
    #make sure sequences are below max_length. 
    input_word_seqs = limit_seqs_to_max(input_word_seqs, max_length)
    input_prosody_seqs = limit_seqs_to_max(input_prosody_seqs, max_length)

    input_lengths = [len(input_word_seqs[0])]
    input_word_batch = Variable(torch.LongTensor(input_word_seqs)).transpose(0, 1)
    input_prosody_batch = Variable(torch.FloatTensor(input_prosody_seqs)).transpose(0, 1)
    
    if USE_CUDA:
        input_word_batch = input_batch.cuda()
        input_prosody_batch = input_prosody_batch.cuda()

    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)

    # Run through encoder
    encoder_outputs, encoder_hidden = encoder(input_word_batch, input_prosody_batch, input_lengths, None)

    # Create starting vectors for decoder
    decoder_word_input = Variable(torch.LongTensor([output_lang.token2index(SWT_TOKEN)])) # SOS
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))

    if USE_CUDA:
        decoder_word_input = decoder_word_input.cuda()
        decoder_context = decoder_context.cuda()

    # Store output words and attention states
    decoded_word_seq = []
    decoded_pauseflag_seq = []
    decoded_pausevalue_seq = []
    decoder_attentions = torch.zeros(max_length + 1, max_length + 1)

    # Run through decoder
    decoder_stop = False
    for di in range(max_length):
        if audio_encode_only:
            decoder_output_word, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_word_input, decoder_context, decoder_hidden, encoder_outputs)
        else:
#             decoder_output_word, decoder_output_pauseflag, decoder_output_pausevalue, decoder_context, decoder_hidden, decoder_attention = decoder(
#                 decoder_word_input, decoder_context, decoder_hidden, encoder_outputs
#             )  #FLAGTEST
            
            decoder_output_word, decoder_output_pauseflag, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_word_input, decoder_context, decoder_hidden, encoder_outputs
            )
        
        decoder_attentions[di,:decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data
            
        # Choose top word from output
        topv, topi = decoder_output_word.data.topk(1)
        ni_word = topi.item()
        if ni_word == output_lang.token2index(EOS_TOKEN):
            decoded_word_seq.append(EOS_TOKEN)
            decoder_stop = True
        else:
            decoded_word_seq.append(output_lang.index2token(ni_word))
        
        if not audio_encode_only:
            # Look at pauseflag output
            topv, topi = decoder_output_pauseflag.data.topk(1)
            ni_pauseflag = topi.item()
            decoded_pauseflag_seq.append(ni_pauseflag)

            # Look at pauseval output
            #predicted_pausevalue = decoder_output_pausevalue.item()
            #decoded_pausevalue_seq.append(unnormalize_value(predicted_pausevalue, 0.0, 10.0))
            #decoded_pausevalue_seq.append(predicted_pausevalue)

        # Next input is chosen word
        if decoder_stop:
            break
        else:
            decoder_word_input = Variable(torch.LongTensor([ni_word]))
            if USE_CUDA: decoder_word_input = decoder_word_input.cuda()

    # Set back to training mode
    encoder.train(True)
    decoder.train(True)

    #return decoded_word_seq, decoded_pauseflag_seq, decoded_pausevalue_seq, decoder_attentions[:di+1, :len(encoder_outputs)]  #FLAGTEST
    return decoded_word_seq, decoded_pauseflag_seq, decoder_attentions[:di+1, :len(encoder_outputs)]  #FLAGTEST

In [None]:
#Text -> text evaluator test
input_sentence = "he 's flying through that glass ?" 
input_seq_tokens = input_sentence.split()
decoded_words, attentions = evaluate_text(input_seq_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
print(readable_from_tokens(decoded_words[:-1]))
show_attention(input_sentence, decoded_words, attentions)
#print(attentions)

In [None]:
#show_attention(readable_from_tokens(in_sentence_tokens), translation_tokens, attentions)
print(attentions.shape)
print(attentions[0])
print(np.argmax(attentions[0]).item())

In [83]:
gen = audio_data_generator(AUDIO_TEST_DATA_PATH, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params)

In [84]:
#Text+prosody -> text+prosody on audio data visualization
print_every = 0
print_count = 0
print_strings_for_prosody_evaluation = []
for in_sentence_tokens, in_prosody_tokens, out_sentence_tokens, out_prosody_tokens, in_csv, out_csv in gen:
    #print(in_csv)
    #print(out_csv)
    
    in_string = print_tokens_with_pause(in_sentence_tokens, in_prosody_tokens[:,0])
    gt_string = print_tokens_with_pause(out_sentence_tokens, out_prosody_tokens[:,0])
    
    #translation_tokens, pauseflag_tokens, pausevalue_tokens, _ = evaluate_audio(in_sentence_tokens, in_prosody_tokens, input_lang, output_lang, prosodic_encoder, prosodic_decoder, max_seq_length, audio_encode_only=AUDIO_ENCODE_ONLY)
    translation_tokens, pauseflag_tokens, attentions = evaluate_audio(in_sentence_tokens, in_prosody_tokens, input_lang, output_lang, prosodic_encoder, prosodic_decoder, max_seq_length, audio_encode_only=AUDIO_ENCODE_ONLY)
    prosody_translation_string = print_tokens_with_pause(translation_tokens, pauseflags=pauseflag_tokens)
    
    translation_tokens, _ = evaluate_text(in_sentence_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
    text_translation_string = print_tokens_with_pause(translation_tokens)
    
    print_string = '%s\t%s\t%s'%(in_string,gt_string,prosody_translation_string)
    print_strings_for_prosody_evaluation.append(print_string)
    
    print_count += 1
    if print_every > 0 and  print_count % print_every == 0:
        print("IN")
        print(in_string)
        print("GT")
        print(gt_string)
        print("OUT PROSODY")
        print(prosody_translation_string)
        print("OUT TEXT")
        print(text_translation_string)
        
        inp = input("...")
        if inp == 'q':
            break
        print("======================================================================")
    
with open('prosody_eval.txt', 'w') as f:
    for eval_str in print_strings_for_prosody_evaluation:
        f.write(eval_str+'\n')



In [None]:
#Text -> text on text data
def text_set_translation_generator(stop_at = -1, report=False):
    for in_sentence_tokens, out_sentence_tokens in text_data_generator(TEXT_TEST_DATA_PATH, input_lang, output_lang, stop_at):
        translation_tokens, _ = evaluate_text(in_sentence_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
        if report:
            print("> %s"%(readable_from_tokens(in_sentence_tokens)))
            print("= %s"%(readable_from_tokens(out_sentence_tokens)))
            print("< %s"%readable_from_tokens(translation_tokens[:-1]))
            print("---")
        
        yield [out_sentence_tokens], translation_tokens[:-1]
        
testing_set_bleu, sentence_count = compute_bleu(text_set_translation_generator(stop_at=100, report=True), max_order=4, smooth=False)

print("Evaluated %i samples."%sentence_count)
print("BLEU: ", testing_set_bleu)

In [None]:
#Text -> text on audio data
gold_sentence_tokens = []
predicted_sentence_tokens = []
def audio_set_text_translation_generator(evaluation_set, stop_at = -1, report=False, model='text'):
    for in_sentence_tokens, in_prosody_tokens , out_sentence_tokens, out_prosody_tokens, in_csv, out_csv in audio_data_generator(evaluation_set, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params, dummyfy_input_prosody=True, dummyfy_output_prosody=True, stop_at=stop_at):
        if model == 'text':
            translation_tokens, _ = evaluate_text(in_sentence_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
        elif model == 'audio':
            translation_tokens, _, _ = evaluate_audio(in_sentence_tokens, in_prosody_tokens, input_lang, output_lang, prosodic_encoder, prosodic_decoder, max_seq_length, AUDIO_ENCODE_ONLY)
        if report:
            print("> %s"%(readable_from_tokens(in_sentence_tokens)))
            print("= %s"%(readable_from_tokens(out_sentence_tokens)))
            print("< %s"%readable_from_tokens(translation_tokens[:-1]))
            print("---")
            
        gold_sentence_tokens.append(out_sentence_tokens)
        predicted_sentence_tokens.append(translation_tokens[:-1])
        yield [out_sentence_tokens], translation_tokens[:-1]
             
evaluation_set = AUDIO_TEST_DATA_PATH
testing_set_bleu, sentence_count = compute_bleu(
    audio_set_text_translation_generator(evaluation_set, report=False, stop_at=-1, model='text'), 
    max_order=4, smooth=False)

print("Evaluated %i samples."%sentence_count)
print("BLEU: ", testing_set_bleu)

In [None]:
#BLEU calculation on OpenNMT results
predictions_file = "/Users/alp/phdCloud/playground/OpenNMT-py/heroes_test_v2-pred.txt"
with open(predictions_file) as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
openNMT_predictions = [x.strip() for x in content]

def openNMT_translation_generator():
    for gold, pred in zip(gold_sentence_tokens, openNMT_predictions):
        print([gold])
        print(pred.split(" "))
        yield [gold], pred.split(" ")
        
openNMT_bleu, sentence_count = compute_bleu(openNMT_translation_generator(), max_order=4, smooth=False)

print("Evaluated %i samples."%sentence_count)
print("BLEU: ", openNMT_bleu)

In [102]:
#Create text translation data from compiled heroes data
output_file = "/Users/alp/phdCloud/playground/punkHeroes/out_heroes_v1/es_goldpuncd.txt"

input_lang.punctuation_level = 2
output_lang.punctuation_level = 2

with open(output_file, 'w') as f:
    for in_sentence_tokens, _ , out_sentence_tokens, _, _, _ in audio_data_generator(AUDIO_TEST_DATA_PATH, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params):
        #to write tab separated en-es
        #f.write("%s\t%s\n"%(readable_from_tokens(in_sentence_tokens), readable_from_tokens(out_sentence_tokens)))
        #to write only en
        #f.write("%s\n"%(readable_from_tokens(in_sentence_tokens)))
        #to write only es
        f.write("%s\n"%(readable_from_tokens(out_sentence_tokens)))

In [144]:
#BLEU calculation from already translated text files
gold_file = "/Users/alp/phdCloud/playground/punkHeroes/out_heroes_v1/es_goldpuncd.txt"
predictions_file = "/Users/alp/phdCloud/playground/punkHeroes/out_heroes_v1/translated/en_punkProsed_ted_w_transProsed_5mmheroes_unpuncdinput_v1testset.txt"

def textfile_translation_generator(gold_file, predictions_file):
    gold_sentences = []
    predicted_sentences = []
    with open(gold_file, 'r') as f:
        for line in f:
            gold_sentences.append(line.strip().split(" "))
    with open(predictions_file, 'r') as f:
        for line in f:
            predicted_sentences.append(line.strip().split(" "))
    for gold, pred in zip(gold_sentences, predicted_sentences):       
        yield [gold], pred
        
bleu, sentence_count = compute_bleu(textfile_translation_generator(gold_file, predictions_file), max_order=4, smooth=False)

print("Evaluated %i samples."%sentence_count)
print("BLEU: ", bleu)

Evaluated 542 samples.
BLEU:  0.16735539889325204


In [138]:
#Initialize text models COPY
text_encoder_path = 'models/5mmheroes_puncdinput_encoder.model'
text_decoder_path = 'models/5mmheroes_puncdinput_decoder.model'

text_encoder = GenericEncoder(input_lang.vocabulary_size, hidden_size, input_lang.get_weights_matrix(), n_layers)
text_decoder = LuongAttnDecoderRNN(attn_model, hidden_size, output_lang.get_weights_matrix(), output_lang.vocabulary_size, n_layers, input_feed=config['DECODER_INPUT_FEED'])
load_model(text_encoder, text_decoder, text_encoder_path, text_decoder_path, options.gpu2cpu)

gpu2cpu: True


In [143]:
#Transprosing text files
input_file = "/Users/alp/phdCloud/playground/punkHeroes/out_heroes_v1/en_punkProsed_ted_w.txt"
gold_file = "/Users/alp/phdCloud/playground/punkHeroes/out_heroes_v1/es_goldpuncd.txt"
output_file = "/Users/alp/phdCloud/playground/punkHeroes/out_heroes_v1/translated/en_punkProsed_ted_w_transProsed_5mmheroes_unpuncdinput_v1testset.txt"

stop_at = -1
report = False

gold_sentences = []
predicted_sentences = []
input_sentences = []

#read files
with open(gold_file, 'r') as f:
    for line in f:
        gold_sentences.append(line.strip().split(" "))
with open(input_file, 'r') as f:
    for line in f:
        input_sentences.append(line.strip().split(" "))
#translate
count = 0
for in_sentence_tokens, gold_tokens in zip(input_sentences, gold_sentences):
    translation_tokens, _ = evaluate_text(in_sentence_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
    predicted_sentences.append(translation_tokens)
    if report:
        print("> %s"%(readable_from_tokens(in_sentence_tokens)))
        print("= %s"%(readable_from_tokens(gold_tokens)))
        print("< %s"%readable_from_tokens(translation_tokens[:-1]))
        print("---")
    #yield [gold_tokens], translation_tokens

    count += 1
    if count == stop_at:
        break
    if count % 100 == 0:
        print(count)
            
#store translations in a text file
with open(output_file, 'w') as f:
    for token_index, script_tokens in enumerate(predicted_sentences):
        f.write("%s\n" % ' '.join(script_tokens[:-1]))

100
200


Unknown (en): annapura


300
400


Unknown (en): deﬁne
Unknown (en): andos


500
