In [1]:
import string
import re
import os
import sys
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

from utils import *
from models import *
import yaml
import numpy as np

In [2]:
class Options:
    """Plain attribute container for the run settings used by this notebook."""

    def __init__(self):
        self.model_name = 'alp'


# Single settings object for the whole notebook; the remaining settings are
# attached to the instance in one step.
options = Options()
vars(options).update(
    output_file='test_output/reapos_test_text.txt',
    params_file='params.yaml',   # YAML file the configuration is read from
    use_cuda=False,
    use_validation=True,
    gpu2cpu=True,                # forwarded to load_model when checkpoints are loaded
)

In [3]:
#LOAD CONFIGURATIONS AND LANGUAGES
USE_CUDA = options.use_cuda
print("Use cuda: %s" %USE_CUDA)

# yaml.safe_load replaces yaml.load: the params file only needs plain YAML
# values, and yaml.load without an explicit Loader is unsafe on untrusted input
# and rejected by recent PyYAML releases.
# Only the errors that are actually expected are caught, so an unrelated failure
# is no longer reported as a missing file.
try:
    with open(options.params_file, 'r') as ymlfile:
        config = yaml.safe_load(ymlfile)
except OSError:
    sys.exit("Parameters file missing")
except yaml.YAMLError as err:
    sys.exit("Could not parse parameters file: %s" % err)

#Setup languages
INPUT_LANG_CODE = config['INPUT_LANG']
OUTPUT_LANG_CODE = config['OUTPUT_LANG']

if INPUT_LANG_CODE == 'en' and OUTPUT_LANG_CODE == 'es':
    lang_en = input_lang = Lang(INPUT_LANG_CODE, config["W2V_EN_PATH"], config["DICT_EN_PATH"], omit_punctuation=config["INPUT_LANG_OMIT_PUNC"])
    lang_es = output_lang = Lang(OUTPUT_LANG_CODE, config["W2V_ES_PATH"], config["DICT_ES_PATH"], omit_punctuation=config["OUTPUT_LANG_OMIT_PUNC"])
elif INPUT_LANG_CODE == 'es' and OUTPUT_LANG_CODE == 'en':
    lang_es = input_lang = Lang(INPUT_LANG_CODE, config["W2V_ES_PATH"], config["DICT_ES_PATH"], omit_punctuation=config["INPUT_LANG_OMIT_PUNC"])
    lang_en = output_lang = Lang(OUTPUT_LANG_CODE, config["W2V_EN_PATH"], config["DICT_EN_PATH"], omit_punctuation=config["OUTPUT_LANG_OMIT_PUNC"])
else:
    # Without this branch input_lang/output_lang would stay undefined and the
    # notebook would only fail later with a NameError.
    sys.exit("Unsupported language pair in params file: %s -> %s" % (INPUT_LANG_CODE, OUTPUT_LANG_CODE))

# A null prosody entry in the params file is normalised to an empty list.
input_prosody_params = config['INPUT_PROSODY']
if input_prosody_params is None:
    input_prosody_params = []
output_prosody_params = config['OUTPUT_PROSODY']
if output_prosody_params is None:
    output_prosody_params = []

#NETWORK CONFIG
# input_prosody_params is intentionally not read from the config again here:
# doing so would discard the None -> [] normalisation above.
max_seq_length = int(config['MAX_SEQ_LENGTH'])
n_prosody_params = int(config['N_PROSODY_PARAMS'])
encoder_type = config['ENCODER_TYPE']
attn_model = config['ATTN_MODEL']
hidden_size = int(config['HIDDEN_SIZE'])
n_layers = int(config['N_LAYERS'])

2018-07-18 12:00:37,725 : INFO : loading Word2Vec object from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_en_heroes.model


Use cuda: False


2018-07-18 12:00:38,639 : INFO : loading wv recursively from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_en_heroes.model.wv.* with mmap=None
2018-07-18 12:00:38,643 : INFO : loading vectors from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_en_heroes.model.wv.vectors.npy with mmap=None
2018-07-18 12:00:38,766 : INFO : setting ignored attribute vectors_norm to None
2018-07-18 12:00:38,771 : INFO : loading vocabulary recursively from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_en_heroes.model.vocabulary.* with mmap=None
2018-07-18 12:00:38,776 : INFO : loading trainables recursively from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_en_heroes.model.trainables.* with mmap=None
2018-07-18 12:00:38,780 : INFO : loading syn1neg from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_en_heroes.model.trainables.syn1neg.npy with mmap=None
2018-07-18 12:00:38

en Vocabulary size: 30000


2018-07-18 12:00:40,811 : INFO : loading wv recursively from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_es_heroes.model.wv.* with mmap=None
2018-07-18 12:00:40,812 : INFO : loading vectors from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_es_heroes.model.wv.vectors.npy with mmap=None
2018-07-18 12:00:41,044 : INFO : setting ignored attribute vectors_norm to None
2018-07-18 12:00:41,048 : INFO : loading vocabulary recursively from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_es_heroes.model.vocabulary.* with mmap=None
2018-07-18 12:00:41,049 : INFO : loading trainables recursively from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_es_heroes.model.trainables.* with mmap=None
2018-07-18 12:00:41,050 : INFO : loading syn1neg from /Users/alp/Documents/Corpora/OpenSubtitles2018/w2v/reapos_min5_heroes/w2v_es_heroes.model.trainables.syn1neg.npy with mmap=None
2018-07-18 12:00:41

es Vocabulary size: 30000


In [4]:
#LOAD DATASETS
#!!!TESTING ON TRAINING SET!!!
# NOTE: the audio "test" path is read from the AUDIO_TRAIN_DATA_FILE config key,
# so every cell that uses AUDIO_TEST_DATA_PATH evaluates on the training data.
AUDIO_TEST_DATA_PATH = config["AUDIO_TRAIN_DATA_FILE"]
# Path of the text dataset used by the text -> text cells.
TEXT_TEST_DATA_PATH = config['TEXT_TEST_DATA_PATH']

In [5]:
#Initialize text models
# Checkpoint files of the text-only encoder/decoder pair.
text_encoder_path = 'models/reapos_encoder.model'
text_decoder_path = 'models/reapos_decoder.model'

# Both networks are built from the language objects (vocabulary size and weights
# matrix) and the network settings read from the params file.
text_encoder = GenericEncoder(input_lang.vocabulary_size, hidden_size, input_lang.get_weights_matrix(), n_layers)
text_decoder = LuongAttnDecoderRNN(attn_model, hidden_size, output_lang.get_weights_matrix(), output_lang.vocabulary_size, n_layers)
# load_model is a project helper; presumably it restores the saved weights into
# the two models (TODO confirm). options.gpu2cpu is passed as its last positional argument.
load_model(text_encoder, text_decoder, text_encoder_path, text_decoder_path, options.gpu2cpu)

gpu2cpu: True


In [6]:
#Initialize prosodic models
# prosodic_encoder_path = 'models/audio1test_encoder.model'
# prosodic_decoder_path = 'models/audio1test_decoder.model'

prosodic_encoder_path = 'models/simpleloss_encoder.model'
prosodic_decoder_path = 'models/simpleloss_decoder.model'

# Guard first: only the 'sum' and 'parallel' encoder variants are handled here.
if encoder_type not in ('sum', 'parallel'):
    sys.exit("Unrecognized encoder type. Check params file. Exiting...")

# Both variants take the same constructor arguments, so the class is selected
# and then called once.
prosodic_encoder = (EncoderRNN_sum if encoder_type == 'sum' else EncoderRNN_parallel)(
    input_lang.vocabulary_size,
    n_prosody_params,
    hidden_size,
    input_lang.get_weights_matrix(),
    n_layers,
)
prosodic_decoder = ProsodicDecoderRNN(attn_model, hidden_size, output_lang.vocabulary_size, n_layers)

load_model(prosodic_encoder, prosodic_decoder, prosodic_encoder_path, prosodic_decoder_path, gpu_to_cpu=options.gpu2cpu)

gpu2cpu: True


In [7]:
#DATA GENERATORS
def text_data_generator(data_path, input_lang, output_lang, stop_at=-1):
    """Yield tokenized (input, output) sentence pairs from a tab-separated text file.

    Every line is expected to contain two tab-separated sentences. The first
    column is used as the input when ``input_lang.lang_code`` is ``'en'`` and the
    second column when it is ``'es'``; the other column becomes the output.
    Sentences are lower-cased and split on whitespace, and punctuation tokens
    are removed for a language whose ``omit_punctuation`` flag is set.

    Args:
        data_path: path of the tab-separated text file (read as UTF-8).
        input_lang: language object exposing ``lang_code`` and ``omit_punctuation``.
        output_lang: language object exposing ``omit_punctuation``.
        stop_at: maximum number of pairs to yield; ``-1`` means no limit.

    Yields:
        ``(in_sentence_tokens, out_sentence_tokens)`` lists of string tokens.

    Raises:
        ValueError: if ``input_lang.lang_code`` is neither ``'en'`` nor ``'es'``.
    """
    # Resolve the column layout once instead of on every line; an unknown code
    # would otherwise leave the sentence variables unbound inside the loop.
    if input_lang.lang_code == 'en':
        in_column, out_column = 0, 1
    elif input_lang.lang_code == 'es':
        in_column, out_column = 1, 0
    else:
        raise ValueError("Unsupported input language code: %r" % (input_lang.lang_code,))

    count = 0
    # Explicit encoding: the data contains accented characters and must not
    # depend on the platform's default locale encoding.
    with open(data_path, 'r', encoding='utf-8') as inputfile:
        for line in inputfile:
            if stop_at != -1 and count >= stop_at:
                break

            pair = [sentence.strip() for sentence in line.split('\t')]
            if len(pair) < 2:
                # Blank or malformed line (no tab separator): nothing to pair up.
                continue

            in_sentence_tokens = pair[in_column].lower().split()
            out_sentence_tokens = pair[out_column].lower().split()

            if input_lang.omit_punctuation:
                in_sentence_tokens = remove_punc_tokens(in_sentence_tokens)
            if output_lang.omit_punctuation:
                out_sentence_tokens = remove_punc_tokens(out_sentence_tokens)

            count += 1

            yield in_sentence_tokens, out_sentence_tokens
            

def audio_data_generator(data_path, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params, stop_at = -1):
    """Yield word and prosody token sequences for each segment of an audio dataset file.

    The dataset file is read with ``read_audio_dataset_file`` (no shuffling).
    Each entry is indexed as ``(es_txt, es_csv, en_txt, en_csv)``; only the two
    csv proscript entries are used, and they are assigned to the input/output
    side according to the language codes.

    Args:
        data_path: path of the audio dataset file.
        input_lang: input language object (``lang_code``, ``omit_punctuation``).
        output_lang: output language object (``lang_code``, ``omit_punctuation``).
        n_prosody_params: forwarded to ``read_data_from_proscript``.
        input_prosody_params: forwarded to ``read_data_from_proscript`` for the input side.
        output_prosody_params: forwarded to ``read_data_from_proscript`` for the output side.
        stop_at: maximum number of samples to yield; ``-1`` means no limit.

    Yields:
        ``(in_sentence_tokens, in_prosody_tokens, out_sentence_tokens, out_prosody_tokens)``.

    Raises:
        ValueError: if the language pair is neither en->es nor es->en.
    """
    assert not input_lang == output_lang

    language_pair = (input_lang.lang_code, output_lang.lang_code)
    if language_pair not in (('en', 'es'), ('es', 'en')):
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError("Unsupported language pair: %s -> %s" % language_pair)

    audio_data = read_audio_dataset_file(data_path, shuffle=False)

    #start generating samples from the proscript links in the data file
    count = 0
    for segment_data in audio_data:
        if stop_at != -1 and count >= stop_at:
            break

        es_csv = segment_data[1]
        en_csv = segment_data[3]

        if language_pair == ('en', 'es'):
            input_proscript, output_proscript = en_csv, es_csv
        else:
            input_proscript, output_proscript = es_csv, en_csv

        in_sentence_tokens, in_prosody_tokens = read_data_from_proscript(input_proscript, input_lang, n_prosody_params, input_prosody_params, punctuation_as_tokens = not input_lang.omit_punctuation)
        out_sentence_tokens, out_prosody_tokens = read_data_from_proscript(output_proscript, output_lang, n_prosody_params, output_prosody_params, punctuation_as_tokens = not output_lang.omit_punctuation)

        # The counter was never advanced before, so stop_at had no effect for
        # any positive limit.
        count += 1

        yield in_sentence_tokens, in_prosody_tokens, out_sentence_tokens, out_prosody_tokens

In [8]:
#various utilities
def print_prosody(prosody):
    """Print a prosody sequence as a numpy array with its axes transposed."""
    transposed = np.array(prosody).T
    print(transposed)
    
def print_tokens_with_pause(tokens, pausevals=None, pauseflags=None):
    """Print, and return, the tokens on one line with pause values shown in brackets.

    Each token is followed by a space. When pause values are given, a token
    whose flag is truthy is additionally followed by ``[x.xx] `` with the pause
    value formatted to two decimals.

    Args:
        tokens: sequence of string tokens.
        pausevals: optional sequence (list or numpy array) with one pause value
            per token. When missing or empty, only the tokens are printed.
        pauseflags: optional sequence with one flag per token. When missing or
            empty, the flags are derived from ``pausevals`` via ``flags_from_value``.

    Returns:
        The string that was printed.
    """
    # Emptiness is tested with len() rather than `== []`: callers pass numpy
    # columns (e.g. `prosody[:, 0]`), for which `== []` is an element-wise
    # comparison that warns or raises depending on the numpy version.
    has_pausevals = pausevals is not None and len(pausevals) > 0
    has_pauseflags = pauseflags is not None and len(pauseflags) > 0

    if has_pausevals and not has_pauseflags:
        pauseflags = flags_from_value(pausevals)

    pieces = []
    for i, token in enumerate(tokens):
        pieces.append(token + " ")
        if has_pausevals and pauseflags[i]:
            pieces.append("[" + "{:.2f}".format(pausevals[i]) + "] ")

    to_print = "".join(pieces)
    print(to_print)
    return to_print
            
def flags_from_value(prosody_seq):
    """Return one binary flag per value: 0 where the value equals 0.0, 1 otherwise."""
    flags = []
    for feature_value in prosody_seq:
        flags.append(0 if feature_value == 0.0 else 1)
    return flags

In [None]:
#SEE TEXT DATA
# Step through the text pairs one at a time; press Enter for the next pair or
# type 'q' (any case) to stop.
for in_sent, out_sent in text_data_generator(TEXT_TEST_DATA_PATH, input_lang, output_lang):
    print(in_sent)
    print(out_sent)
    # Stored in `inp` rather than `exit` so the `exit` builtin is not shadowed
    # for the rest of the session.
    inp = input('...')
    if inp.strip().lower() == 'q':
        break

In [9]:
#SEE AUDIO DATA
# Step through the audio samples one at a time; press Enter for the next sample
# or type 'q' (any case) to stop.
for in_sent, in_pros, out_sent, out_pros in audio_data_generator(AUDIO_TEST_DATA_PATH, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params):
    print(in_sent)
    print(indexes_from_tokens(input_lang, in_sent))
    # Only the input prosody is passed through finalize_prosody_sequence before
    # printing; the output prosody is printed as read.
    in_pros = finalize_prosody_sequence(in_pros)
    print_prosody(in_pros)
    print(out_sent)
    print_prosody(out_pros)
    # Stored in `inp` rather than `exit` so the `exit` builtin is not shadowed;
    # the comparison ignores case and surrounding whitespace so 'Q' also quits.
    inp = input('...')
    if inp.strip().lower() == 'q':
        break

['my', 'friend', 'has', 'a', 'new', 'trick']
[25, 263, 114, 8, 177, 1525, 29998]
[[ 0.      0.      0.      0.      0.      0.      0.    ]
 [-2.0687  3.3615  3.0931  1.9758  4.5204  0.      0.    ]
 [-2.6281  0.4716  1.8355  1.5712  1.0302  0.      0.    ]]
['tiene', 'una', 'nueva', 'habilidad', '.']
[[ 0.      0.      0.      0.      0.    ]
 [ 4.3959  5.1889 -1.7019 -2.4889 -2.4889]
 [ 1.1604  0.9001  0.6358 -1.0409 -1.0409]]
...Q
['now', 'you', 'can', 'afford', 'the', '65', 'dollars', 'i', 'asked', 'you', 'for']
[62, 2, 37, 1582, 4, 6780, 1269, 3, 414, 2, 24, 29998]
[[ 0.      0.05    0.      0.      0.      0.      0.      0.      0.
   0.06    0.      0.    ]
 [ 2.7948 -0.1108  1.8794  3.5223  2.8689  2.8689 -1.1989  0.1508 -7.7112
  -0.2875 -2.4603  0.    ]
 [-0.0529  1.6238  0.238   0.238   1.8881 -1.586   0.8057  1.0827 -0.3489
  -2.9184 -2.9184  0.    ]]
['así', 'podrás', 'darme', 'los', '65', 'dólares', 'que', 'te', 'he', 'pedido', '.']
[[ 0.03    0.      0.      0.      0. 

In [10]:
#EVALUATORS
#text -> text
def evaluate_text(input_seq_tokens, input_lang, output_lang, encoder, decoder, max_length, USE_CUDA=False):
    """Greedily translate one tokenized sentence with the text-only encoder/decoder.

    Args:
        input_seq_tokens: tokens of the input sentence.
        input_lang: language object used to turn the input tokens into indexes.
        output_lang: language object providing ``token2index`` / ``index2token``
            for the output side.
        encoder: called as ``encoder(word_batch, lengths, None)``; returns
            ``(encoder_outputs, encoder_hidden)``.
        decoder: called as ``decoder(decoder_input, hidden, encoder_outputs)``;
            returns ``(output, hidden, attention)`` and exposes ``n_layers``.
        max_length: limit applied to the input index sequence and to the
            number of decoding steps.
        USE_CUDA: when True the input tensors are moved to the GPU.

    Returns:
        ``(decoded_words, attentions)``: the decoded tokens (the last one is
        ``EOS_TOKEN`` only if the decoder produced it within ``max_length``
        steps) and the attention weights with one row per decoding step.

    Side effects:
        Both models are put in evaluation mode while decoding and are switched
        to training mode before returning.
    """
    input_word_seqs = [indexes_from_tokens(input_lang, input_seq_tokens)]

    # Make sure the sequence is not longer than max_length.
    input_word_seqs = limit_seqs_to_max(input_word_seqs, max_length)

    input_lengths = [len(input_word_seqs[0])]
    input_word_batch = Variable(torch.LongTensor(input_word_seqs)).transpose(0, 1)

    # The decoder starts from the start-of-sequence token.
    decoder_input = Variable(torch.LongTensor([output_lang.token2index(SWT_TOKEN)]))

    if USE_CUDA:
        # Previously this branch referenced the undefined names `input_batch`
        # and `input_prosody_batch`, so the GPU path raised a NameError.
        input_word_batch = input_word_batch.cuda()
        decoder_input = decoder_input.cuda()

    eos_index = output_lang.token2index(EOS_TOKEN)

    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)

    # Store output words and attention states
    decoded_words = []
    decoder_attentions = torch.zeros(max_length + 1, max_length + 1)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        encoder_outputs, encoder_hidden = encoder(input_word_batch, input_lengths, None)
        decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_attentions[di, :decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data

            # Choose top word from output
            _, topi = decoder_output.data.topk(1)
            ni = topi.item()
            if ni == eos_index:
                decoded_words.append(EOS_TOKEN)
                break
            decoded_words.append(output_lang.index2token(ni))

            # Next input is chosen word
            decoder_input = Variable(torch.LongTensor([ni]))
            if USE_CUDA:
                decoder_input = decoder_input.cuda()

    # Set back to training mode
    encoder.train(True)
    decoder.train(True)

    # One word is stored per decoding step, so len(decoded_words) is the number
    # of attention rows filled (and is well defined even when max_length is 0).
    return decoded_words, decoder_attentions[:len(decoded_words), :len(encoder_outputs)]

#text+audio -> text+audio
#use prosodic encoder/decoder
def evaluate_audio(input_seq_tokens, input_prosody_tokens, input_lang, output_lang, encoder, decoder, max_length, USE_CUDA=False):
    """Greedily translate one sentence with the prosodic encoder/decoder.

    Args:
        input_seq_tokens: tokens of the input sentence.
        input_prosody_tokens: prosody values of the input sentence; passed
            through ``finalize_prosody_sequence`` before encoding.
        input_lang: language object used to turn the input tokens into indexes.
        output_lang: language object providing ``token2index`` / ``index2token``
            for the output side.
        encoder: called as ``encoder(word_batch, prosody_batch, lengths, None)``;
            returns ``(encoder_outputs, encoder_hidden)``.
        decoder: called as ``decoder(word_input, hidden, encoder_outputs)``;
            returns ``(word_output, pauseflag_output, pausevalue_output, hidden,
            attention)`` and exposes ``n_layers``.
        max_length: limit applied to the input sequences and to the number of
            decoding steps.
        USE_CUDA: when True the input tensors are moved to the GPU.

    Returns:
        ``(decoded_word_seq, decoded_pauseflag_seq, decoded_pausevalue_seq,
        attentions)``. The three sequences have one entry per decoding step;
        the word sequence ends with ``EOS_TOKEN`` only if the decoder produced
        it within ``max_length`` steps.

    Side effects:
        Both models are put in evaluation mode while decoding and are switched
        to training mode before returning.
    """
    input_word_seqs = [indexes_from_tokens(input_lang, input_seq_tokens)]
    input_prosody_seqs = [finalize_prosody_sequence(input_prosody_tokens)] #put the end token

    # Make sure the sequences are not longer than max_length.
    input_word_seqs = limit_seqs_to_max(input_word_seqs, max_length)
    input_prosody_seqs = limit_seqs_to_max(input_prosody_seqs, max_length)

    input_lengths = [len(input_word_seqs[0])]
    input_word_batch = Variable(torch.LongTensor(input_word_seqs)).transpose(0, 1)
    input_prosody_batch = Variable(torch.FloatTensor(input_prosody_seqs)).transpose(0, 1)

    # The decoder starts from the start-of-sequence token.
    decoder_word_input = Variable(torch.LongTensor([output_lang.token2index(SWT_TOKEN)]))

    if USE_CUDA:
        # Previously this branch referenced the undefined name `input_batch`,
        # so the GPU path raised a NameError.
        input_word_batch = input_word_batch.cuda()
        input_prosody_batch = input_prosody_batch.cuda()
        decoder_word_input = decoder_word_input.cuda()

    eos_index = output_lang.token2index(EOS_TOKEN)

    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)

    # Store output words, pause predictions and attention states
    decoded_word_seq = []
    decoded_pauseflag_seq = []
    decoded_pausevalue_seq = []
    decoder_attentions = torch.zeros(max_length + 1, max_length + 1)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        encoder_outputs, encoder_hidden = encoder(input_word_batch, input_prosody_batch, input_lengths, None)
        decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder

        for di in range(max_length):
            decoder_output_word, decoder_output_pauseflag, decoder_output_pausevalue, decoder_hidden, decoder_attention = decoder(
                decoder_word_input, decoder_hidden, encoder_outputs
            )
            decoder_attentions[di, :decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data

            # Choose top word from output
            _, topi = decoder_output_word.data.topk(1)
            ni_word = topi.item()
            reached_end = ni_word == eos_index
            decoded_word_seq.append(EOS_TOKEN if reached_end else output_lang.index2token(ni_word))

            # Pause predictions are recorded for every step, including the end token.
            _, topi = decoder_output_pauseflag.data.topk(1)
            decoded_pauseflag_seq.append(topi.item())
            decoded_pausevalue_seq.append(decoder_output_pausevalue.item())

            if reached_end:
                break

            # Next input is chosen word
            decoder_word_input = Variable(torch.LongTensor([ni_word]))
            if USE_CUDA:
                decoder_word_input = decoder_word_input.cuda()

    # Set back to training mode
    encoder.train(True)
    decoder.train(True)

    # One word is stored per decoding step, so len(decoded_word_seq) is the
    # number of attention rows filled (well defined even when max_length is 0).
    return decoded_word_seq, decoded_pauseflag_seq, decoded_pausevalue_seq, decoder_attentions[:len(decoded_word_seq), :len(encoder_outputs)]

In [11]:
#text+audio -> text+audio
#use prosodic encoder/decoder with output forcing
def evaluate_audio_force(input_seq_tokens, input_prosody_tokens, out_gold_sentence_tokens, input_lang, output_lang, encoder, decoder, max_length, USE_CUDA=False):
    """Run the prosodic encoder/decoder while feeding the gold output words to the decoder.

    At every step the decoder's own predictions are recorded, but the word fed
    to the next step is the gold output word of the current step rather than
    the prediction. The number of decoding steps equals the length of the gold
    index sequence after it has been limited to ``max_length``.

    Args:
        input_seq_tokens: tokens of the input sentence.
        input_prosody_tokens: prosody values of the input sentence; passed
            through ``finalize_prosody_sequence`` before encoding.
        out_gold_sentence_tokens: tokens of the reference output sentence.
        input_lang: language object used to turn the input tokens into indexes.
        output_lang: language object used for the gold tokens and for
            ``token2index`` / ``index2token`` on the output side.
        encoder: called as ``encoder(word_batch, prosody_batch, lengths, None)``;
            returns ``(encoder_outputs, encoder_hidden)``.
        decoder: called as ``decoder(word_input, hidden, encoder_outputs)``;
            returns ``(word_output, pauseflag_output, pausevalue_output, hidden,
            attention)`` and exposes ``n_layers``.
        max_length: limit applied to the input and gold sequences.
        USE_CUDA: when True the tensors are moved to the GPU.

    Returns:
        ``(decoded_word_seq, decoded_pauseflag_seq, decoded_pausevalue_seq,
        attentions)`` with one entry (or attention row) per decoding step.

    Side effects:
        Both models are put in evaluation mode while decoding and are switched
        to training mode before returning.
    """
    input_word_seqs = [indexes_from_tokens(input_lang, input_seq_tokens)]
    output_word_seqs = [indexes_from_tokens(output_lang, out_gold_sentence_tokens)]
    input_prosody_seqs = [finalize_prosody_sequence(input_prosody_tokens)] #put the end token

    # Make sure the sequences are not longer than max_length.
    input_word_seqs = limit_seqs_to_max(input_word_seqs, max_length)
    output_word_seqs = limit_seqs_to_max(output_word_seqs, max_length)
    input_prosody_seqs = limit_seqs_to_max(input_prosody_seqs, max_length)

    input_lengths = [len(input_word_seqs[0])]
    input_word_batch = Variable(torch.LongTensor(input_word_seqs)).transpose(0, 1)
    input_prosody_batch = Variable(torch.FloatTensor(input_prosody_seqs)).transpose(0, 1)
    output_word_batch = Variable(torch.LongTensor(output_word_seqs)).transpose(0, 1)

    output_length = len(output_word_seqs[0])

    # The decoder starts from the start-of-sequence token.
    decoder_word_input = Variable(torch.LongTensor([output_lang.token2index(SWT_TOKEN)]))

    if USE_CUDA:
        # Previously this branch referenced the undefined name `input_batch`,
        # so the GPU path raised a NameError.
        input_word_batch = input_word_batch.cuda()
        input_prosody_batch = input_prosody_batch.cuda()
        output_word_batch = output_word_batch.cuda()
        decoder_word_input = decoder_word_input.cuda()

    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)

    # Store output words, pause predictions and attention states
    decoded_word_seq = []
    decoded_pauseflag_seq = []
    decoded_pausevalue_seq = []
    decoder_attentions = torch.zeros(max_length + 1, max_length + 1)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        encoder_outputs, encoder_hidden = encoder(input_word_batch, input_prosody_batch, input_lengths, None)
        decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder

        for t in range(output_length):
            decoder_output_word, decoder_output_pauseflag, decoder_output_pausevalue, decoder_hidden, decoder_attention = decoder(
                decoder_word_input, decoder_hidden, encoder_outputs
            )
            decoder_attentions[t, :decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data

            # Record the decoder's own top word for this step
            _, topi = decoder_output_word.data.topk(1)
            decoded_word_seq.append(output_lang.index2token(topi.item()))

            # Record the pause predictions for this step
            _, topi = decoder_output_pauseflag.data.topk(1)
            decoded_pauseflag_seq.append(topi.item())
            decoded_pausevalue_seq.append(decoder_output_pausevalue.item())

            # Output forcing: the next input is the gold word of this step
            # (already on the GPU when USE_CUDA is set).
            decoder_word_input = output_word_batch[t]

    # Set back to training mode
    encoder.train(True)
    decoder.train(True)

    # One word is stored per decoding step, so len(decoded_word_seq) is the
    # number of attention rows filled (well defined even for an empty gold sequence).
    return decoded_word_seq, decoded_pauseflag_seq, decoded_pausevalue_seq, decoder_attentions[:len(decoded_word_seq), :len(encoder_outputs)]

In [None]:
#Text -> text evaluator test
# Translate one hand-written sentence with the text-only models as a quick check.
input_sentence = "oh this is a terrible disaster this shouldn 't have never happened"
# The sentence is already written with space-separated tokens, so a plain split suffices.
input_seq_tokens = input_sentence.split()
# The second return value (attention matrix) is not needed here.
decoded_words, _ = evaluate_text(input_seq_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
print(decoded_words)

In [12]:
# The generator object is created in its own cell: a loop over `gen` that is
# interrupted and re-run continues from the next unread sample. Re-run this
# cell to start again from the first sample.
gen = audio_data_generator(AUDIO_TEST_DATA_PATH, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params)

In [13]:
#Text+prosody -> text+prosody on audio data
# For every sample, print the input (IN), the reference (GT) and three system
# outputs: free-running prosodic decoding (OUT PROSODY), prosodic decoding with
# the gold words fed to the decoder (OUT FORCE) and the text-only model (OUT TEXT).
# After every `print_every` samples the loop pauses; type 'q' (any case) to stop.
print_every = 10
print_count = 0
for in_sentence_tokens, in_prosody_tokens , out_sentence_tokens, out_prosody_tokens in gen:
    print_count += 1
    print("IN")
    # Column 0 of the prosody matrix is what gets printed as the pause value.
    print_tokens_with_pause(in_sentence_tokens, in_prosody_tokens[:,0])
    print("GT")
    print_tokens_with_pause(out_sentence_tokens, out_prosody_tokens[:,0])
    
    print("OUT PROSODY")
    translation_tokens, pauseflag_tokens, pausevalue_tokens, _ = evaluate_audio(in_sentence_tokens, in_prosody_tokens, input_lang, output_lang, prosodic_encoder, prosodic_decoder, max_seq_length)
    print_tokens_with_pause(translation_tokens, pausevalue_tokens, pauseflag_tokens)
    print("OUT FORCE")
    translation_tokens, pauseflag_tokens, pausevalue_tokens, _ = evaluate_audio_force(in_sentence_tokens, in_prosody_tokens, out_sentence_tokens, input_lang, output_lang, prosodic_encoder, prosodic_decoder, max_seq_length)
    print_tokens_with_pause(translation_tokens, pausevalue_tokens, pauseflag_tokens)
    print("OUT TEXT")
    translation_tokens, _ = evaluate_text(in_sentence_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
    print_tokens_with_pause(translation_tokens)
    
    if print_count % print_every == 0:
        inp = input("...")
        # Ignore case and surrounding whitespace so that 'Q' or 'q ' also quits.
        if inp.strip().lower() == 'q':
            break
    print("======================================================================")


IN
my friend has a new trick 
GT
tiene una nueva habilidad . 
OUT PROSODY
mi amigo es un truco de nuevo . END 
OUT FORCE
mi un compañera amigo . END 
OUT TEXT
mi amigo tiene un nuevo truco . END 
IN
now you [0.05] can afford the 65 dollars i asked you [0.06] for 
GT
así [0.03] podrás darme los 65 dólares que te he pedido . 
OUT PROSODY




¿ no puedes dar a 100 dólares ? te pedí que te pedí . END 
OUT FORCE
¿ se dar el 65 dólares . te pedí pedido . END 
OUT TEXT
a las costumbres , te lo pedí por la borda . END 
IN
day number five without the meds 
GT
cinco días sin medicinas . 
OUT PROSODY
el número 5 sin los medicación . END 
OUT FORCE
el días sin los . END 
OUT TEXT
el día , el número 5 sin la medicación . END 
IN
he was my friend too [0.03] 400 years ago 
GT
también lo era mío [0.31] hace 400 años . 
OUT PROSODY
era mi amigo , un [0.23] amigo de uno . END 
OUT FORCE
era era era , , mucho años . END 
OUT TEXT
era mi amigo , hace unos años , por aquí . END 
IN
she 's from america 
GT
es de américa . 
OUT PROSODY
es de el estados unidos . END 
OUT FORCE
es de el . END 
OUT TEXT
es de américa . END 
IN
no no not that thing [0.45] the other thing i 'll let you know 
GT
no , no . eso no [0.39] . la otra cosa [0.04] . ya te [0.03] cuento . 
OUT PROSODY
no , no , no , no , no , no , no , no , no , no , no , no , no , no , no 

In [None]:
#Text -> text on text data
def text_set_translation_generator(stop_at = -1, report=False):
    """Translate the text dataset and yield (references, hypothesis) pairs for BLEU scoring.

    Sentence pairs are read with ``text_data_generator`` from
    ``TEXT_TEST_DATA_PATH`` and translated with the text-only models.

    Args:
        stop_at: forwarded to ``text_data_generator``; ``-1`` means no limit.
        report: when True, print source (``>``), reference (``=``) and
            translation (``<``) for every pair.

    Yields:
        ``([out_sentence_tokens], translation_tokens)``: a one-element list of
        reference token lists and the translated tokens without the end marker.
    """
    for in_sentence_tokens, out_sentence_tokens in text_data_generator(TEXT_TEST_DATA_PATH, input_lang, output_lang, stop_at):
        translation_tokens, _ = evaluate_text(in_sentence_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)

        # evaluate_text appends EOS_TOKEN only when the decoder produced it; a
        # translation cut off at max_seq_length has no end marker, and blindly
        # dropping the last token would remove a real word from the hypothesis.
        if translation_tokens and translation_tokens[-1] == EOS_TOKEN:
            translation_tokens = translation_tokens[:-1]

        if report:
            print("> %s"%(readable_from_tokens(in_sentence_tokens)))
            print("= %s"%(readable_from_tokens(out_sentence_tokens)))
            print("< %s"%readable_from_tokens(translation_tokens))
            print("---")

        yield [out_sentence_tokens], translation_tokens
        
# Score the translations of the first 100 text pairs (stop_at=100); report=True
# also prints every source/reference/translation triple while scoring.
# compute_bleu is a project helper; presumably max_order=4 means n-grams up to
# length 4 and smooth=False disables smoothing (TODO confirm).
testing_set_bleu, sentence_count = compute_bleu(text_set_translation_generator(stop_at=100, report=True), max_order=4, smooth=False)

print("Evaluated %i samples."%sentence_count)
print("BLEU: ", testing_set_bleu)

In [16]:
#Text -> text on audio data
def audio_set_text_translation_generator(stop_at = -1, report=False):
    """Translate the text side of the audio dataset and yield (references, hypothesis) pairs.

    Samples are read with ``audio_data_generator`` from ``AUDIO_TEST_DATA_PATH``;
    the prosody sequences are ignored and the sentences are translated with the
    text-only models.

    Args:
        stop_at: forwarded to ``audio_data_generator``; ``-1`` means no limit.
        report: when True, print source (``>``), reference (``=``) and
            translation (``<``) for every sample.

    Yields:
        ``([out_sentence_tokens], translation_tokens)``: a one-element list of
        reference token lists and the translated tokens without the end marker.
    """
    # stop_at used to be accepted but never passed on, so it had no effect.
    samples = audio_data_generator(AUDIO_TEST_DATA_PATH, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params, stop_at=stop_at)
    for in_sentence_tokens, _ , out_sentence_tokens, _ in samples:
        translation_tokens, _ = evaluate_text(in_sentence_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)

        # evaluate_text appends EOS_TOKEN only when the decoder produced it; a
        # translation cut off at max_seq_length has no end marker, and blindly
        # dropping the last token would remove a real word from the hypothesis.
        if translation_tokens and translation_tokens[-1] == EOS_TOKEN:
            translation_tokens = translation_tokens[:-1]

        if report:
            print("> %s"%(readable_from_tokens(in_sentence_tokens)))
            print("= %s"%(readable_from_tokens(out_sentence_tokens)))
            print("< %s"%readable_from_tokens(translation_tokens))
            print("---")

        yield [out_sentence_tokens], translation_tokens
        
# Score the text-only translations of the audio dataset; the generator is called
# with its defaults, so no sample limit is requested and nothing is printed per sample.
# compute_bleu is a project helper; presumably max_order=4 means n-grams up to
# length 4 and smooth=False disables smoothing (TODO confirm).
testing_set_bleu, sentence_count = compute_bleu(audio_set_text_translation_generator(), max_order=4, smooth=False)

print("Evaluated %i samples."%sentence_count)
print("BLEU: ", testing_set_bleu)

39
1
40


Unknown (en): tubettes
Unknown (en): petrellis
Unknown (en): lempiras
Unknown (en): zeitlan
Unknown (en): powerwalk
Unknown (en): lempiras
Unknown (en): cholinergic


39
1
40


Unknown (en): zeitlan
Unknown (en): pwen
Unknown (en): rasmalai


39
1
40


Unknown (en): unextraordinary
Unknown (en): for40


39
1
40


Unknown (en): homicidio
Unknown (en): showboated
Unknown (en): annapura
Unknown (en): annapura
Unknown (en): conﬁned
Unknown (en): lllllom


39
1
40


Unknown (en): dopamines
Unknown (en): canmore


Evaluated 6141 samples.
BLEU:  0.11291731515763413


Unknown (en): m.0
