In [1]:
import string
import re
import os
import sys
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

from utils import *
from models import *
import yaml
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
FIRST_RUN = True

In [2]:
class Options:
    def __init__(self):
        self.model_name = 'alp'

options = Options()
options.output_file = 'test_output/reapos_test_text.txt'
options.params_file = 'params-v1.yaml'
options.use_cuda = False
options.use_validation = True
options.gpu2cpu = True

In [13]:
#LOAD CONFIGURATIONS AND LANGUAGES
USE_CUDA = options.use_cuda
print("Use cuda: %s" %USE_CUDA)

try:
    with open(options.params_file, 'r') as ymlfile:
        config = yaml.load(ymlfile)
except:
    sys.exit("Parameters file missing")

#Setup languages
INPUT_LANG_CODE = config['INPUT_LANG']
OUTPUT_LANG_CODE = config['OUTPUT_LANG']

INPUT_LANG_PUNC_LEVEL = config["INPUT_LANG_PUNC_LEVEL"]
OUTPUT_LANG_PUNC_LEVEL = config["OUTPUT_LANG_PUNC_LEVEL"]

INPUT_LANG_PUNC_LEVEL = 1
OUTPUT_LANG_PUNC_LEVEL = 2

print('Input punc lvl: ', INPUT_LANG_PUNC_LEVEL)
print('Output punc lvl: ', OUTPUT_LANG_PUNC_LEVEL)

if FIRST_RUN: 
    if INPUT_LANG_CODE == 'en' and OUTPUT_LANG_CODE == 'es':
        lang_en = input_lang = Lang(INPUT_LANG_CODE, config["W2V_EN_PATH"], config["DICT_EN_PATH"], punctuation_level=INPUT_LANG_PUNC_LEVEL)
        lang_es = output_lang = Lang(OUTPUT_LANG_CODE, config["W2V_ES_PATH"], config["DICT_ES_PATH"], punctuation_level=OUTPUT_LANG_PUNC_LEVEL)
    elif INPUT_LANG_CODE == 'es' and OUTPUT_LANG_CODE == 'en':
        lang_es = input_lang = Lang(INPUT_LANG_CODE, config["W2V_ES_PATH"], config["DICT_ES_PATH"], punctuation_level=INPUT_LANG_PUNC_LEVEL)
        lang_en = output_lang = Lang(OUTPUT_LANG_CODE, config["W2V_EN_PATH"], config["DICT_EN_PATH"], punctuation_level=OUTPUT_LANG_PUNC_LEVEL)
    FIRST_RUN = False

input_prosody_params = config['INPUT_PROSODY']
if input_prosody_params == None:
    input_prosody_params = []
output_prosody_params = config['OUTPUT_PROSODY']
if output_prosody_params == None:
    output_prosody_params = []    
    
#NETWORK CONFIG
max_seq_length = int(config['MAX_SEQ_LENGTH'])
n_prosody_params = int(config['N_PROSODY_PARAMS'])
input_prosody_params = config['INPUT_PROSODY']
encoder_type = config['ENCODER_TYPE']
attn_model = config['ATTN_MODEL']
hidden_size = int(config['HIDDEN_SIZE'])
n_layers = int(config['N_LAYERS'])

Use cuda: False
Input punc lvl:  1
Output punc lvl:  2


In [5]:
#LOAD DATASETS PATHS
AUDIO_TEST_DATA_PATH = config["AUDIO_TEST_DATA_FILE"]
AUDIO_TRAIN_DATA_PATH = config["AUDIO_TRAIN_DATA_FILE"]
AUDIO_VALIDATION_DATA_PATH = config["AUDIO_VALIDATION_DATA_FILE"]
TEXT_TEST_DATA_PATH = config['TEXT_TEST_DATA_PATH']
AUDIO_ALL_DATA_PATH = "/Users/alp/Movies/heroes/transProse_data/audiodata-v1/transProse_audiodata.txt"
AUDIO_SIGNIFICANT_TEST_DATA_PATH = "/Users/alp/Movies/heroes/transProse_data/audiodata-v2/transProse_audiodata-v2_test_pausesignificant.txt"
AUDIO_PUNKPROSED_TEST_V1_DATA_PATH = "/Users/alp/Movies/heroes/transProse_data/audiodata-v1/transProse_audiodata_test_punkProsed.txt"
AUDIO_PUNKPROSED_TEST_V2_DATA_PATH = "/Users/alp/Movies/heroes/transProse_data/audiodata-v2/transProse_audiodata-v2_test_punkProsed.txt"
AUDIO_PUNKPROSED_TEST_NEW_DATA_PATH = "/Users/alp/Movies/heroes/transProse_data/audiodata-new/transProse_heroes_new_audiodata_test_punkProsed.txt"

print("AUDIO_TEST_DATA_PATH", AUDIO_TEST_DATA_PATH)

AUDIO_TEST_DATA_PATH /Users/alp/Movies/heroes/transProse_data/audiodata-v1/transProse_audiodata_test.txt


In [6]:
#Initialize text models
# text_encoder_path = 'models/5mmheroes_puncdinput_encoder.model'
# text_decoder_path = 'models/5mmheroes_puncdinput_decoder.model'

text_encoder_path = 'models/5mmheroes_puncdinput_encoder.model'
text_decoder_path = 'models/5mmheroes_puncdinput_decoder.model'

print('Input punc lvl: ', INPUT_LANG_PUNC_LEVEL)
print('Output punc lvl: ', OUTPUT_LANG_PUNC_LEVEL)

text_encoder = GenericEncoder(input_lang.vocabulary_size, hidden_size, input_lang.get_weights_matrix(), n_layers)
text_decoder = LuongAttnDecoderRNN(attn_model, hidden_size, output_lang.get_weights_matrix(), output_lang.vocabulary_size, n_layers, input_feed=config['DECODER_INPUT_FEED'])
load_model(text_encoder, text_decoder, text_encoder_path, text_decoder_path, options.gpu2cpu)

Input punc lvl:  2
Output punc lvl:  2
gpu2cpu: True


In [15]:
#Initialize prosodic models
AUDIO_ENCODE_ONLY = True
model_name = "audio_punctuatedin-v1"
prosodic_encoder_path = 'models/' + model_name + '_encoder.model'
prosodic_decoder_path = 'models/' + model_name + '_decoder.model'

if encoder_type == 'sum':
    prosodic_encoder = EncoderRNN_sum_ver(input_lang.vocabulary_size, n_prosody_params, hidden_size, input_lang.get_weights_matrix(), n_layers)
elif encoder_type == 'parallel':
    prosodic_encoder = EncoderRNN_parallel(input_lang.vocabulary_size, n_prosody_params, hidden_size, input_lang.get_weights_matrix(), n_layers)
else:
    sys.exit("Unrecognized encoder type. Check params file. Exiting...")
if AUDIO_ENCODE_ONLY:
    print("AUDIO_ENCODE_ONLY")
    prosodic_decoder = LuongAttnDecoderRNN(attn_model, hidden_size, output_lang.get_weights_matrix(), output_lang.vocabulary_size, n_layers, USE_CUDA=USE_CUDA)
else:
    prosodic_decoder = ProsodicDecoderRNN(attn_model, hidden_size, output_lang.get_weights_matrix(), output_lang.vocabulary_size, n_layers, USE_CUDA=USE_CUDA)
    
load_model(prosodic_encoder, prosodic_decoder, prosodic_encoder_path, prosodic_decoder_path, gpu_to_cpu=options.gpu2cpu)

AUDIO_ENCODE_ONLY
gpu2cpu: True


In [8]:
#DATA GENERATORS
#generates data from tab separated file
def text_data_generator(data_path, input_lang, output_lang, stop_at=-1):
    count = 0
    with open(data_path,'r') as inputfile:
        for line in inputfile:
            if not stop_at == -1 and count >= stop_at:
                break
            pair = [sentence.strip() for sentence in line.split('\t')]
            if input_lang.lang_code == 'en':
                in_sentence = pair[0]
                out_sentence = pair[1]
            elif input_lang.lang_code == 'es':
                in_sentence = pair[1]
                out_sentence = pair[0]

            in_sentence_tokens = in_sentence.lower().split()
            out_sentence_tokens = out_sentence.lower().split()

            if input_lang.punctuation_level == 0:
                in_sentence_tokens = remove_punc_tokens(in_sentence_tokens)
            elif input_lang.punctuation_level == 1:
                in_sentence_tokens = remove_punc_tokens(in_sentence_tokens, keep_main_puncs=True)
            if output_lang.punctuation_level == 0:
                out_sentence_tokens = remove_punc_tokens(out_sentence_tokens)
            elif output_lang.punctuation_level == 1:
                out_sentence_tokens = remove_punc_tokens(out_sentence_tokens, keep_main_puncs=True)

            count += 1

            yield in_sentence_tokens, out_sentence_tokens       
            
def audio_data_generator(data_path, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params, dummyfy_input_prosody=False, dummyfy_output_prosody=False, stop_at = -1):
    assert not input_lang == output_lang
    audio_data = read_audio_dataset_file(data_path, shuffle=False)

    #start generating samples from the proscript links in the data file
    count = 0
    for segment_data in audio_data:
        if not stop_at == -1 and count >= stop_at:
            break

        es_txt = segment_data[0]
        es_csv = segment_data[1]
        en_txt = segment_data[2]
        en_csv = segment_data[3]
        
        #print(en_csv)
        #print(es_csv)

        if input_lang.lang_code == 'en' and output_lang.lang_code == 'es':
            input_proscript = en_csv
            output_proscript = es_csv
            #input_transcript = read_text_file(en_txt)
            #output_transcript = read_text_file(es_txt)
        elif input_lang.lang_code == 'es' and output_lang.lang_code == 'en':
            input_proscript = es_csv
            output_proscript = en_csv
            #input_transcript = read_text_file(es_txt)
            #output_transcript = read_text_file(en_txt)
            
        if input_lang.punctuation_level == 0:
            input_punc = False
            input_only_main_punc = False
        elif input_lang.punctuation_level == 1:
            input_punc = True
            input_only_main_punc = True
        elif input_lang.punctuation_level == 2:
            input_punc = True
            input_only_main_punc = False
            
        if output_lang.punctuation_level == 0:
            output_punc = False
            output_only_main_punc = False
        elif output_lang.punctuation_level == 1:
            output_punc = True
            output_only_main_punc = True
        elif output_lang.punctuation_level == 2:
            output_punc = True
            output_only_main_punc = False

        in_sentence_tokens, in_prosody_tokens = read_data_from_proscript(input_proscript, input_lang, n_prosody_params, input_prosody_params, punctuation_as_tokens = input_punc, keep_only_main_puncs = input_only_main_punc)
        out_sentence_tokens, out_prosody_tokens = read_data_from_proscript(output_proscript, output_lang, n_prosody_params, output_prosody_params, punctuation_as_tokens = output_punc, keep_only_main_puncs = output_only_main_punc)
    
        if dummyfy_input_prosody:
            in_prosody_tokens = np.zeros_like(in_prosody_tokens)
        if dummyfy_output_prosody:
            out_prosody_tokens = np.zeros_like(out_prosody_tokens)
            
        count += 1
        yield in_sentence_tokens, in_prosody_tokens, out_sentence_tokens, out_prosody_tokens, en_csv, es_csv

In [9]:
#various utilities
def print_prosody(prosody):
    print(np.array(prosody).transpose())
    
def print_tokens_with_pause(tokens, pausevals = [], pauseflags=[], ssml_format=False, min_pause_duration=0.0):
    if pauseflags == [] and not pausevals == []:
        pauseflags = flags_from_value(pausevals)
        
    to_print = ""
    for i, token in enumerate(tokens):
        if not token == "END":
            to_print += token + " "
            if not pauseflags == [] and pauseflags[i]:
                pause_to_print = ""    
                if not ssml_format:
                    pause_to_print += "[P]"
                if not pausevals == []:
                    if pausevals[i] > min_pause_duration:
                        if ssml_format:
                            pause_to_print += '<break time="%ims"/>'%(pausevals[i]*1000)
                else:
                    if ssml_format:
                        pause_to_print += '<break time="0000ms"/>'
                
                to_print += pause_to_print + " "
            
    #print(to_print)
    return to_print
            
def flags_from_value(prosody_seq):
    return [1 if not feature_value == 0.0 else 0 for feature_value in prosody_seq]

def show_attention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') + ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    plt.close()

In [None]:
#SEE TEXT DATA
for in_sent, out_sent in text_data_generator(TEXT_TEST_DATA_PATH, input_lang, output_lang):
    print(in_sent)
    print(out_sent)
    exit = input('...')
    if exit == 'q':
        break

In [None]:
#SEE AUDIO DATA
max_f0 = 0.0
min_f0 = 0.0

input_prosody_params = ['pause_after', 'f0_mean', 'i0_mean']
output_prosody_params = ['pause_after', 'f0_mean', 'i0_mean']
n_prosody_params = 3
data_path = AUDIO_VALIDATION_DATA_PATH
for in_sent, in_pros, out_sent, out_pros, in_csv, out_csv in audio_data_generator(data_path, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params, dummyfy_input_prosody=False, dummyfy_output_prosody=False):
    if 'scott' in in_sent:
        print(in_csv)
        print(in_sent)
        print(indexes_from_tokens(input_lang, in_sent))
        in_pros = finalize_prosody_sequence(in_pros)
        print_prosody(in_pros)
        print(out_sent)
        print_prosody(out_pros)
#     print(os.path.basename(in_csv).split('.')[0] + '.wav')
#     in_string = print_tokens_with_pause(in_sent)
#     print(in_string)
#     gt_string = print_tokens_with_pause(out_sent, out_pros[:,0], ssml_format=True, min_pause_duration = 0.05)
#     gt_string = print_tokens_with_pause(out_sent)
#     print(gt_string)
    
#     rt = os.path.splitext(os.path.basename(in_csv))[0]
#     ppd = rt + "_punkProsed.csv"
#     full_ppd = os.path.join(path, ppd)
#     print("_\t%s\t_\t%s"%(out_csv, full_ppd))

#     print("%s\t%s"%(' '.join(in_sent), ' '.join(out_sent)))
    
        exit = input('...')
        if exit == 'q':
            break

/Users/alp/Movies/heroes/corpus/heroes_s3_12/spa-eng/segments_eng/heroes_s3_12_eng_aligned_eng0124.csv
['it', "'s", 'all', 'right', ',', 'scott', '.']
[10, 9, 42, 63, 1, 1797, 0, 29998]
[[ 0.      0.      0.      0.03    0.      0.      0.      0.    ]
 [-1.2936 -1.2936 -1.2936  2.1312  2.1312  0.2578  0.2578  0.    ]
 [ 0.0284  0.0284  0.0284  2.3402  2.3402  0.8338  0.8338  0.    ]]
['¿', 'qué', 'lleva', 'eso', '?']
[[0.     0.8    0.     0.     0.    ]
 [0.     0.     4.113  3.0542 3.0542]
 [0.     0.     0.667  0.1085 0.1085]]


In [10]:
#EVALUATORS
#text -> text
def evaluate_text(input_seq_tokens, input_lang, output_lang, encoder, decoder, max_length, USE_CUDA=False):
    input_word_seqs = [indexes_from_tokens(input_lang, input_seq_tokens)]
    
    #make sure sequences are below max_length. 
    input_word_seqs = limit_seqs_to_max(input_word_seqs, max_length)

    input_lengths = [len(input_word_seqs[0])]
    input_word_batch = Variable(torch.LongTensor(input_word_seqs)).transpose(0, 1)

    if USE_CUDA:
        input_word_batch = input_batch.cuda()
        input_prosody_batch = input_prosody_batch.cuda()

    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)

    # Run through encoder
    encoder_outputs, encoder_hidden = encoder(input_word_batch, input_lengths, None)

    # Create starting vectors for decoder
    decoder_input = Variable(torch.LongTensor([output_lang.token2index(SWT_TOKEN)])) # SOS
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))

    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        decoder_context = decoder_context.cuda()

    # Store output words and attention states
    decoded_words = []
    decoder_attentions = torch.zeros(max_length + 1, max_length + 1)

    # Run through decoder
    for di in range(max_length):
        decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder(
                decoder_input, decoder_context, decoder_hidden, encoder_outputs)
        
        decoder_attentions[di,:decoder_attn.size(2)] += decoder_attn.squeeze(0).squeeze(0).cpu().data

        # Choose top word from output
        topv, topi = decoder_output.data.topk(1)
        #ni = topi[0][0]  #old code
        ni = topi.item()
        if ni == output_lang.token2index(EOS_TOKEN):
            decoded_words.append(EOS_TOKEN)
            break
        else:
            decoded_words.append(output_lang.index2token(ni))

        # Next input is chosen word
        decoder_input = Variable(torch.LongTensor([ni]))
        if USE_CUDA: decoder_input = decoder_input.cuda()

    # Set back to training mode
    encoder.train(True)
    decoder.train(True)

    return decoded_words, decoder_attentions[:di+1, :len(encoder_outputs)]

#text+audio -> text+audio
#use prosodic encoder/decoder
def evaluate_audio(input_seq_tokens, input_prosody_tokens, input_lang, output_lang, encoder, decoder, max_length, audio_encode_only, USE_CUDA=False):
    input_word_seqs = [indexes_from_tokens(input_lang, input_seq_tokens)]
    input_prosody_seqs = [finalize_prosody_sequence(input_prosody_tokens)] #put the end token
    
    #make sure sequences are below max_length. 
    input_word_seqs = limit_seqs_to_max(input_word_seqs, max_length)
    input_prosody_seqs = limit_seqs_to_max(input_prosody_seqs, max_length)

    input_lengths = [len(input_word_seqs[0])]
    input_word_batch = Variable(torch.LongTensor(input_word_seqs)).transpose(0, 1)
    input_prosody_batch = Variable(torch.FloatTensor(input_prosody_seqs)).transpose(0, 1)
    
    if USE_CUDA:
        input_word_batch = input_batch.cuda()
        input_prosody_batch = input_prosody_batch.cuda()

    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)

    # Run through encoder
    encoder_outputs, encoder_hidden = encoder(input_word_batch, input_prosody_batch, input_lengths, None)

    # Create starting vectors for decoder
    decoder_word_input = Variable(torch.LongTensor([output_lang.token2index(SWT_TOKEN)])) # SOS
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))

    if USE_CUDA:
        decoder_word_input = decoder_word_input.cuda()
        decoder_context = decoder_context.cuda()

    # Store output words and attention states
    decoded_word_seq = []
    decoded_pauseflag_seq = []
    decoded_pausevalue_seq = []
    decoder_attentions = torch.zeros(max_length + 1, max_length + 1)

    # Run through decoder
    decoder_stop = False
    for di in range(max_length):
        if audio_encode_only:
            decoder_output_word, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_word_input, decoder_context, decoder_hidden, encoder_outputs)
        else:
#             decoder_output_word, decoder_output_pauseflag, decoder_output_pausevalue, decoder_context, decoder_hidden, decoder_attention = decoder(
#                 decoder_word_input, decoder_context, decoder_hidden, encoder_outputs
#             )  #FLAGTEST
            
            decoder_output_word, decoder_output_pauseflag, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_word_input, decoder_context, decoder_hidden, encoder_outputs
            )
        
        decoder_attentions[di,:decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data
            
        # Choose top word from output
        topv, topi = decoder_output_word.data.topk(1)
        ni_word = topi.item()
        if ni_word == output_lang.token2index(EOS_TOKEN):
            decoded_word_seq.append(EOS_TOKEN)
            decoder_stop = True
        else:
            decoded_word_seq.append(output_lang.index2token(ni_word))
        
        if not audio_encode_only:
            # Look at pauseflag output
            topv, topi = decoder_output_pauseflag.data.topk(1)
            ni_pauseflag = topi.item()
            decoded_pauseflag_seq.append(ni_pauseflag)

            # Look at pauseval output
            #predicted_pausevalue = decoder_output_pausevalue.item()
            #decoded_pausevalue_seq.append(unnormalize_value(predicted_pausevalue, 0.0, 10.0))
            #decoded_pausevalue_seq.append(predicted_pausevalue)

        # Next input is chosen word
        if decoder_stop:
            break
        else:
            decoder_word_input = Variable(torch.LongTensor([ni_word]))
            if USE_CUDA: decoder_word_input = decoder_word_input.cuda()

    # Set back to training mode
    encoder.train(True)
    decoder.train(True)

    #return decoded_word_seq, decoded_pauseflag_seq, decoded_pausevalue_seq, decoder_attentions[:di+1, :len(encoder_outputs)]  #FLAGTEST
    return decoded_word_seq, decoded_pauseflag_seq, decoder_attentions[:di+1, :len(encoder_outputs)]  #FLAGTEST

In [None]:
#Text -> text evaluator test
input_sentence = "he 's flying through that glass ?" 
input_seq_tokens = input_sentence.split()
decoded_words, attentions = evaluate_text(input_seq_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
print(readable_from_tokens(decoded_words[:-1]))
show_attention(input_sentence, decoded_words, attentions)
#print(attentions)

In [None]:
#show_attention(readable_from_tokens(in_sentence_tokens), translation_tokens, attentions)
print(attentions.shape)
print(attentions[0])
print(np.argmax(attentions[0]).item())

In [None]:
#Prosodic evaluator test
input_proscript = '/Users/alp/Movies/heroes/corpus/heroes_s2_7/spa-eng/segments_eng/heroes_s2_7_eng_aligned_eng0278.csv'
in_sentence_tokens, in_prosody_tokens = read_data_from_proscript(input_proscript, input_lang, n_prosody_params, input_prosody_params)

translation_tokens, pauseflag_tokens, attentions = evaluate_audio(in_sentence_tokens, in_prosody_tokens, input_lang, output_lang, prosodic_encoder, prosodic_decoder, max_seq_length, audio_encode_only=AUDIO_ENCODE_ONLY)
prosody_translation_string = print_tokens_with_pause(translation_tokens, pauseflags=pauseflag_tokens, ssml_format=True)
    
translation_tokens, _ = evaluate_text(in_sentence_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
text_translation_string = print_tokens_with_pause(translation_tokens)

print('---input audio---')
in_sentence_string = print_tokens_with_pause(in_sentence_tokens, in_prosody_tokens[:,0])
print(in_sentence_string)

print('---text translation---')
print(text_translation_string)

print('---prosodic translation---')
print(prosody_translation_string)

In [11]:
data_path = AUDIO_TEST_DATA_PATH
gen = audio_data_generator(AUDIO_TEST_DATA_PATH, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params)
print("Loaded", data_path)

Loaded /Users/alp/Movies/heroes/transProse_data/audiodata-v1/transProse_audiodata_test.txt


In [64]:
#Text+prosody -> text+prosody on audio data visualization
print_every = 1
print_count = 0
print_strings_for_prosody_evaluation = []
for in_sentence_tokens, in_prosody_tokens, out_sentence_tokens, out_prosody_tokens, in_csv, out_csv in gen:
    print(in_csv)
#     print(out_csv)
    
    in_string = print_tokens_with_pause(in_sentence_tokens, in_prosody_tokens[:,0], ssml_format=True, min_pause_duration = 0.05)
#     gt_string = print_tokens_with_pause(out_sentence_tokens, out_prosody_tokens[:,0], ssml_format=True, min_pause_duration = 0.05)
    
    translation_tokens, pauseflag_tokens, attentions = evaluate_audio(in_sentence_tokens, in_prosody_tokens, input_lang, output_lang, prosodic_encoder, prosodic_decoder, max_seq_length, audio_encode_only=AUDIO_ENCODE_ONLY)
    prosody_translation_string = print_tokens_with_pause(translation_tokens, pauseflags=pauseflag_tokens, ssml_format=True)

    translation_tokens, _ = evaluate_text(in_sentence_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
    text_translation_string = print_tokens_with_pause(translation_tokens)

#     print_string = '%s\t%s\t%s'%(in_string, gt_string, prosody_translation_string)
#     print_strings_for_prosody_evaluation.append(print_string)
    
    print_count += 1
    if print_every > 0 and  print_count % print_every == 0:
#         print("IN")
        print(in_string)
#         print("GT")
#         print(gt_string)
#         print("OUT PROSODY")
        print(prosody_translation_string)
#         print("OUT TEXT")
        print(text_translation_string)
        print("----------------")
        
#         inp = input("...")
#         if inp == 'q':
#             break
#         print("======================================================================")
    
# with open('prosody_eval.txt', 'w') as f:
#     for eval_str in print_strings_for_prosody_evaluation:
#         f.write(eval_str+'\n')

/Users/alp/Movies/heroes/corpus/heroes_s2_1/spa-eng/segments_eng/heroes_s2_1_eng_aligned_eng0012.csv




that they are bound together by a common purpose ? 
¿ que se van a hacer un propósito propósito ? 
¿ que se van a un propósito propósito ? 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_1/spa-eng/segments_eng/heroes_s2_1_eng_aligned_eng0137.csv
now <break time="1080ms"/> , who said this ? 
¿ quién ha dicho esto ? 
¿ quién ha dicho esto ? 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_1/spa-eng/segments_eng/heroes_s2_1_eng_aligned_eng0254.csv




why would you ask me that ? 
¿ por qué me preguntas eso ? 
¿ por qué me lo preguntas ? 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_1/spa-eng/segments_eng/heroes_s2_1_eng_aligned_eng0327.csv
don 't  go , please . 
no te vayas , por favor . 
no te vayas , por favor . 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_1/spa-eng/segments_eng/heroes_s2_1_eng_aligned_eng0361.csv
no <break time="660ms"/> . this can 't be . 
no <break time="0000ms"/> . esto no puede ser . 
no . no . esto no puede ser . 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_1/spa-eng/segments_eng/heroes_s2_1_eng_aligned_eng0386.csv
i suggest you <break time="410ms"/> disappear as well . 
te sugiero que desaparezcas en tu vida . 
te sugiero desaparecer como mejor . 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_1/spa-eng/segments_eng/heroes_s2_1_eng_aligned_eng0444.csv
my <break time="70ms"/> god my  god <break time="820ms"/> , what have  i <break time="120ms"/> done ? 

i <break time="190ms"/> thought you said you were taking care of me ? 
¿ te has dicho que ibas ? 
¿ te has dicho que estabas cuidando de mí ? 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_3/spa-eng/segments_eng/heroes_s2_3_eng_aligned_eng0324.csv
i 'm <break time="120ms"/> sorry <break time="60ms"/> . are <break time="80ms"/> you breathing ? 
lo siento . ¿ estás practicando ? 
lo siento . ¿ estás practicando ? 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_3/spa-eng/segments_eng/heroes_s2_3_eng_aligned_eng0329.csv
with <break time="110ms"/> my help <break time="480ms"/> , of course . 
con mi ayuda , por supuesto . 
con mi ayuda , claro . 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_3/spa-eng/segments_eng/heroes_s2_3_eng_aligned_eng0375.csv
so how would it feel then if i <break time="170ms"/> do this ? 
¿ y cómo se sentiría si lo hago ? 
¿ y cómo se sentiría si lo hago ? 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_4/spa-eng/segme

how we gonna do this ? 
¿ cómo vamos a hacer esto ? 
¿ cómo pasaremos esto ? 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_9/spa-eng/segments_eng/heroes_s2_9_eng_aligned_eng0151.csv
he 's adorable . 
es encantador . 
es encantador . 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_9/spa-eng/segments_eng/heroes_s2_9_eng_aligned_eng0182.csv
claire butler <break time="550ms"/> , bring  it on <break time="290ms"/> in 
¡ claire butler , venga ! 
¡ claire butler ! ¡ venga ! 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_9/spa-eng/segments_eng/heroes_s2_9_eng_aligned_eng0345.csv
those <break time="330ms"/> things are awesome . 
esas cosas son increíbles . 
esas cosas son increíbles . 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s2_9/spa-eng/segments_eng/heroes_s2_9_eng_aligned_eng0353.csv
please , don 't do this . 
por favor , no lo hagas . 
por favor , no lo haga . 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s3_1/spa-eng/segments

no <break time="570ms"/> , i heal , but i always feel everything . 
no <break time="0000ms"/> , me <break time="0000ms"/> curo , pero <break time="0000ms"/> siempre siento . 
no , curo , pero siempre me siento todo . 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s3_2/spa-eng/segments_eng/heroes_s3_2_eng_aligned_eng0094.csv
miss strauss ? 
¿ la srta . strauss . 
¿ la srta . strauss ? 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s3_2/spa-eng/segments_eng/heroes_s3_2_eng_aligned_eng0124.csv
you could teach me <break time="700ms"/> . you learned how to use your power and you fought him . 
podrías enseñarme . lo has hecho como usar tu poder y tú le has hecho . 
podrías enseñarme . has aprendido tu poder y lo has hecho . 
----------------
/Users/alp/Movies/heroes/corpus/heroes_s3_2/spa-eng/segments_eng/heroes_s3_2_eng_aligned_eng0127.csv
i can 't <break time="230ms"/> . not me . not now . 
no puedo . yo no puedo . 
no puedo . yo no puedo . 
----------------
/Users/alp/

In [None]:
#Text -> text on text data
def text_set_translation_generator(stop_at = -1, report=False):
    for in_sentence_tokens, out_sentence_tokens in text_data_generator(TEXT_TEST_DATA_PATH, input_lang, output_lang, stop_at):
        translation_tokens, _ = evaluate_text(in_sentence_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
        if report:
            print("> %s"%(readable_from_tokens(in_sentence_tokens)))
            print("= %s"%(readable_from_tokens(out_sentence_tokens)))
            print("< %s"%readable_from_tokens(translation_tokens[:-1]))
            print("---")
        
        yield [out_sentence_tokens], translation_tokens[:-1]
        
testing_set_bleu, sentence_count = compute_bleu(text_set_translation_generator(stop_at=10, report=True), max_order=4, smooth=False)

print("Evaluated %i samples."%sentence_count)
print("BLEU: ", testing_set_bleu)

In [19]:
#Text -> text on audio data
gold_sentence_tokens = []
predicted_sentence_tokens = []
in_sentences = []
def audio_set_text_translation_generator(evaluation_set, stop_at = -1, report=False, model='text'):
    for in_sentence_tokens, in_prosody_tokens , out_sentence_tokens, out_prosody_tokens, in_csv, out_csv in audio_data_generator(evaluation_set, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params, stop_at=stop_at):
        if model == 'text':
            translation_tokens, _ = evaluate_text(in_sentence_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
        elif model == 'audio':
            translation_tokens, _, _ = evaluate_audio(in_sentence_tokens, in_prosody_tokens, input_lang, output_lang, prosodic_encoder, prosodic_decoder, max_seq_length, AUDIO_ENCODE_ONLY)
        if report:
            print("> %s"%(readable_from_tokens(in_sentence_tokens)))
            print("= %s"%(readable_from_tokens(out_sentence_tokens)))
            print("< %s"%readable_from_tokens(translation_tokens[:-1]))
            print("---")
            
        in_sentences.append(readable_from_tokens(in_sentence_tokens))
        gold_sentence_tokens.append(out_sentence_tokens)
        predicted_sentence_tokens.append(translation_tokens[:-1])
        yield [out_sentence_tokens], translation_tokens[:-1]
             
#evaluation_set = AUDIO_PUNKPROSED_TEST_NEW_DATA_PATH
evaluation_set = AUDIO_TEST_DATA_PATH

testing_set_bleu, sentence_count = compute_bleu(
    audio_set_text_translation_generator(evaluation_set, report=False, stop_at=-1, model='audio'), 
    max_order=4, smooth=False)

print("Evaluated %i samples."%sentence_count)
print("BLEU: ", testing_set_bleu)

Evaluated 542 samples.
BLEU:  0.20721749327822875


In [28]:
in_sentences = []
for in_sentence_tokens, in_prosody_tokens , out_sentence_tokens, out_prosody_tokens, in_csv, out_csv in audio_data_generator(evaluation_set, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params, stop_at=-1):
    in_string = print_tokens_with_pause(in_sentence_tokens, in_prosody_tokens[:,0], ssml_format=False)
    in_sentences.append(in_string)



In [37]:
# def bleu_generator(pred_audio, gold):
def gen():
    yield [pred_audio], gold

compute_bleu(gen())

(0.0, 1)

In [43]:
for in_sent, pred_text, pred_audio, gold in zip(in_sentences, predicted_sentence_tokens_textmodel, predicted_sentence_tokens_audiomodel, gold_sentence_tokens):
    def gen2():
        yield [pred_text], gold
    def gen1():
        yield [pred_audio], gold
        
    bleu_text = compute_bleu(gen2())[0]
    bleu_audio = compute_bleu(gen1())[0]
    
    if not pred_text == pred_audio and '[P]' in in_sent and bleu_text < bleu_audio:
        print('IN: ', in_sent)
        print('TEXT: ', ' '.join(pred_text))
        print('AUDIO: ', ' '.join(pred_audio))
        print('GOLD: ', ' '.join(gold))
        
        print("-------------------------------------------------------------")

IN:  holy crap , it worked . this is it [P] , this is daphne 's [P] house . 
TEXT:  ¡ basura ! mierda , es la casa de daphne . ¡ es la casa de daphne !
AUDIO:  ¡ coño ! ¡ santo ! ¡ esto es la casa de daphne !
GOLD:  ha funcionado . es aquí . es la casa de daphne .
-------------------------------------------------------------
IN:  but , for now [P] , you just rest [P] . i 'll be right downstairs . 
TEXT:  pero ahora , descansa . enseguida . enseguida . enseguida .
AUDIO:  pero ahora , por ahora , descansa . enseguida . enseguida enseguida .
GOLD:  pero , por ahora , descansa . yo estaré en el salón .
-------------------------------------------------------------
IN:  that 's claire [P] and that 's hiro , hiding behind the ferns . 
TEXT:  claire y ese es hiro , ocultando detrás , escondido de las cometas .
AUDIO:  esa es claire y ese es hiro , ocultando detrás , detrás de las cometas .
GOLD:  esa es claire . y ese es hiro , detrás de los cristales .
---------------------------------------

In [None]:
#BLEU calculation on OpenNMT results
predictions_file = "/Users/alp/phdCloud/playground/OpenNMT-py/heroes_test_v2-pred.txt"
with open(predictions_file) as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
openNMT_predictions = [x.strip() for x in content]

def openNMT_translation_generator():
    for gold, pred in zip(gold_sentence_tokens, openNMT_predictions):
        print([gold])
        print(pred.split(" "))
        yield [gold], pred.split(" ")
        
openNMT_bleu, sentence_count = compute_bleu(openNMT_translation_generator(), max_order=4, smooth=False)

print("Evaluated %i samples."%sentence_count)
print("BLEU: ", openNMT_bleu)

In [None]:
#Create text translation data from compiled heroes data
output_file = "/Users/alp/phdCloud/playground/punkHeroes/out_heroes_v1/es_goldpuncd.txt"

input_lang.punctuation_level = 2
output_lang.punctuation_level = 2

with open(output_file, 'w') as f:
    for in_sentence_tokens, _ , out_sentence_tokens, _, _, _ in audio_data_generator(AUDIO_TEST_DATA_PATH, input_lang, output_lang, n_prosody_params, input_prosody_params, output_prosody_params):
        #to write tab separated en-es
        #f.write("%s\t%s\n"%(readable_from_tokens(in_sentence_tokens), readable_from_tokens(out_sentence_tokens)))
        #to write only en
        #f.write("%s\n"%(readable_from_tokens(in_sentence_tokens)))
        #to write only es
        f.write("%s\n"%(readable_from_tokens(out_sentence_tokens)))

In [None]:
#BLEU calculation from already translated text files
gold_file = "/Users/alp/phdCloud/playground/punkHeroes/out_heroes_v1/es_goldpuncd.txt"
predictions_file = "/Users/alp/phdCloud/playground/punkHeroes/out_heroes_v1/translated/en_punkProsed_ted_w_transProsed_5mmheroes_unpuncdinput_v1testset.txt"

def textfile_translation_generator(gold_file, predictions_file):
    gold_sentences = []
    predicted_sentences = []
    with open(gold_file, 'r') as f:
        for line in f:
            gold_sentences.append(line.strip().split(" "))
    with open(predictions_file, 'r') as f:
        for line in f:
            predicted_sentences.append(line.strip().split(" "))
    for gold, pred in zip(gold_sentences, predicted_sentences):       
        yield [gold], pred
        
bleu, sentence_count = compute_bleu(textfile_translation_generator(gold_file, predictions_file), max_order=4, smooth=False)

print("Evaluated %i samples."%sentence_count)
print("BLEU: ", bleu)

In [None]:
#Initialize text models COPY
text_encoder_path = 'models/5mmheroes_puncdinput_encoder.model'
text_decoder_path = 'models/5mmheroes_puncdinput_decoder.model'

text_encoder = GenericEncoder(input_lang.vocabulary_size, hidden_size, input_lang.get_weights_matrix(), n_layers)
text_decoder = LuongAttnDecoderRNN(attn_model, hidden_size, output_lang.get_weights_matrix(), output_lang.vocabulary_size, n_layers, input_feed=config['DECODER_INPUT_FEED'])
load_model(text_encoder, text_decoder, text_encoder_path, text_decoder_path, options.gpu2cpu)

In [None]:
#Transprosing text files
input_file = "/Users/alp/phdCloud/playground/punkHeroes/out_heroes_v1/en_punkProsed_ted_w.txt"
gold_file = "/Users/alp/phdCloud/playground/punkHeroes/out_heroes_v1/es_goldpuncd.txt"
output_file = "/Users/alp/phdCloud/playground/punkHeroes/out_heroes_v1/translated/en_punkProsed_ted_w_transProsed_5mmheroes_unpuncdinput_v1testset.txt"

stop_at = -1
report = False

gold_sentences = []
predicted_sentences = []
input_sentences = []

#read files
with open(gold_file, 'r') as f:
    for line in f:
        gold_sentences.append(line.strip().split(" "))
with open(input_file, 'r') as f:
    for line in f:
        input_sentences.append(line.strip().split(" "))
#translate
count = 0
for in_sentence_tokens, gold_tokens in zip(input_sentences, gold_sentences):
    translation_tokens, _ = evaluate_text(in_sentence_tokens, input_lang, output_lang, text_encoder, text_decoder, max_seq_length)
    predicted_sentences.append(translation_tokens)
    if report:
        print("> %s"%(readable_from_tokens(in_sentence_tokens)))
        print("= %s"%(readable_from_tokens(gold_tokens)))
        print("< %s"%readable_from_tokens(translation_tokens[:-1]))
        print("---")
    #yield [gold_tokens], translation_tokens

    count += 1
    if count == stop_at:
        break
    if count % 100 == 0:
        print(count)
            
#store translations in a text file
with open(output_file, 'w') as f:
    for token_index, script_tokens in enumerate(predicted_sentences):
        f.write("%s\n" % ' '.join(script_tokens[:-1]))