In [30]:
import subprocess
import ast
import os
import json
import sys
import ast
import glob

In [31]:
#for reading desired phrase structure
def parse_tokens_from_phrase_structure(phrase_stucture_file):
    tokens = []
    with open(phrase_stucture_file, 'r') as f:
        for l in f:
            fields = l.split('\t')
            ph_tokens = ast.literal_eval(fields[3])
            tokens.extend(ph_tokens)
    return tokens

#run espeak with mbrola dict to get phonetic trancription of word
def get_phonetic_transcription(word):
    command = 'espeak -v mb-es1 -q --pho "' + word + '"'
    cmd_tokens = command.split()
    output = subprocess.check_output(cmd_tokens)
    output_str = output.decode()
    phon_seq = ''.join([l.split('\t')[0] for l in output_str.split('\n') if l and not l.startswith('_')])
    return phon_seq

In [32]:
#PREPARE PHONETIC LEXICON
#builds a phonetic lexicon out of the words in the structure files in given directory
phonetic_dict = {}

mt_output_dir = '/home/alp/Documents/playground-linux/SugarDub/mt_output'

for root, direc, files in os.walk(mt_output_dir):
    for file in files:
        if file.endswith('structure.txt'):
            print(file)
            words = parse_tokens_from_phrase_structure(os.path.join(root,file))
            for word in words:
                if not word in phonetic_dict.keys():
                    phonetic_dict[word] = get_phonetic_transcription(word)
            

s2_10_0080.translation.structure.txt
s3_2_0019.translation.structure.txt
s2_1_0282.translation.structure.txt
s2_11_0166.translation.structure.txt
s3_2_0168.translation.structure.txt
s3_1_0106.translation.structure.txt
s2_3_0060.translation.structure.txt
s3_11_0082.translation.structure.txt
s2_4_0406.translation.structure.txt
s3_8_0118.translation.structure.txt
s3_16_0084.translation.structure.txt
s2_4_0099.translation.structure.txt
s3_6_0093.translation.structure.txt
s3_10_0372.translation.structure.txt
s2_4_0169.translation.structure.txt
s2_8_0177.translation.structure.txt
s2_3_0204.translation.structure.txt
s2_1_0231.translation.structure.txt
s2_7_0168.translation.structure.txt
s3_6_0155.translation.structure.txt
s3_1_0054.translation.structure.txt
s3_10_0434.translation.structure.txt
s3_10_0017.translation.structure.txt
s2_1_0208.translation.structure.txt
s3_1_0183.translation.structure.txt
s2_1_0350.translation.structure.txt
s3_6_0108.translation.structure.txt
s2_9_0084.translation

In [33]:
#write dict to json file
with open('heroes_eval_phonetic_dict.txt', 'w') as file:
     file.write(json.dumps(phonetic_dict, indent=4)) # use `json.loads` to do the reverse

In [7]:
#read dict from file
with open('heroes_eval_phonetic_dict.txt', 'r') as file:
     phonetic_dict = json.load(file)

In [35]:
def generate_pho(transcription_file, output_file):
    with open(transcription_file, 'r') as f:
        transcription = f.read()
        
    command = 'espeak -v mb-es1 -q --pho --phonout=' + output_file + ' -f ' + transcription_file
    cmd_tokens = command.split()
    output = subprocess.check_output(cmd_tokens)
    output_str = output.decode()
    return output_str

In [36]:
#CREATE DEFAULT PHO FILES
mt_output_dir = '/home/alp/Documents/playground-linux/SugarDub/mt_output/'
default_pho_dir = '/home/alp/Documents/playground-linux/SugarDub/default_pho/'

for root, direc, files in os.walk(mt_output_dir):
    for file in files:
        if file.endswith('transcript.txt'):
            print(file)
            file_id = file.split('.')[0]
            
            pho_out = os.path.join(default_pho_dir, file_id + '.default.pho')
            
            generate_pho(os.path.join(root,file), pho_out)
            

s2_10_0128.translation.transcript.txt
s2_8_0159.translation.transcript.txt
s3_16_0308.translation.transcript.txt
s2_1_0268.translation.transcript.txt
s2_11_0135.translation.transcript.txt
s2_9_0066.translation.transcript.txt
s2_10_0032.translation.transcript.txt
s3_10_0210.translation.transcript.txt
s2_5_0195.translation.transcript.txt
s2_7_0027.translation.transcript.txt
s2_11_0398.translation.transcript.txt
s2_1_0346.translation.transcript.txt
s2_11_0306.translation.transcript.txt
s3_6_0261.translation.transcript.txt
s3_9_0223.translation.transcript.txt
s2_3_0289.translation.transcript.txt
s3_2_0119.translation.transcript.txt
s2_6_0184.translation.transcript.txt
s3_10_0333.translation.transcript.txt
s3_6_0111.translation.transcript.txt
s3_8_0216.translation.transcript.txt
s2_4_0061.translation.transcript.txt
s3_16_0367.translation.transcript.txt
s3_2_0074.translation.transcript.txt
s2_2_0206.translation.transcript.txt
s3_6_0020.translation.transcript.txt
s3_6_0312.translation.transcr

In [37]:
#FUNCTIONS FOR SYNTHESIS BENDING
#main
def bend_pho_to_structure(phrase_structure_file, default_pho_file, output_pho_file, verbose=False):
    phrase_structure = parse_phrase_structure(phrase_structure_file)
    default_phoneme_data = parse_pho(default_pho_file)

    phoneme_seq = get_phoneme_seq(default_phoneme_data)

    if verbose: print(phoneme_seq)

    desired_phrase_info = []
    phrase_boundaries = []
    phrase_durations = []
    phrase_pausings = []
    search_index = 0
    for phrase_tokens, start_time, end_time, pause_after in phrase_structure:

        beginning_index = search_index
        desired_phrase_duration = (end_time - start_time)* 1000	#desired duration
        desired_pause = int(pause_after * 1000)

        if verbose: 
            print("====================")
            print('beginning_index', beginning_index)
        for token_index, token in enumerate(phrase_tokens):
            phoneme_rep = morpheme2phoneme(token, phonetic_dict)

            word_begin = phoneme_seq.find(phoneme_rep, beginning_index)
            if token_index == 0:
                beginning_index = word_begin

            word_end = word_begin + len(phoneme_rep)
            word_length = word_end - word_begin
            if verbose: print("%s %i - %i\t %s"%(token, word_begin, word_end, ''.join(phoneme_seq[word_begin: word_end])))
            search_index = word_end

        default_phrase_duration = get_duration_of_interval(default_phoneme_data, beginning_index, word_end)

        bend_ratio = default_phrase_duration/desired_phrase_duration

        desired_phrase_info.append(((beginning_index, word_end), bend_ratio, desired_pause))
        if verbose: 
            print("====================")
            print("interval", (beginning_index, word_end))
            print("ratio", bend_ratio)
            print("====================\n")
    if verbose: 
        print("--------------------====================--------------------")
        print(desired_phrase_info)


    #bend the durations of the phrases and form the desired phoneme data
    bent_phoneme_data = []
    for (beginning_index, end_index), bend_ratio, pause_after in desired_phrase_info:
        for phoneme_info in default_phoneme_data[beginning_index:end_index]:
            if verbose: print(phoneme_info)
            new_duration = int(float(phoneme_info[1]) / bend_ratio)
            new_phoneme_info = (phoneme_info[0], '%i'%new_duration, phoneme_info[2])
            if verbose: print(new_phoneme_info)
            bent_phoneme_data.append(new_phoneme_info)

        pause_info = ('_', pause_after, [])
        bent_phoneme_data.append(pause_info)
        if verbose: print(pause_info)

    #write bent_phoneme_data to pho file
    with open(output_pho_file, 'w') as f:
        for i, phoneme_info in enumerate(bent_phoneme_data):
            f.write("%s\t%s\t%s"%(phoneme_info[0], phoneme_info[1], ' '.join(phoneme_info[2])))
            if not i == len(bent_phoneme_data):
                f.write("\n")
                
#for reading desired phrase structure
def parse_phrase_structure(phrase_stucture_file):
    phrase_structure = []
    with open(phrase_stucture_file, 'r') as f:
        for l in f:
            fields = l.split('\t')
            start_time = float(fields[0])
            end_time = float(fields[1])
            pause_after = float(fields[2])
            tokens = ast.literal_eval(fields[3])
            phrase_structure.append((tokens, start_time, end_time, pause_after))
    return phrase_structure


#pho file parser for reading default phoneme timings
def parse_pho(pho_file):
    phoneme_data = []
    with open(pho_file, 'r') as f:	
        for l in f:
            l_elems = l.split()
            if l_elems:
                if len(l_elems) == 2:
                    phoneme_data.append((l_elems[0], l_elems[1], []))
                else:
                    phoneme_data.append((l_elems[0], l_elems[1], l_elems[2:]))

    return phoneme_data

def get_phoneme_seq(phoneme_data):
    phoneme_list = []
    for phon_info in phoneme_data:
        phoneme_list.append(phon_info[0])
    return ''.join(phoneme_list)

def morpheme2phoneme(morpheme, phonetic_dict):
    try:
        return phonetic_dict[morpheme]
    except:
        print("%s not in dict"%morpheme) #TODO
        return 0

def get_duration_of_interval(phoneme_data, start_index, end_index):
    duration = 0
    for phoneme_info in phoneme_data[start_index: end_index]:
        duration += int(phoneme_info[1])
    return duration


In [38]:
#bend sample
sample_id = 's3_9_0061'
phrase_structure_file = 'mt_output/' + sample_id + '.translation.structure.txt'
default_pho_file = 'default_pho/' + sample_id + '.default.pho'
output_pho_file = 'bent_pho/' + sample_id + '.bent.pho'
bend_pho_to_structure(phrase_structure_file, default_pho_file, output_pho_file, verbose=True)

bamos__bamos____
beginning_index 0
vamos 0 - 5	 bamos
vamos 0 - 5	 bamos
interval (0, 5)
ratio 0.03936279547790339

[((0, 5), 0.03936279547790339, 0)]
('b', '65', [])
('b', '1651', [])
('a', '67', ['0', '104', '26', '90', '53', '83', '66', '83', '80', '86', '100', '86'])
('a', '1702', ['0', '104', '26', '90', '53', '83', '66', '83', '80', '86', '100', '86'])
('m', '65', [])
('m', '1651', [])
('o', '46', ['0', '88', '20', '90', '40', '92', '59', '94', '80', '96', '100', '96'])
('o', '1168', ['0', '88', '20', '90', '40', '92', '59', '94', '80', '96', '100', '96'])
('s', '140', [])
('s', '3556', [])
('_', 0, [])


In [39]:
#BEND ALL
mt_output_dir = '/home/alp/Documents/playground-linux/SugarDub/mt_output/'
default_pho_dir = '/home/alp/Documents/playground-linux/SugarDub/default_pho/'
bent_pho_dir = '/home/alp/Documents/playground-linux/SugarDub/bent_pho/'

for root, direc, files in os.walk(mt_output_dir):
    for file in files:
        if file.endswith('structure.txt'):
            
            file_id = file.split('.')[0]
            print(file_id)
            
            phrase_structure_file = os.path.join(mt_output_dir, file)
            default_pho_file = os.path.join(default_pho_dir, file_id + '.default.pho')
            bent_pho_out = os.path.join(bent_pho_dir, file_id + '.bent.pho')
                                            
            bend_pho_to_structure(phrase_structure_file, default_pho_file, bent_pho_out, verbose=False)                                


s2_10_0080
s3_2_0019
s2_1_0282
s2_11_0166
s3_2_0168
s3_1_0106
s2_3_0060
s3_11_0082
s2_4_0406
s3_8_0118
s3_16_0084
s2_4_0099
s3_6_0093
s3_10_0372
s2_4_0169
s2_8_0177
s2_3_0204
s2_1_0231
s2_7_0168
s3_6_0155
s3_1_0054
s3_10_0434
s3_10_0017
s2_1_0208
s3_1_0183
s2_1_0350
s3_6_0108
s2_9_0084
s2_11_0152
s2_9_0310
s3_10_0243
s2_2_0113
s2_1_0071
s2_3_0257
s3_8_0094
s2_11_0214
s2_3_0318
s3_2_0284
s3_1_0056
s2_4_0393
s3_6_0261
s3_16_0141
s2_11_0360
s2_5_0189
s2_11_0096
s2_11_0127
s3_16_0377
s3_8_0145
s3_8_0272
s2_11_0270
s2_4_0388
s3_2_0110
s3_9_0051
s2_6_0254
s2_4_0054
s2_1_0159
s3_9_0263
s2_9_0079
s3_16_0312
s3_16_0291
s3_9_0268
s3_12_0085
s3_14_0033
s2_11_0040
s2_1_0366
s2_4_0462
s3_6_0059
s3_2_0336
s3_16_0116
s2_6_0047
s2_6_0323
s3_10_0330
s3_12_0130
s3_16_0139
s2_2_0206
s3_10_0227
s3_8_0046
s2_4_0328
s2_8_0072
s3_6_0334
s2_8_0305
s3_10_0082
s3_9_0265
s2_2_0160
s2_11_0071
s3_10_0334
s2_4_0433
s3_1_0050
s2_1_0141
s2_5_0109
s2_1_0082
s2_8_0133
s2_7_0089
s3_9_0175
s3_8_0370
s2_10_0251
s3_12_0094

In [46]:
#Functions for synthesis
def synthesize_pho(pho_file, output_wav):
    command = 'mbrola ./es1/es1 ' + pho_file + ' ' + output_wav
    cmd_tokens = command.split()
    output = subprocess.check_output(cmd_tokens)
    return output.decode()

In [48]:
out = synthesize_pho('bent_pho/s3_6_0096.bent.pho', 'bent_synth/s3_6_0096.bent.wav')

CalledProcessError: Command '['mbrola', './es1/es1', 'bent_pho/s3_6_0096.bent.pho', 'bent_synth/s3_6_0096.bent.wav']' returned non-zero exit status -6

In [49]:
out

NameError: name 'out' is not defined

In [None]:
#SYNTHESIZE ALL BENT FILES
bent_pho_dir = '/home/alp/Documents/playground-linux/SugarDub/bent_pho/'

for root, direc, files in os.walk(bent_pho_dir):
    for file in files:
            
        file_id = file.split('.')[0]
        print(file_id)
        
        