In [33]:
# Directory the notebook reads from: one file per split name plus 'dict.txt'.
original_token_dir = '../processed_data/lmd_ts_10k/ts1_new_melody_successive_1022_256_tokenization'
# Base output path; a variant suffix is appended to it to form real_output_dir.
output_dir = '../processed_data/lmd_ts_10k/ts1_new_melody_successive_1022_256_translation'

In [34]:
# File names inside original_token_dir; each is also reused as the output file prefix.
split_names = ('train', 'valid', 'test')

In [35]:
import os
import shutil

In [4]:
# B S O P D O P D
# B S O D O D -> P P
# B S O D - P

def split_sequence_pitch1(ls_list, min_target_len=None):
    """Split a token sequence into (source, target, alignment) for the "pitch1" layout.

    Tokens are classified by their first character. 'b', 's', 'o' and 'd'
    tokens are copied, in order, to the source sequence; 'p' tokens are copied
    to the target sequence. For every 'd' token, alignment pairs
    ``"<src_index>-<tgt_index>"`` link the most recent 'b', 's' and 'o' tokens
    and the 'd' token itself to the most recent 'p' token.

    An anchor that has not been seen yet is left out of the alignment instead
    of being written as index -1, so every emitted pair is a valid
    non-negative ``i-j`` pair.

    Args:
        ls_list: Iterable of token strings.
        min_target_len: If not None, sequences with fewer than this many
            target tokens are rejected.

    Returns:
        ``(src_list, tgt_list, align_list)``, or ``(None, None, None)`` when
        the source or target is empty or the target is shorter than
        ``min_target_len``.

    Raises:
        ValueError: If a token is empty or does not start with one of
            'b', 's', 'o', 'p', 'd'.
    """
    src_list = []
    tgt_list = []
    align_list = []

    # Source indices of the latest 'b'/'s'/'o' tokens and target index of the
    # latest 'p' token; -1 means "not seen yet".
    last_b = last_s = last_o = last_p = -1

    for token in ls_list:
        # Slicing (not indexing) so an empty token reaches the ValueError below.
        kind = token[:1]
        if kind == 'p':
            last_p = len(tgt_list)
            tgt_list.append(token)
        elif kind in ('b', 's', 'o', 'd'):
            position = len(src_list)
            src_list.append(token)
            if kind == 'b':
                last_b = position
            elif kind == 's':
                last_s = position
            elif kind == 'o':
                last_o = position
            elif last_p >= 0:
                # 'd' token: align every known anchor to the latest 'p'.
                for anchor in (last_b, last_s, last_o, position):
                    if anchor >= 0:
                        align_list.append('%d-%d' % (anchor, last_p))
        else:
            raise ValueError("Cannot process token: %s" % token)

    if len(tgt_list) == 0 or len(src_list) == 0:
        return None, None, None
    if min_target_len is not None and len(tgt_list) < min_target_len:
        return None, None, None

    return src_list, tgt_list, align_list

In [10]:
# B S O P D O P D
# B S O D O D -> P P
# O D - P

def split_sequence_pitch1p1(ls_list, min_target_len=None):
    """Split a token sequence into (source, target, alignment) for the "pitch1p1" layout.

    Same source/target split as the "pitch1" layout ('b', 's', 'o', 'd' tokens
    go to the source, 'p' tokens go to the target), but each 'd' token is
    aligned only from the most recent 'o' token and from the 'd' token itself
    to the most recent 'p' token.

    Anchors that have not been seen yet are omitted rather than written as
    index -1.

    Args:
        ls_list: Iterable of token strings.
        min_target_len: If not None, sequences with fewer than this many
            target tokens are rejected.

    Returns:
        ``(src_list, tgt_list, align_list)``, or ``(None, None, None)`` when
        the source or target is empty or the target is shorter than
        ``min_target_len``.

    Raises:
        ValueError: If a token is empty or does not start with one of
            'b', 's', 'o', 'p', 'd'.
    """
    src_list = []
    tgt_list = []
    align_list = []

    # -1 means the corresponding token kind has not been seen yet.
    last_o = -1
    last_p = -1

    for token in ls_list:
        # Slicing keeps an empty token from raising IndexError.
        kind = token[:1]
        if kind == 'p':
            last_p = len(tgt_list)
            tgt_list.append(token)
        elif kind in ('b', 's'):
            src_list.append(token)
        elif kind == 'o':
            last_o = len(src_list)
            src_list.append(token)
        elif kind == 'd':
            current_d = len(src_list)
            src_list.append(token)
            # Without a 'p' on the target side there is nothing to align to.
            if last_p >= 0:
                if last_o >= 0:
                    align_list.append('%d-%d' % (last_o, last_p))
                align_list.append('%d-%d' % (current_d, last_p))
        else:
            raise ValueError("Cannot process token: %s" % token)

    if len(tgt_list) == 0 or len(src_list) == 0:
        return None, None, None
    if min_target_len is not None and len(tgt_list) < min_target_len:
        return None, None, None

    return src_list, tgt_list, align_list

In [19]:
# B S O P D O P D
# B S O D O D -> B S O P D O P D
# B - B
# S - S
# O - O
# D - P
# D - D

def split_sequence_pitch2(ls_list, min_target_len=None):
    """Split a token sequence into (source, target, alignment) for the "pitch2" layout.

    The target is the full input sequence; the source is the same sequence
    with the 'p' tokens removed. 'b', 's' and 'o' tokens are aligned
    one-to-one with their copies in the target. Each 'd' token is aligned to
    the most recent 'p' token in the target and to its own copy in the target.

    If a 'd' token appears before any 'p' token, only the d-to-d pair is
    emitted, so no pair ever references index -1.

    Args:
        ls_list: Iterable of token strings.
        min_target_len: If not None, sequences whose target is not at least
            this many tokens longer than the source (i.e. that contain fewer
            'p' tokens than this) are rejected.

    Returns:
        ``(src_list, tgt_list, align_list)``, or ``(None, None, None)`` when
        the source or target is empty or the length check fails.

    Raises:
        ValueError: If a token is empty or does not start with one of
            'b', 's', 'o', 'p', 'd'.
    """
    src_list = []
    tgt_list = []
    align_list = []

    # Target index of the latest 'p' token; -1 until one is seen.
    last_p = -1

    for token in ls_list:
        # Slicing keeps an empty token from raising IndexError.
        kind = token[:1]
        if kind in ('b', 's', 'o'):
            align_list.append('%d-%d' % (len(src_list), len(tgt_list)))
            src_list.append(token)
            tgt_list.append(token)
        elif kind == 'p':
            last_p = len(tgt_list)
            tgt_list.append(token)
        elif kind == 'd':
            src_pos = len(src_list)
            if last_p >= 0:
                align_list.append('%d-%d' % (src_pos, last_p))
            align_list.append('%d-%d' % (src_pos, len(tgt_list)))
            src_list.append(token)
            tgt_list.append(token)
        else:
            raise ValueError("Cannot process token: %s" % token)

    if len(tgt_list) == 0 or len(src_list) == 0:
        return None, None, None
    # Target length minus source length equals the number of 'p' tokens.
    if min_target_len is not None and len(tgt_list) - len(src_list) < min_target_len:
        return None, None, None

    return src_list, tgt_list, align_list

In [23]:
# B S O P D O P D
# B S O D O D -> B P P
# B - B
# O D - P

def split_sequence_pitch3(ls_list, min_target_len=None):
    """Split a token sequence into (source, target, alignment) for the "pitch3" layout.

    'b' tokens are copied to both source and target and aligned one-to-one.
    's', 'o' and 'd' tokens go to the source only; 'p' tokens go to the target
    only. For every 'd' token, the most recent 'o' token and the 'd' token
    itself are aligned to the most recent 'p' token.

    Anchors that have not been seen yet are omitted rather than written as
    index -1.

    Args:
        ls_list: Iterable of token strings.
        min_target_len: If not None, sequences with fewer than this many 'p'
            tokens are rejected ('b' tokens in the target are not counted).

    Returns:
        ``(src_list, tgt_list, align_list)``, or ``(None, None, None)`` when
        the source or target is empty or there are too few 'p' tokens.

    Raises:
        ValueError: If a token is empty or does not start with one of
            'b', 's', 'o', 'p', 'd'.
    """
    src_list = []
    tgt_list = []
    align_list = []

    num_pitch = 0

    # -1 means the corresponding token kind has not been seen yet.
    last_o = -1
    last_p = -1

    for token in ls_list:
        # Slicing keeps an empty token from raising IndexError.
        kind = token[:1]
        if kind == 'b':
            align_list.append('%d-%d' % (len(src_list), len(tgt_list)))
            src_list.append(token)
            tgt_list.append(token)
        elif kind == 's':
            src_list.append(token)
        elif kind == 'o':
            last_o = len(src_list)
            src_list.append(token)
        elif kind == 'p':
            last_p = len(tgt_list)
            tgt_list.append(token)
            num_pitch += 1
        elif kind == 'd':
            current_d = len(src_list)
            src_list.append(token)
            # Without a 'p' on the target side there is nothing to align to.
            if last_p >= 0:
                if last_o >= 0:
                    align_list.append('%d-%d' % (last_o, last_p))
                align_list.append('%d-%d' % (current_d, last_p))
        else:
            raise ValueError("Cannot process token: %s" % token)

    if len(tgt_list) == 0 or len(src_list) == 0:
        return None, None, None
    if min_target_len is not None and num_pitch < min_target_len:
        return None, None, None

    return src_list, tgt_list, align_list

In [53]:
# B S O P D O P D
# B S P P -> O D O D
# B S P - O D

def split_sequence_duration1(ls_list, min_target_len=None):
    """Split a token sequence into (source, target, alignment) for the "duration1" layout.

    'b', 's' and 'p' tokens are copied to the source. 'o' tokens are not
    emitted directly: the most recent one is remembered and written to the
    target in front of every 'd' token, so the target is a series of
    ('o', 'd') pairs. Each such pair is aligned from the most recent 'b', 's'
    and 'p' source tokens, first to the 'o' copy and then to the 'd' token.

    Source anchors that have not been seen yet are omitted from the alignment
    rather than written as index -1.

    Args:
        ls_list: Iterable of token strings.
        min_target_len: If not None, sequences with fewer than this many
            ('o', 'd') target pairs are rejected.

    Returns:
        ``(src_list, tgt_list, align_list)``, or ``(None, None, None)`` when
        the source or target is empty or the target is too short.

    Raises:
        ValueError: If a token is empty or does not start with one of
            'b', 's', 'o', 'p', 'd', or if a 'd' token appears before any 'o'
            token (there would be no 'o' token to pair it with).
    """
    src_list = []
    tgt_list = []
    align_list = []

    # Source indices of the latest 'b'/'s'/'p' tokens; -1 means "not seen yet".
    last_b = last_s = last_p = -1
    # Latest 'o' token text, replayed into the target before each 'd' token.
    last_o_token = None

    for token in ls_list:
        # Slicing keeps an empty token from raising IndexError.
        kind = token[:1]
        if kind in ('b', 's', 'p'):
            position = len(src_list)
            src_list.append(token)
            if kind == 'b':
                last_b = position
            elif kind == 's':
                last_s = position
            else:
                last_p = position
        elif kind == 'o':
            last_o_token = token
        elif kind == 'd':
            if last_o_token is None:
                # Never put a placeholder into the token list: it would not be
                # a string and would break joining the tokens later.
                raise ValueError(
                    "Cannot process token: %s (no preceding 'o' token)" % token)
            current_o = len(tgt_list)
            tgt_list.append(last_o_token)
            current_d = len(tgt_list)
            tgt_list.append(token)

            for tgt_pos in (current_o, current_d):
                for anchor in (last_b, last_s, last_p):
                    if anchor >= 0:
                        align_list.append('%d-%d' % (anchor, tgt_pos))
        else:
            raise ValueError("Cannot process token: %s" % token)

    if len(tgt_list) == 0 or len(src_list) == 0:
        return None, None, None
    # Every 'd' contributes two target tokens, hence the factor of 2.
    if min_target_len is not None and len(tgt_list) < min_target_len * 2:
        return None, None, None

    return src_list, tgt_list, align_list

In [47]:
# B S O P D P D
# B S P P -> B S P O D P O D
# B - B
# S - S
# P - O
# P - P
# P - D

def split_sequence_duration2(ls_list, min_target_len=None):
    """Split a token sequence into (source, target, alignment) for the "duration2" layout.

    'b' and 's' tokens are copied to both source and target and aligned
    one-to-one. 'o' and 'p' tokens are only remembered. On every 'd' token the
    most recent 'p' token is appended to the source, and the triple
    (latest 'p', latest 'o', the 'd' token) is appended to the target; the
    source 'p' is aligned to all three target tokens.

    Args:
        ls_list: Iterable of token strings.
        min_target_len: If not None, sequences producing fewer than this many
            'd' triples are rejected.

    Returns:
        ``(src_list, tgt_list, align_list)``, or ``(None, None, None)`` when
        the source or target is empty or there are too few triples.

    Raises:
        ValueError: If a token is empty or does not start with one of
            'b', 's', 'o', 'p', 'd', or if a 'd' token appears before both an
            'o' and a 'p' token have been seen (the triple could not be built).
    """
    src_list = []
    tgt_list = []
    align_list = []

    # Latest 'o' / 'p' token texts, replayed when the next 'd' token arrives.
    last_o_token = None
    last_p_token = None

    for token in ls_list:
        # Slicing keeps an empty token from raising IndexError.
        kind = token[:1]
        if kind in ('b', 's'):
            align_list.append('%d-%d' % (len(src_list), len(tgt_list)))
            src_list.append(token)
            tgt_list.append(token)
        elif kind == 'o':
            last_o_token = token
        elif kind == 'p':
            last_p_token = token
        elif kind == 'd':
            if last_o_token is None or last_p_token is None:
                # Never put a None placeholder into the token lists: it would
                # break joining the tokens into a line later.
                raise ValueError(
                    "Cannot process token: %s (no preceding 'o' and 'p' tokens)" % token)

            current_p = len(src_list)
            src_list.append(last_p_token)

            # The three target tokens occupy consecutive positions.
            first_tgt = len(tgt_list)
            tgt_list.extend((last_p_token, last_o_token, token))
            for tgt_pos in range(first_tgt, first_tgt + 3):
                align_list.append('%d-%d' % (current_p, tgt_pos))
        else:
            raise ValueError("Cannot process token: %s" % token)

    if len(tgt_list) == 0 or len(src_list) == 0:
        return None, None, None
    # Each 'd' adds three target tokens but one source token: difference of 2.
    if min_target_len is not None and (len(tgt_list) - len(src_list)) // 2 < min_target_len:
        return None, None, None

    return src_list, tgt_list, align_list

In [24]:
from functools import partial

In [54]:
# Choose the conversion variant for this run. split_sequences() looks up both
# names below as globals, so re-run this cell before converting another variant.
split_sequence = partial(split_sequence_duration1, min_target_len=10)
# Output directory for this variant: the base output path plus a variant suffix.
real_output_dir = output_dir + '_duration1'

In [51]:
def split_sequences(original_file_path, output_prefix):
    """Convert one tokenized file into parallel ``.src`` / ``.tgt`` / ``.aln`` files.

    Every non-blank line of ``original_file_path`` is split into tokens and
    passed to the notebook-level ``split_sequence`` callable. Lines for which
    it returns ``None`` as the source list are counted and skipped; for the
    rest, the source tokens, target tokens and alignment pairs are each
    written as one space-joined line to
    ``<real_output_dir>/<output_prefix>.src``, ``.tgt`` and ``.aln``.

    Relies on the globals ``split_sequence`` and ``real_output_dir`` being
    defined before the call. Prints the number of converted lines (three
    times, once per output list) and the number of skipped lines.

    Args:
        original_file_path: Path of the UTF-8 text file to read, one sequence
            per line.
        output_prefix: File name stem (without extension) for the three
            output files.
    """
    invalid = 0
    src_lists = []
    tgt_lists = []
    align_lists = []
    with open(original_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # split() with no separator ignores leading/trailing whitespace and
            # collapses runs of whitespace, so doubled spaces or tabs cannot
            # produce empty tokens that the splitter would choke on.
            ls_list = line.split()
            if not ls_list:
                continue

            src_list, tgt_list, align_list = split_sequence(ls_list)

            if src_list is None:
                invalid += 1
                continue

            src_lists.append(' '.join(src_list) + '\n')
            tgt_lists.append(' '.join(tgt_list) + '\n')
            align_lists.append(' '.join(align_list) + '\n')

    # Normalize to forward slashes so paths look the same on every platform.
    output_file_src_path = os.path.join(real_output_dir, output_prefix + ".src").replace('\\', '/')
    output_file_tgt_path = os.path.join(real_output_dir, output_prefix + ".tgt").replace('\\', '/')
    output_file_align_path = os.path.join(real_output_dir, output_prefix + ".aln").replace('\\', '/')

    # All three files share one directory; create it if it does not exist yet.
    dirname = os.path.dirname(output_file_src_path)
    if dirname == '':
        dirname = '.'
    os.makedirs(dirname, exist_ok=True)

    print(len(src_lists), len(tgt_lists), len(align_lists))

    with open(output_file_src_path, 'w', encoding='utf-8') as f2:
        f2.writelines(src_lists)
    with open(output_file_tgt_path, 'w', encoding='utf-8') as f2:
        f2.writelines(tgt_lists)
    with open(output_file_align_path, 'w', encoding='utf-8') as f2:
        f2.writelines(align_lists)
    print('Done', '%d lines skipped' % invalid)

In [55]:
# Convert each split file, writing <split_name>.src/.tgt/.aln into real_output_dir.
for split_name in split_names:
    original_file_path = os.path.join(original_token_dir, split_name)
    split_sequences(original_file_path, split_name)
# Copy dict.txt alongside the converted files. As the cell's last expression,
# shutil.copy's return value (the destination path) is shown as the cell output.
shutil.copy(os.path.join(original_token_dir, 'dict.txt'), real_output_dir)

11105 11105 11105
Done 293 lines skipped
3705 3705 3705
Done 106 lines skipped
3639 3639 3639
Done 92 lines skipped


'../processed_data/lmd_ts_10k/ts1_new_melody_successive_1022_256_translation_duration1\\dict.txt'

In [28]:
ls_list = 'b-0 s-7 o-0 p-43 d-8 p-59 d-24 p-62 d-8 o-8 p-50 d-8 p-71 d-8 o-16 p-55 d-8 p-69 d-8 o-24 p-50 d-8 p-67 d-8 b-0 s-7 o-0 p-43 d-8 p-59 d-24 p-62 d-20 o-8 p-50 d-8 o-16 p-55 d-8 o-24 p-50 d-8 p-62 d-4 o-28 p-62 d-4 b-0 s-7 o-0 p-43 d-8 p-59 d-24 p-62 d-8 o-8 p-50 d-8 p-71 d-8 o-16 p-55 d-8 p-69 d-8 o-24 p-50 d-8 p-67 d-8 b-0 s-7 o-0 p-48 d-8 p-60 d-24 p-64 d-24 o-8 p-52 d-8 o-16 p-55 d-8 o-24 p-52 d-8 b-0 s-7 o-0 p-45 d-8 p-60 d-24 p-64 d-8 o-8 p-52 d-8 p-72 d-8 o-16 p-57 d-8 p-71 d-8 o-24 p-52 d-8 p-69 d-8 b-0 s-7 o-0 p-50 d-8 p-62 d-24 p-66 d-24 o-8 p-54 d-8 o-16 p-57 d-8 o-24 p-54 d-8 b-0 s-7 o-0 p-50 d-8 p-66 d-16 p-74 d-8 o-8 p-54 d-8 p-74 d-8 o-16 p-57 d-8 p-66 d-16 p-72 d-8 o-24 p-50 d-8 p-69 d-8 b-0 s-7 o-0 p-43 d-8 p-67 d-24 p-71 d-24 o-8 p-50 d-8 o-16 p-55 d-8 o-24 p-50 d-8 b-0 s-7 o-0 p-43 d-8 p-59 d-24 p-62 d-8 o-8 p-50 d-8 p-71 d-8 o-16 p-55 d-8 p-69 d-8 o-24 p-50 d-8 p-67 d-8 b-0 s-7 o-0 p-43 d-8 p-59 d-24 p-62 d-20 o-8 p-50 d-8 o-16 p-55 d-8 o-24 p-50 d-8 p-62 d-4 o-28 p-62 d-4 b-0 s-7 o-0 p-43 d-8 p-59 d-24 p-62 d-8 o-8 p-50 d-8 p-71 d-8 o-16 p-55 d-8 p-69 d-8 o-24 p-50 d-8 p-67 d-8 b-0 s-7 o-0 p-48 d-8 p-60 d-24 p-64 d-24 o-8 p-52 d-8 o-16 p-55 d-8 o-24 p-52 d-8 b-0 s-7 o-0 p-45 d-8 p-60 d-24 p-64 d-8 o-8 p-52 d-8 p-72 d-8 o-16 p-57 d-8 p-71 d-8 o-24 p-52 d-8 p-69 d-8 b-0 s-7 o-0 p-47 d-8 p-67 d-16 p-74 d-8 o-8 p-50 d-8 p-74 d-8 o-16 p-55 d-8 p-67 d-16 p-74 d-8 o-24 p-50 d-8 p-74 d-8 b-0 s-7 o-0 p-45 d-8 p-66 d-16 p-76 d-8 o-8 p-50 d-8 p-74 d-8 o-16 p-54 d-8 p-66 d-16 p-72 d-8 o-24 p-50 d-8 p-69 d-8 b-0 s-7 o-0 p-43 d-8 p-62 d-16 p-67 d-16 o-8 p-50 d-8 o-16 p-55 d-16 p-67 d-16 p-74 d-16 b-0 s-7 o-0 p-43 d-20 p-67 d-8 p-71 d-8 o-8 p-67 d-8 p-71 d-8 o-16 p-67 d-16 p-71 d-16 o-24 p-50 d-8 b-0 s-7 o-0 p-55 d-20 p-67 d-8 p-71 d-8 o-8 p-67 d-8 p-71 d-8 o-16 p-67 d-16 p-71 d-16 o-24 p-50 d-8 b-0 s-7 o-0 p-43 d-8 p-67 d-16 p-71 d-8 o-8 p-50 d-8 p-74 d-8 o-16 p-48 d-16 p-52 d-16 p-64 d-16 p-67 d-12 o-28 p-69 d-4 b-0 s-7 o-0 p-43 d-8 p-62 d-24 p-71 
d-24 o-8 p-50 d-8 o-16 p-55 d-16 b-0 s-7 o-0 p-48 d-8 p-64 d-8 p-72 d-8 o-8 p-52 d-8 p-64 d-8 p-72 d-8 o-16 p-55 d-8 p-64 d-16 p-72 d-12 o-24 p-52 d-8 o-28 p-72 d-4 b-0 s-7 o-0 p-47 d-8 p-62 d-16 p-72 d-8 o-8 p-50 d-8 p-71 d-8 o-16 p-55 d-8 p-62 d-16 p-71 d-8 o-24 p-50 d-8 p-71 d-4 o-28 p-71 d-4 b-0 s-7 o-0 p-45 d-8 p-61 d-16 p-71 d-8 o-8 p-52 d-8 p-69 d-8 o-16 p-57 d-8 p-61 d-16 p-69 d-8 o-24 p-52 d-8 p-71 d-8 b-0 s-7 o-0 p-50 d-8 p-62 d-16 p-69 d-16 o-8 p-54 d-8 o-16 p-57 d-8 p-66 d-16 p-74 d-16 o-24 p-54 d-8'.split(' ')

In [29]:
# Scratch check: run the pitch3 variant on the sample token list `ls_list`.
src_list, tgt_list, align_list = split_sequence_pitch3(ls_list)

In [30]:
# Inspect the source-side tokens produced for the sample.
print(src_list)

['b-0', 's-7', 'o-0', 'd-8', 'd-24', 'd-8', 'o-8', 'd-8', 'd-8', 'o-16', 'd-8', 'd-8', 'o-24', 'd-8', 'd-8', 'b-0', 's-7', 'o-0', 'd-8', 'd-24', 'd-20', 'o-8', 'd-8', 'o-16', 'd-8', 'o-24', 'd-8', 'd-4', 'o-28', 'd-4', 'b-0', 's-7', 'o-0', 'd-8', 'd-24', 'd-8', 'o-8', 'd-8', 'd-8', 'o-16', 'd-8', 'd-8', 'o-24', 'd-8', 'd-8', 'b-0', 's-7', 'o-0', 'd-8', 'd-24', 'd-24', 'o-8', 'd-8', 'o-16', 'd-8', 'o-24', 'd-8', 'b-0', 's-7', 'o-0', 'd-8', 'd-24', 'd-8', 'o-8', 'd-8', 'd-8', 'o-16', 'd-8', 'd-8', 'o-24', 'd-8', 'd-8', 'b-0', 's-7', 'o-0', 'd-8', 'd-24', 'd-24', 'o-8', 'd-8', 'o-16', 'd-8', 'o-24', 'd-8', 'b-0', 's-7', 'o-0', 'd-8', 'd-16', 'd-8', 'o-8', 'd-8', 'd-8', 'o-16', 'd-8', 'd-16', 'd-8', 'o-24', 'd-8', 'd-8', 'b-0', 's-7', 'o-0', 'd-8', 'd-24', 'd-24', 'o-8', 'd-8', 'o-16', 'd-8', 'o-24', 'd-8', 'b-0', 's-7', 'o-0', 'd-8', 'd-24', 'd-8', 'o-8', 'd-8', 'd-8', 'o-16', 'd-8', 'd-8', 'o-24', 'd-8', 'd-8', 'b-0', 's-7', 'o-0', 'd-8', 'd-24', 'd-20', 'o-8', 'd-8', 'o-16', 'd-8', 'o-2

In [31]:
# Inspect the target-side tokens produced for the sample.
print(tgt_list)

['b-0', 'p-43', 'p-59', 'p-62', 'p-50', 'p-71', 'p-55', 'p-69', 'p-50', 'p-67', 'b-0', 'p-43', 'p-59', 'p-62', 'p-50', 'p-55', 'p-50', 'p-62', 'p-62', 'b-0', 'p-43', 'p-59', 'p-62', 'p-50', 'p-71', 'p-55', 'p-69', 'p-50', 'p-67', 'b-0', 'p-48', 'p-60', 'p-64', 'p-52', 'p-55', 'p-52', 'b-0', 'p-45', 'p-60', 'p-64', 'p-52', 'p-72', 'p-57', 'p-71', 'p-52', 'p-69', 'b-0', 'p-50', 'p-62', 'p-66', 'p-54', 'p-57', 'p-54', 'b-0', 'p-50', 'p-66', 'p-74', 'p-54', 'p-74', 'p-57', 'p-66', 'p-72', 'p-50', 'p-69', 'b-0', 'p-43', 'p-67', 'p-71', 'p-50', 'p-55', 'p-50', 'b-0', 'p-43', 'p-59', 'p-62', 'p-50', 'p-71', 'p-55', 'p-69', 'p-50', 'p-67', 'b-0', 'p-43', 'p-59', 'p-62', 'p-50', 'p-55', 'p-50', 'p-62', 'p-62', 'b-0', 'p-43', 'p-59', 'p-62', 'p-50', 'p-71', 'p-55', 'p-69', 'p-50', 'p-67', 'b-0', 'p-48', 'p-60', 'p-64', 'p-52', 'p-55', 'p-52', 'b-0', 'p-45', 'p-60', 'p-64', 'p-52', 'p-72', 'p-57', 'p-71', 'p-52', 'p-69', 'b-0', 'p-47', 'p-67', 'p-74', 'p-50', 'p-74', 'p-55', 'p-67', 'p-74', 'p-50

In [32]:
# Inspect the "<src_index>-<tgt_index>" alignment pairs produced for the sample.
print(align_list)

['0-0', '2-1', '3-1', '2-2', '4-2', '2-3', '5-3', '6-4', '7-4', '6-5', '8-5', '9-6', '10-6', '9-7', '11-7', '12-8', '13-8', '12-9', '14-9', '15-10', '17-11', '18-11', '17-12', '19-12', '17-13', '20-13', '21-14', '22-14', '23-15', '24-15', '25-16', '26-16', '25-17', '27-17', '28-18', '29-18', '30-19', '32-20', '33-20', '32-21', '34-21', '32-22', '35-22', '36-23', '37-23', '36-24', '38-24', '39-25', '40-25', '39-26', '41-26', '42-27', '43-27', '42-28', '44-28', '45-29', '47-30', '48-30', '47-31', '49-31', '47-32', '50-32', '51-33', '52-33', '53-34', '54-34', '55-35', '56-35', '57-36', '59-37', '60-37', '59-38', '61-38', '59-39', '62-39', '63-40', '64-40', '63-41', '65-41', '66-42', '67-42', '66-43', '68-43', '69-44', '70-44', '69-45', '71-45', '72-46', '74-47', '75-47', '74-48', '76-48', '74-49', '77-49', '78-50', '79-50', '80-51', '81-51', '82-52', '83-52', '84-53', '86-54', '87-54', '86-55', '88-55', '86-56', '89-56', '90-57', '91-57', '90-58', '92-58', '93-59', '94-59', '93-60', '95-6