In [1]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used by
# later cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
from os import listdir, makedirs
from os.path import join, isfile, isdir
import pickle

In [3]:
"""
processed_data_loader.py
"""

def load_single_data(relative_data_directory_path, file_name):
    """Unpickle one data file and return its contents.

    Args:
        relative_data_directory_path: Directory that contains the file.
        file_name: Name of the pickle file inside that directory.

    Returns:
        Whatever object was pickled into the file.
    """
    file_path = join(relative_data_directory_path, file_name)
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(file_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [8]:
"""
process_data.py
"""

def _list_data_files(data_directory_path):
    """Return the sorted names of the regular files directly inside the directory."""
    return sorted(f for f in listdir(data_directory_path)
                  if isfile(join(data_directory_path, f)))


def _find_max_audio_len(data_directory_path, data_files, sampled_audios_idx):
    """Return the length of the longest sampled audio over all files (0 if there is none)."""
    max_sampled_audio_len = 0
    for data_file in data_files:
        data = load_single_data(data_directory_path, data_file)
        for sampled_audio in data[sampled_audios_idx]:
            max_sampled_audio_len = max(max_sampled_audio_len, len(sampled_audio))
    return max_sampled_audio_len


def _zero_pad_audio(sampled_audio, target_len):
    """Return the audio as a flat array, right-padded with zeros up to target_len samples."""
    audio = np.ravel(np.asarray(sampled_audio))
    len_zero_padding = max(target_len - len(sampled_audio), 0)
    # Pad with zeros of the audio's own dtype. np.append(audio, [0] * n) would
    # upcast (e.g. int16 -> int64, and -> float64 when n == 0), leaving clips
    # of one dataset with different dtypes.
    return np.concatenate((audio, np.zeros(len_zero_padding, dtype=audio.dtype)))


def audio_length_equalize_and_save(relative_data_directory_path, relative_save_data_directory_path):
    """Zero-pad every sampled audio to the dataset-wide maximum length and save it.

    The data layout is selected by the last component of
    `relative_data_directory_path`:

    * 'data_v1.0': sampled audios are stored at index 3 of each file's data.
    * 'data_v2.0': sampled audios at index 0, sample rates at index 1 and word
      time intervals at index 2. Besides padding the audio, one extra
      ["", start, end] interval is appended per clip to cover the padded
      (silent) tail, where `end` is the padded length divided by the clip's
      sample rate, rounded to 3 decimals.

    Every file is read twice (once to find the maximum length, once to pad) so
    that only one file has to be held in memory at a time.

    Args:
        relative_data_directory_path: Directory holding the input pickle files.
            A trailing '/' is tolerated.
        relative_save_data_directory_path: Directory the padded data is written
            to via `save_single_data`.

    Raises:
        ValueError: If the directory name is neither 'data_v1.0' nor 'data_v2.0'.
    """
    # rstrip so that '.../data_v1.0/' is recognised the same as '.../data_v1.0'.
    data_type = relative_data_directory_path.rstrip('/').split('/')[-1]
    if data_type == 'data_v1.0':
        sampled_audios_idx = 3
        sample_rates_idx = None
        word_time_intervals_idx = None
    elif data_type == 'data_v2.0':
        sampled_audios_idx = 0
        sample_rates_idx = 1
        word_time_intervals_idx = 2
    else:
        raise ValueError('Unavailable data directory path for audio zero padding')
    has_word_time_intervals = word_time_intervals_idx is not None

    # Load data
    data_files = _list_data_files(relative_data_directory_path)
    total_file_num = len(data_files)
    print('{} file loaded'.format(total_file_num))

    # Find max sampled audio length
    max_sampled_audio_len = _find_max_audio_len(
        relative_data_directory_path, data_files, sampled_audios_idx)

    # Pad every clip (and extend the word intervals for 'data_v2.0')
    for file_idx, data_file in enumerate(data_files):
        print('Processing {}/{}'.format(file_idx + 1, total_file_num))
        data = load_single_data(relative_data_directory_path, data_file)
        sampled_audios = data[sampled_audios_idx]
        if has_word_time_intervals:
            sampled_rates = data[sample_rates_idx]
            word_time_intervals = data[word_time_intervals_idx]

        for ii in range(len(sampled_audios)):
            # Add zero padding to 'sampled audio'
            sampled_audios[ii] = _zero_pad_audio(sampled_audios[ii], max_sampled_audio_len)

            if has_word_time_intervals:
                # Add silent part to 'word_time_interval'
                fixed_end_time = round(max_sampled_audio_len / float(sampled_rates[ii]), 3)
                intervals = word_time_intervals[ii]
                # A clip without any interval has no previous end time; start
                # the silent interval at 0.0 instead of raising IndexError.
                curr_end_time = intervals[-1][-1] if len(intervals) > 0 else 0.0
                intervals.append(["", curr_end_time, fixed_end_time])

        data[sampled_audios_idx] = sampled_audios
        if has_word_time_intervals:
            data[word_time_intervals_idx] = word_time_intervals

        save_single_data(relative_save_data_directory_path, data, file_idx + 1, data_type)


def save_single_data(relative_save_data_directory_path, data, save_file_num, data_type, distrib_num_file=4):
    """Split one loaded data file into several pickle files and save them.

    The first `distrib_num_file - 1` output files each receive
    `len(data[0]) // distrib_num_file` consecutive samples; the last file
    receives all remaining samples. Output files are named
    'senEM_preprocessed_<k>.pkl', with <k> running from
    `distrib_num_file * (save_file_num - 1) + 1` to
    `distrib_num_file * save_file_num`.

    Args:
        relative_save_data_directory_path: Output directory; created if missing.
        data: Sequence of parallel per-sample fields. The first 6 fields are
            saved for 'data_v1.0', the first 3 for 'data_v2.0'.
        save_file_num: 1-based number of the source file; determines the
            numbering of the output files.
        data_type: Either 'data_v1.0' or 'data_v2.0'.
        distrib_num_file: Number of output files per source file (default 4).

    Returns:
        True once every chunk has been written.

    Raises:
        ValueError: If `data_type` is unknown or `distrib_num_file` < 1. Both
            are checked before anything is created on disk.
    """
    # Validate first, so a bad call does not leave a new directory or an
    # empty .pkl file behind.
    num_fields_by_type = {'data_v1.0': 6, 'data_v2.0': 3}
    if data_type not in num_fields_by_type:
        raise ValueError('Unavailable data directory path for saving data')
    if distrib_num_file < 1:
        raise ValueError('distrib_num_file must be at least 1')
    num_fields = num_fields_by_type[data_type]

    # Check directory to save data (exist_ok avoids the isdir/makedirs race)
    makedirs(relative_save_data_directory_path, exist_ok=True)

    # Save
    num_data = len(data[0])
    each_num_data = num_data // distrib_num_file
    first_file_num = distrib_num_file * (save_file_num - 1) + 1
    for chunk_idx in range(distrib_num_file):
        start = chunk_idx * each_num_data
        # The last chunk is open-ended so it also takes the division remainder.
        is_last_chunk = chunk_idx == distrib_num_file - 1
        end = None if is_last_chunk else start + each_num_data

        file_num = first_file_num + chunk_idx
        file_name = join(relative_save_data_directory_path,
                         'senEM_preprocessed_{}.pkl'.format(file_num))
        print('Saving {} data'.format(file_num))
        with open(file_name, 'wb') as f:
            pickle.dump([data[field_idx][start:end] for field_idx in range(num_fields)], f)
    return True

In [9]:
"""
processed_data_saver.py
"""

def save_data_v1_1(relative_data_directory_path, relative_save_data_directory_path):
    """Run the audio length equalization on one data directory and save the result.

    Thin wrapper: both arguments are forwarded unchanged to
    `audio_length_equalize_and_save`.
    """
    audio_length_equalize_and_save(
        relative_data_directory_path,
        relative_save_data_directory_path,
    )

In [10]:
# Save data_v1.1: equalize the audio lengths of data_v1.0 and write the result
# to the data_v1.1 folder.
# NOTE: despite the 'relative_' prefix, both paths are absolute Google Drive paths.
relative_data_directory_path = '/content/drive/MyDrive/Speech2Pickup/data_v1.0'
relative_save_data_directory_path = '/content/drive/MyDrive/Speech2Pickup/data_v1.1'
save_data_v1_1(relative_data_directory_path, relative_save_data_directory_path)

10 file loaded
Processing 1/10
Saving 1 data
Saving 2 data
Saving 3 data
Saving 4 data
Processing 2/10
Saving 5 data
Saving 6 data
Saving 7 data
Saving 8 data
Processing 3/10
Saving 9 data
Saving 10 data
Saving 11 data
Saving 12 data
Processing 4/10
Saving 13 data
Saving 14 data
Saving 15 data
Saving 16 data
Processing 5/10
Saving 17 data
Saving 18 data
Saving 19 data
Saving 20 data
Processing 6/10
Saving 21 data
Saving 22 data
Saving 23 data
Saving 24 data
Processing 7/10
Saving 25 data
Saving 26 data
Saving 27 data
Saving 28 data
Processing 8/10
Saving 29 data
Saving 30 data
Saving 31 data
Saving 32 data
Processing 9/10
Saving 33 data
Saving 34 data
Saving 35 data
Saving 36 data
Processing 10/10
Saving 37 data
Saving 38 data
Saving 39 data
Saving 40 data


In [12]:
# Debugging aid: reload one of the saved chunks and dump its six fields,
# followed by the number of samples it holds.
relative_data_directory_path = '/content/drive/MyDrive/Speech2Pickup/data_v1.1'
file_name = 'senEM_preprocessed_15.pkl'
with open(join(relative_data_directory_path, file_name), 'rb') as f:
    data = pickle.load(f)

# The data_v1.x files carry six parallel fields (indices 0-5).
for field_idx in range(6):
    print(data[field_idx])
print(len(data[0]))

[472, 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 473, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 474, 475, 475, 475, 475, 475, 475, 475, 475]
[[150.0, 214.0], [150.0, 214.0], [150.0, 214.0], [150.0, 214.0], [150.0, 214.0], [150.0, 214.0], [189.0, 167.0], [189.0, 167.0], [113.0, 145.0], [189.0, 167.0], [189.0, 167.0], [189.0, 167.0], [189.0, 167.0], [123.0, 64.0], [123.0, 64.0], [123.0, 64.0], [123.0, 64.0], [123.0, 64.0], [123.0, 64.0], [123.0, 64.0], [110.0, 137.0], [122.0, 63.0], [122.0, 63.0], [122.0, 63.0], [122.0, 63.0], [122.0, 63.0], [122.0, 63.0], [122.0, 63.0], [188.0, 169.0], [188.0, 169.0], [188.0, 169.0], [110.0, 137.0], [188.0, 169.0], [188.0, 169.0], [188.0, 169.0], [188.0, 169.0], [