In [2]:
# Mount Google Drive into the Colab runtime at '/content/gdrive'.
# The data paths used by the final cell of this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import librosa
import numpy as np
from os import listdir, makedirs
from os.path import join, isfile, isdir
import pickle

In [4]:
## processed_data_loader.py ##

import os
from os import listdir
from os.path import isfile, join, isdir
import pickle

def load_data(relative_data_directory_path):
    """Load every pickle file found directly inside a directory.

    Only regular files are considered (sub-directories are skipped), and
    they are read in sorted file-name order.

    Args:
        relative_data_directory_path: Directory containing the pickle files.

    Returns:
        A list with one unpickled object per file, in sorted file-name order.
        An empty directory yields an empty list.

    Note:
        ``pickle.load`` can execute arbitrary code; only use this on trusted
        files.
    """
    # NOTE(review): earlier debug prints here indexed each loaded object as
    # data_list[0..5] for 'Data_v1.0' and data_list[0..2] for 'Data_v2.0' /
    # 'Data_v2.1' -- presumably 6 and 3 parallel lists respectively; confirm
    # against the data producer.
    pickle_names = sorted(
        name
        for name in listdir(relative_data_directory_path)
        if isfile(join(relative_data_directory_path, name))
    )

    total_data = []
    for pickle_name in pickle_names:
        with open(join(relative_data_directory_path, pickle_name), 'rb') as handle:
            total_data.append(pickle.load(handle))
    return total_data

def load_single_data(relative_data_directory_path, file_name):
    """Unpickle and return the contents of one file.

    Args:
        relative_data_directory_path: Directory that contains the file.
        file_name: Name of the pickle file inside that directory.

    Returns:
        The object stored in the pickle file.

    Note:
        ``pickle.load`` can execute arbitrary code; only use this on trusted
        files.
    """
    pickle_path = join(relative_data_directory_path, file_name)
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)

In [5]:
## utils.py ##

def read_script_files(script_dir_path):
    """Return the sorted names of the regular files directly inside a directory.

    Args:
        script_dir_path: Directory to list.

    Returns:
        A sorted list of file names (not full paths); sub-directories are
        excluded.
    """
    file_names = []
    for entry in listdir(script_dir_path):
        if isfile(join(script_dir_path, entry)):
            file_names.append(entry)
    return sorted(file_names)

def read_script_file_data(script_dir_path, script_file):
    """Read a script file and strip the first two whitespace-separated tokens of each line.

    Args:
        script_dir_path: Directory that contains the script file.
        script_file: Name of the script file inside that directory.

    Returns:
        A list with one string per line of the file. Each string is the
        line's tokens from the third one onward, joined by single spaces.
        Lines with two or fewer tokens (including blank lines) become ''.
    """
    # NOTE(review): the two dropped tokens are presumably per-line metadata
    # (e.g. timestamps or ids) -- confirm against the script file format.
    # Use a context manager so the file handle is closed deterministically.
    with open(join(script_dir_path, script_file), 'r') as script_handle:
        raw_lines = script_handle.readlines()
    return [' '.join(line.split()[2:]) for line in raw_lines]

def read_audio_files(audio_dir_path):
    """List the regular files directly inside a directory, sorted by name.

    Args:
        audio_dir_path: Directory to list.

    Returns:
        A sorted list of file names (not full paths); sub-directories are
        excluded.
    """
    def is_regular_file(name):
        return isfile(join(audio_dir_path, name))

    return sorted(filter(is_regular_file, listdir(audio_dir_path)))

In [6]:
## process_data.py ##

def _list_sorted_data_files(data_dir):
    """Return the sorted names of the regular files directly inside data_dir."""
    return sorted(f for f in listdir(data_dir) if isfile(join(data_dir, f)))


def _find_max_sampled_audio_len(data_dir, data_files, sampled_audios_idx):
    """Return the length of the longest sampled audio over all data files (0 if none)."""
    max_sampled_audio_len = 0
    for data_file in data_files:
        data = load_single_data(data_dir, data_file)
        for sampled_audio in data[sampled_audios_idx]:
            max_sampled_audio_len = max(max_sampled_audio_len, len(sampled_audio))
    return max_sampled_audio_len


def _zero_pad_v1_audios(sampled_audios, target_len):
    """Zero-pad each 'data_v1.0' sampled audio in place up to target_len samples."""
    for sampled_audio in sampled_audios:
        # Relies on each audio supporting list-style `.extend`.
        sampled_audio.extend([0] * (target_len - len(sampled_audio)))


def _zero_pad_v2_audios(sampled_audios, sample_rates, word_time_intervals, target_len):
    """Zero-pad each 'data_v2.0' sampled audio and append a matching silent interval."""
    for ii in range(len(sampled_audios)):
        # Add zero padding to 'sampled audio'
        len_zero_padding = target_len - len(sampled_audios[ii])
        sampled_audios[ii] = np.append(sampled_audios[ii], [0] * len_zero_padding)

        # Add silent part to 'word_time_interval': an empty-word entry that
        # starts at the last interval's end and lasts as long as the padding.
        added_time = round(len_zero_padding / float(sample_rates[ii]), 3)
        curr_end_time = word_time_intervals[ii][-1][-1]
        word_time_intervals[ii].append(["", curr_end_time, curr_end_time + added_time])


def audio_length_equalize_and_save(relative_data_directory_path, relative_save_data_directory_path):
    """Zero-pad every sampled audio to a common length and save the result.

    The data layout is selected from the last component of
    ``relative_data_directory_path`` (a trailing '/' is ignored):

    * ``'data_v1.0'``: sampled audios are read from index 3 of each loaded
      object and padded in place with ``.extend``.
    * ``'data_v2.0'``: sampled audios, sample rates and word time intervals
      are read from indices 0, 1 and 2. Each audio is padded with
      ``np.append`` and an ``["", end, end + padding_seconds]`` entry is
      appended to its word time intervals.

    All files are scanned once to find the maximum audio length, then each
    file is reloaded, padded and written through ``save_single_data``.

    Args:
        relative_data_directory_path: Directory holding the input pickle
            files; its last path component must be 'data_v1.0' or 'data_v2.0'.
        relative_save_data_directory_path: Directory the padded pickle files
            are written to.

    Raises:
        ValueError: If the directory name is neither 'data_v1.0' nor
            'data_v2.0'. Raised before any file is read.
    """
    # Strip trailing slashes so '.../data_v2.0/' is recognised like '.../data_v2.0'.
    data_type = relative_data_directory_path.rstrip('/').split('/')[-1]
    if data_type == 'data_v1.0':
        sampled_audios_idx = 3
        sample_rates_idx = None
        word_time_intervals_idx = None
    elif data_type == 'data_v2.0':
        sampled_audios_idx = 0
        sample_rates_idx = 1
        word_time_intervals_idx = 2
    else:
        raise ValueError('Unavailable data directory path for audio zero padding')

    # Load data
    data_files = _list_sorted_data_files(relative_data_directory_path)
    total_file_num = len(data_files)
    print('{} file loaded'.format(total_file_num))

    # Find max sampled audio length
    max_sampled_audio_len = _find_max_sampled_audio_len(
        relative_data_directory_path, data_files, sampled_audios_idx)

    # Pad every file's audios to the common length and save.
    # NOTE(review): output files are numbered by position in the
    # lexicographically sorted listing (a name ending in '_10' sorts before
    # '_2'), so output numbers need not match any number in the input name.
    for file_num, data_file in enumerate(data_files, start=1):
        print('Processing {}/{}'.format(file_num, total_file_num))
        data = load_single_data(relative_data_directory_path, data_file)
        sampled_audios = data[sampled_audios_idx]

        if data_type == 'data_v1.0':
            _zero_pad_v1_audios(sampled_audios, max_sampled_audio_len)
        else:
            word_time_intervals = data[word_time_intervals_idx]
            _zero_pad_v2_audios(
                sampled_audios, data[sample_rates_idx], word_time_intervals,
                max_sampled_audio_len)
            data[word_time_intervals_idx] = word_time_intervals

        data[sampled_audios_idx] = sampled_audios
        save_single_data(relative_save_data_directory_path, data, file_num)


def save_single_data(relative_save_data_directory_path, data, save_file_num):
    """Pickle ``data`` to 'senEM_preprocessed_<save_file_num>.pkl' in the save directory.

    The save directory (including missing parents) is created if needed.

    Args:
        relative_save_data_directory_path: Directory to write the file into.
        data: Any picklable object.
        save_file_num: Value formatted into the output file name.

    Returns:
        True once the file has been written.
    """
    # Create the directory without a separate existence check, so there is no
    # window between checking and creating.
    makedirs(relative_save_data_directory_path, exist_ok=True)

    # Save
    file_name = join(relative_save_data_directory_path,
                     'senEM_preprocessed_{}.pkl'.format(save_file_num))
    print('Saving {} data'.format(save_file_num))
    with open(file_name, 'wb') as f:
        pickle.dump(data, f)
    return True

In [7]:
## processed_data_saver.py ##

def save_data_v2_1(relative_data_directory_path, relative_save_data_directory_path):
    """Equalize audio lengths of the source data and save the result.

    Thin wrapper that forwards both paths unchanged to
    ``audio_length_equalize_and_save``.

    Args:
        relative_data_directory_path: Directory holding the input pickle files.
        relative_save_data_directory_path: Directory the padded files are
            written to.
    """
    audio_length_equalize_and_save(
        relative_data_directory_path,
        relative_save_data_directory_path,
    )

In [1]:
# Build the padded 'data_v2.1' dataset from 'data_v2.0' on the mounted Drive.
# NOTE(review): this cell's execution count is 1 and its recorded output shows a
# NameError, which suggests it was run on a kernel where not every definition
# cell above had been executed. Run all cells above first (Restart & Run All).
relative_data_directory_path = '/content/gdrive/My Drive/Speech2Pickup/data_v2.0'
relative_save_data_directory_path = '/content/gdrive/My Drive/Speech2Pickup/data_v2.1'
save_data_v2_1(relative_data_directory_path, relative_save_data_directory_path)

NameError: ignored

10 file loaded
