In [11]:
# Mount Google Drive at /content/drive so the dataset folders used in the
# cells below are reachable; force_remount=True asks Colab for a fresh mount.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
! pip install librosa==0.8.0

In [3]:
import librosa
import numpy as np
import pickle
import os
from os import listdir, makedirs
from os.path import isfile, join, isdir

In [4]:
"""
utils.py
"""

def read_script_files(script_dir_path):
    """Return the sorted names of the regular files directly inside a directory.

    Args:
        script_dir_path: Directory to scan (not searched recursively).

    Returns:
        list of str: File names (not full paths) in ascending order.
        Entries that are not regular files, e.g. sub-directories, are skipped.
    """
    return sorted(
        entry
        for entry in listdir(script_dir_path)
        if isfile(join(script_dir_path, entry))
    )

def read_script_file_data(script_dir_path, script_file):
    """Read a script file and drop the first two whitespace-separated fields of every line.

    Each line is split on whitespace, its first two tokens are discarded
    (presumably an id / timing prefix -- confirm against the script format),
    and the remaining words are re-joined with single spaces.

    Args:
        script_dir_path: Directory that contains the script file.
        script_file: Name of the file inside ``script_dir_path``.

    Returns:
        list of str: One entry per line of the file. Lines with two or fewer
        tokens (including blank lines) become the empty string.
    """
    # The context manager closes the handle even if reading fails; the
    # previous version left the file open.
    with open(join(script_dir_path, script_file), 'r') as script:
        return [' '.join(line.split()[2:]) for line in script]

In [5]:
"""
processed_data_loader.py
"""

def load_single_data(relative_data_directory_path, file_name):
    """Unpickle and return the object stored in a single data file.

    Args:
        relative_data_directory_path: Directory that contains the pickle file.
        file_name: Name of the pickle file inside that directory.

    Returns:
        Whatever object was pickled into the file.
    """
    pickle_path = join(relative_data_directory_path, file_name)
    # NOTE(review): pickle.load can execute arbitrary code -- only point this
    # at files you produced yourself.
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)

In [6]:
"""
process_data.py
"""

def return_mel_spec_single_channel(sampled_audio, sample_rate, n_fft, hop_length, win_length, n_mels, window='hann', log_scale=True):
    """Compute a mel spectrogram of an audio signal with librosa.

    Args:
        sampled_audio: Audio samples, passed to librosa as ``y``.
        sample_rate: Sampling rate of ``sampled_audio``, passed as ``sr``.
        n_fft: FFT size.
        hop_length: Hop between successive frames, in samples.
        win_length: Analysis window length, in samples.
        n_mels: Number of mel bands.
        window: Window specification forwarded to librosa (default ``'hann'``).
        log_scale: When truthy, the spectrogram is passed through
            ``librosa.power_to_db`` before being returned.

    Returns:
        The array produced by ``librosa.feature.melspectrogram`` (presumably
        shaped ``(n_mels, n_frames)`` -- see the librosa docs), optionally
        converted with ``power_to_db``.
    """
    mel_power = librosa.feature.melspectrogram(
        y=sampled_audio,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        n_mels=n_mels,
    )
    return librosa.power_to_db(mel_power) if log_scale else mel_power

def return_mel_spec_three_channel(sampled_audio, sample_rate, n_fft, hop_length, win_length, n_mels, window='hann', log_scale=True):
    """Compute a mel spectrogram stacked with two successive delta features.

    The spectrogram is computed exactly as in the single-channel variant, then
    ``librosa.feature.delta`` is applied to it once, and once more to that
    result. The three arrays are stacked along a new trailing axis.

    Args:
        sampled_audio: Audio samples, passed to librosa as ``y``.
        sample_rate: Sampling rate of ``sampled_audio``, passed as ``sr``.
        n_fft: FFT size.
        hop_length: Hop between successive frames, in samples.
        win_length: Analysis window length, in samples.
        n_mels: Number of mel bands.
        window: Window specification forwarded to librosa (default ``'hann'``).
        log_scale: When truthy, the spectrogram is passed through
            ``librosa.power_to_db`` before the deltas are taken.

    Returns:
        numpy.ndarray: ``np.stack`` of (spectrogram, first delta, second
        delta) along the last axis, i.e. the input shape plus a trailing
        dimension of size 3.
    """
    base_spec = librosa.feature.melspectrogram(
        y=sampled_audio,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        n_mels=n_mels,
    )
    if log_scale:
        base_spec = librosa.power_to_db(base_spec)

    # Both derivatives use the same delta settings; the second one is the
    # delta of the first, not librosa's order=2 delta of the spectrogram.
    delta_options = dict(width=5, axis=-1, mode='interp')
    first_delta = librosa.feature.delta(base_spec, **delta_options)
    second_delta = librosa.feature.delta(first_delta, **delta_options)

    return np.stack((base_spec, first_delta, second_delta), axis=-1)

def word_interval_continualize(len_mel_spec, sample_rate, hop_length, word_time_interval, word_dictionary, word_dictionary_size):
    """Expand per-word time intervals into a frame-by-frame label matrix.

    Every entry of ``word_time_interval`` is converted from times to frame
    indices (one frame lasts ``hop_length / sample_rate``) and the columns of
    that frame range are multiplied by the word's dictionary vector. All
    columns from the largest end index onwards are set to the blank (``''``)
    vector.

    Args:
        len_mel_spec: Number of frames (columns) in the output.
        sample_rate: Sampling rate of the audio the intervals refer to.
        hop_length: Hop length, in samples, of the matching spectrogram.
        word_time_interval: Iterable of records indexed as
            ``(word, start_time, end_time)``. Times are presumably in
            seconds -- TODO confirm against the data producer.
        word_dictionary: Mapping from word to its vector of length
            ``word_dictionary_size``. Must contain the blank key ``''``.
            If it contains ``'OOV'``, that vector is used for unknown words.
        word_dictionary_size: Number of rows in the output.

    Returns:
        numpy.ndarray: Float array of shape
        ``(word_dictionary_size, len_mel_spec)``.

    Raises:
        KeyError: If a word is missing from ``word_dictionary`` and the
            dictionary has no ``'OOV'`` entry, or if ``''`` is missing.

    NOTE(review): the matrix starts as all ones, so frames before the last
    word end that no interval covers keep an all-ones column, and frames
    covered by intervals of two different one-hot words end up all zeros.
    This is unchanged from the previous behaviour -- confirm it is intended.
    """
    frame_time_interval = hop_length / float(sample_rate)
    continuous_word_interval = np.ones((word_dictionary_size, len_mel_spec))

    # Frame index where the trailing blank region begins. Starting at 0 means
    # an utterance without any word interval is labelled blank throughout
    # (previously this case crashed on an unbound variable).
    blank_start_index = 0
    for word_time in word_time_interval:
        word = word_time[0]
        # Map unknown words to the dictionary's OOV entry when there is one.
        if word not in word_dictionary and 'OOV' in word_dictionary:
            word = 'OOV'
        word_vector = np.asarray(word_dictionary[word])[:, np.newaxis]

        curr_word_start_index = int(word_time[1] / frame_time_interval)
        curr_word_end_index = int(word_time[2] / frame_time_interval)
        continuous_word_interval[:, curr_word_start_index:curr_word_end_index] *= word_vector

        # Use the largest end index so that out-of-order intervals cannot
        # cause later words to be overwritten with blank.
        blank_start_index = max(blank_start_index, curr_word_end_index)

    blank_vector = np.asarray(word_dictionary[''])[:, np.newaxis]
    continuous_word_interval[:, blank_start_index:] = blank_vector
    return continuous_word_interval

def make_word_dictionary(relative_script_directory_path):
    """Build a one-hot vector for every word found in the script files.

    All files returned by ``read_script_files`` are read with
    ``read_script_file_data``; the whitespace-separated words of every line
    are collected, two sentinel tokens are added (``""`` for blank and
    ``"OOV"`` for out-of-vocabulary), and the sorted vocabulary is one-hot
    encoded.

    Args:
        relative_script_directory_path: Directory that contains the script files.

    Returns:
        tuple: ``(word_dictionary, word_dictionary_size)`` where
        ``word_dictionary`` maps each word to a 1-D float array of length
        ``word_dictionary_size`` holding a single 1 at the word's position in
        the sorted vocabulary.
    """
    # Check whole word candidates
    script_files = read_script_files(relative_script_directory_path)
    total_words = set()
    for num, script_file in enumerate(script_files, start=1):
        print('Processing {}/{}'.format(num, len(script_files)))
        for curr_file_line in read_script_file_data(relative_script_directory_path, script_file):
            total_words.update(curr_file_line.split())

    # Add the sentinels to the set (not to a list) so a script that happens to
    # contain the literal token "OOV" cannot create a duplicate entry, which
    # would make the reported size larger than the number of dictionary keys.
    total_words.update(("", "OOV"))  # blank, out of vocabulary
    total_words = sorted(total_words)

    # One-hot encoding
    word_dictionary_size = len(total_words)
    word_dictionary = dict()
    for i, word in enumerate(total_words):
        one_hot = np.zeros(word_dictionary_size)
        one_hot[i] = 1
        word_dictionary[word] = one_hot
    return word_dictionary, word_dictionary_size


In [7]:
# Set word dictionary (separated from the blocks below).
# The path is absolute, like the other Drive paths in this notebook, so this
# cell does not depend on the kernel's working directory being /content.
relative_script_directory_path = '/content/drive/MyDrive/Speech2Pickup/train_script'
word_dic, word_dic_size = make_word_dictionary(relative_script_directory_path)

Processing 1/478
Processing 2/478
Processing 3/478
Processing 4/478
Processing 5/478
Processing 6/478
Processing 7/478
Processing 8/478
Processing 9/478
Processing 10/478
Processing 11/478
Processing 12/478
Processing 13/478
Processing 14/478
Processing 15/478
Processing 16/478
Processing 17/478
Processing 18/478
Processing 19/478
Processing 20/478
Processing 21/478
Processing 22/478
Processing 23/478
Processing 24/478
Processing 25/478
Processing 26/478
Processing 27/478
Processing 28/478
Processing 29/478
Processing 30/478
Processing 31/478
Processing 32/478
Processing 33/478
Processing 34/478
Processing 35/478
Processing 36/478
Processing 37/478
Processing 38/478
Processing 39/478
Processing 40/478
Processing 41/478
Processing 42/478
Processing 43/478
Processing 44/478
Processing 45/478
Processing 46/478
Processing 47/478
Processing 48/478
Processing 49/478
Processing 50/478
Processing 51/478
Processing 52/478
Processing 53/478
Processing 54/478
Processing 55/478
Processing 56/478
P

In [8]:
"""
processed_data_saver.py
"""

def save_data_v2_2(relative_data_directory_path, relative_script_directory_path, relative_save_data_directory_path, word_dic, word_dic_size, mel_feature_type):
    """Convert every pickled data file into per-sample ``.npz`` feature files.

    For each sample in each input file a mel feature and a frame-level word
    label are computed and written with ``np.savez`` to
    ``<save dir>/senEM_preprocessed_<k>.npz``, where ``k`` counts samples
    across all files starting at 1. The arrays are saved positionally, mel
    feature first and word label second.

    Args:
        relative_data_directory_path: Directory holding the input pickle files.
        relative_script_directory_path: Not used by this function.
        relative_save_data_directory_path: Existing directory that receives
            the ``.npz`` files; it is not created here.
        word_dic: Word-to-vector mapping passed to ``word_interval_continualize``.
        word_dic_size: Length of the vectors in ``word_dic``.
        mel_feature_type: ``'single'`` or ``'three'``; selects the mel
            feature extractor.

    Raises:
        AssertionError: If ``mel_feature_type`` is not one of the two values.
    """
    assert mel_feature_type in ['single', 'three']

    # Input files, processed in sorted name order
    data_files = sorted(
        entry
        for entry in listdir(relative_data_directory_path)
        if isfile(join(relative_data_directory_path, entry))
    )

    # Spectrogram configuration
    n_fft = 2048
    hop_length = int(n_fft/8)
    win_length = int(n_fft/2)
    n_mels = 40

    # Positions of the parallel per-sample sequences inside a loaded record
    sampled_audios_idx, sample_rates_idx, word_time_intervals_idx = 0, 1, 2

    # The output directory is expected to exist already: because of Google
    # Drive's sync delay it is prepared by hand rather than created here.

    # The extractor depends only on mel_feature_type, so pick it once.
    if mel_feature_type == 'single':
        extract_mel = return_mel_spec_single_channel
    else:  # 'three', guaranteed by the assert above
        extract_mel = return_mel_spec_three_channel

    num_total_data_count = 0
    for file_number, data_file in enumerate(data_files, start=1):
        print('Processing {}/{}'.format(file_number, len(data_files)))
        data = load_single_data(relative_data_directory_path, data_file)
        num_data = len(data[0])
        for ii in range(num_data):
            num_total_data_count += 1
            sample_rate = data[sample_rates_idx][ii]

            mel_spec = extract_mel(
                sampled_audio=data[sampled_audios_idx][ii],
                sample_rate=sample_rate,
                n_fft=n_fft,
                hop_length=hop_length,
                win_length=win_length,
                n_mels=n_mels,
            )

            # Axis 1 of the mel feature is used as the number of frames.
            word_label = word_interval_continualize(
                len_mel_spec=mel_spec.shape[1],
                sample_rate=sample_rate,
                hop_length=hop_length,
                word_time_interval=data[word_time_intervals_idx][ii],
                word_dictionary=word_dic,
                word_dictionary_size=word_dic_size,
            )

            save_file_name = relative_save_data_directory_path + '/senEM_preprocessed_{}.npz'.format(num_total_data_count)
            np.savez(save_file_name, mel_spec, word_label)
        print('Finished processing {} data'.format(num_data))

In [9]:
# Save data_v2.2_single_channel
# Reads every pickle under data_v2.1 and writes one .npz per sample holding the
# single-channel mel feature and its frame-level word label.
# NOTE: the output folder must already exist on Drive; save_data_v2_2 does not create it.

relative_data_directory_path = '/content/drive/MyDrive/Speech2Pickup/data_v2.1'
relative_save_data_directory_path = '/content/drive/MyDrive/Speech2Pickup/data_v2.2_single_channel'
save_data_v2_2(relative_data_directory_path, relative_script_directory_path, relative_save_data_directory_path, word_dic, word_dic_size, mel_feature_type='single')

Processing 1/40
Finished processing 1250 data
Processing 2/40
Finished processing 1250 data
Processing 3/40
Finished processing 1250 data
Processing 4/40
Finished processing 1250 data
Processing 5/40
Finished processing 87 data
Processing 6/40
Finished processing 87 data
Processing 7/40
Finished processing 87 data
Processing 8/40
Finished processing 88 data
Processing 9/40
Finished processing 1250 data
Processing 10/40
Finished processing 1250 data
Processing 11/40
Finished processing 1250 data
Processing 12/40
Finished processing 1250 data
Processing 13/40
Finished processing 1250 data
Processing 14/40
Finished processing 1250 data
Processing 15/40
Finished processing 1250 data
Processing 16/40
Finished processing 1250 data
Processing 17/40
Finished processing 1250 data
Processing 18/40
Finished processing 1250 data
Processing 19/40
Finished processing 1250 data
Processing 20/40
Finished processing 1250 data
Processing 21/40
Finished processing 1250 data
Processing 22/40
Finished proc

In [16]:
# Save data_v2.2_three_channel
# Same conversion as the single-channel run, but each .npz holds the mel feature
# stacked with its two delta channels.
# NOTE: the output folder must already exist on Drive; save_data_v2_2 does not create it.

relative_data_directory_path = '/content/drive/MyDrive/Speech2Pickup/data_v2.1'
relative_save_data_directory_path = '/content/drive/MyDrive/Speech2Pickup/data_v2.2_three_channel'
save_data_v2_2(relative_data_directory_path, relative_script_directory_path, relative_save_data_directory_path, word_dic, word_dic_size, mel_feature_type='three')

Processing 1/40
Finished processing 1250 data
Processing 2/40
Finished processing 1250 data
Processing 3/40
Finished processing 1250 data
Processing 4/40
Finished processing 1250 data
Processing 5/40
Finished processing 87 data
Processing 6/40
Finished processing 87 data
Processing 7/40
Finished processing 87 data
Processing 8/40
Finished processing 88 data
Processing 9/40
Finished processing 1250 data
Processing 10/40
Finished processing 1250 data
Processing 11/40
Finished processing 1250 data
Processing 12/40
Finished processing 1250 data
Processing 13/40
Finished processing 1250 data
Processing 14/40
Finished processing 1250 data
Processing 15/40
Finished processing 1250 data
Processing 16/40
Finished processing 1250 data
Processing 17/40
Finished processing 1250 data
Processing 18/40
Finished processing 1250 data
Processing 19/40
Finished processing 1250 data
Processing 20/40
Finished processing 1250 data
Processing 21/40
Finished processing 1250 data
Processing 22/40
Finished proc

In [18]:
# Sanity check on the three-channel output: parse the numeric suffix of every
# saved file name ('..._<k>.npz' -> k), sort the values, and show the largest.
a = listdir('/content/drive/MyDrive/Speech2Pickup/data_v2.2_three_channel')
indexs = sorted(int(a_i.split('.')[0].split('_')[-1]) for a_i in a)
indexs[-1]

40698

In [19]:
# Number of saved files; it equals the largest index when no index is missing.
len(indexs)

40698

In [None]:
# Inspect a slice of the sorted indices.
# NOTE(review): this displays up to 5000 values; a shorter slice would keep the output readable.
indexs[30000:35000]