In [1]:
# Mount Google Drive inside the Colab runtime; the data paths used later in
# this notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
% pip install librosa==0.8.0



In [1]:
import numpy as np
from os import listdir, remove, makedirs
from os.path import isfile, join, isdir
from scipy import misc as misc
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal as mulnorm
import librosa
import pickle
import random

In [4]:
"""
processed_data_loader.py
"""

def load_single_data(relative_data_directory_path, file_name):
    """Unpickle one data file and return whatever object it contains.

    Args:
        relative_data_directory_path: Directory that holds the file.
        file_name: Name of the pickle file inside that directory.

    Returns:
        The object that was stored in the pickle file.

    Note:
        ``pickle.load`` can execute arbitrary code while loading, so this
        should only be pointed at files from a trusted source.
    """
    pickle_path = join(relative_data_directory_path, file_name)
    with open(pickle_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [5]:
"""
process_data.py
"""

def return_mel_spec_single_channel(sampled_audio, sample_rate, n_fft, hop_length, win_length, n_mels, window='hann', log_scale=True):
    """Compute a mel spectrogram for one audio clip with librosa.

    Args:
        sampled_audio: Audio samples, forwarded to librosa as ``y``.
        sample_rate: Sampling rate, forwarded to librosa as ``sr``.
        n_fft: FFT size forwarded to ``librosa.feature.melspectrogram``.
        hop_length: Hop length forwarded to ``librosa.feature.melspectrogram``.
        win_length: Window length forwarded to ``librosa.feature.melspectrogram``.
        n_mels: Number of mel bands.
        window: Window specification forwarded to librosa (default ``'hann'``).
        log_scale: When truthy, the result is passed through
            ``librosa.power_to_db`` before being returned.

    Returns:
        The array returned by librosa (dB-scaled when ``log_scale`` is set).
    """
    stft_options = dict(n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window)
    mel_power = librosa.feature.melspectrogram(y=sampled_audio, sr=sample_rate, n_mels=n_mels, **stft_options)
    return librosa.power_to_db(mel_power) if log_scale else mel_power

def return_mel_spec_three_channel(sampled_audio, sample_rate, n_fft, hop_length, win_length, n_mels, window='hann', log_scale=True):
    """Compute a mel spectrogram plus its first and second deltas.

    Args:
        sampled_audio: Audio samples, forwarded to librosa as ``y``.
        sample_rate: Sampling rate, forwarded to librosa as ``sr``.
        n_fft: FFT size forwarded to ``librosa.feature.melspectrogram``.
        hop_length: Hop length forwarded to ``librosa.feature.melspectrogram``.
        win_length: Window length forwarded to ``librosa.feature.melspectrogram``.
        n_mels: Number of mel bands.
        window: Window specification forwarded to librosa (default ``'hann'``).
        log_scale: When truthy, the spectrogram is passed through
            ``librosa.power_to_db`` before the deltas are taken.

    Returns:
        An array with one extra trailing axis of size 3 holding, in order,
        the spectrogram, its delta, and the delta of that delta.
    """
    mel_base = librosa.feature.melspectrogram(
        y=sampled_audio,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        n_mels=n_mels,
    )
    if log_scale:
        mel_base = librosa.power_to_db(mel_base)

    # Each delta is taken along the last axis of the previous channel.
    delta_options = dict(width=5, axis=-1, mode='interp')
    channels = [mel_base]
    for _ in range(2):
        channels.append(librosa.feature.delta(channels[-1], **delta_options))

    return np.stack(channels, axis=-1)

In [6]:
"""
preprocess4HGN.py
"""


def preprocess_speech2pickup(relative_data_directory_path, relative_save_data_directory_path, mel_feature_type=None, seed=None):
    """Convert the pickled Speech2Pickup batches into one ``.npz`` of mel features.

    Every regular file in ``relative_data_directory_path`` is loaded with
    ``load_single_data``. Each audio clip in a batch is turned into a mel
    feature, and the accumulated arrays are written to
    ``<relative_save_data_directory_path>/preprocessed4HGN_speech2pickup.npz``
    under the keys ``img_idx``, ``seq_len``, ``inputs``, ``outputs`` and
    ``sentence``.

    Args:
        relative_data_directory_path: Directory containing the pickled batches.
        relative_save_data_directory_path: Output directory; created if missing.
        mel_feature_type: ``'single'`` or ``'three'``. When ``None`` (the
            default) it is read from the second-to-last ``'_'``-separated
            token of the save path, e.g. ``..._single_channel`` -> ``'single'``.
        seed: Optional seed for the file-order shuffle. ``None`` (the default)
            uses the global ``random`` state, as before.

    Raises:
        ValueError: If the mel feature type is neither ``'single'`` nor
            ``'three'``. This is checked before any directory is created or
            any file is loaded.
    """
    # Positions of the fields inside each unpickled batch.
    DATA_INDEX_IMG_IDXS = 0
    DATA_INDEX_POSE_OUTPUTS = 1
    DATA_INDEX_SENTENCE_LENS = 2
    DATA_INDEX_SAMPLED_AUDIOS = 3
    DATA_INDEX_SAMPLED_RATES = 4
    DATA_INDEX_TEXT_COMMANDS = 5

    # Mel spectrogram configuration
    n_fft = 2048
    hop_length = n_fft // 8
    win_length = n_fft // 2
    n_mels = 40

    if mel_feature_type is None:
        mel_feature_type = relative_save_data_directory_path.split('_')[-2]
    # Fail fast: a bad type should not cost a directory creation and a pickle load.
    if mel_feature_type not in ('single', 'three'):
        raise ValueError('Unsupported mel feature type')
    if mel_feature_type == 'single':
        compute_mel = return_mel_spec_single_channel
    else:
        compute_mel = return_mel_spec_three_channel

    makedirs(relative_save_data_directory_path, exist_ok=True)

    # listdir order is arbitrary, so sort first to make a seeded shuffle repeatable.
    data_files = sorted(
        f for f in listdir(relative_data_directory_path)
        if isfile(join(relative_data_directory_path, f))
    )
    if seed is None:
        random.shuffle(data_files)
    else:
        random.Random(seed).shuffle(data_files)

    img_idx = []
    seq_len = []
    inputs = []
    outputs = []
    sentence = []

    for data_file in data_files:
        print('Processing {} ..'.format(data_file))
        data = load_single_data(relative_data_directory_path, data_file)

        img_idx.extend(data[DATA_INDEX_IMG_IDXS])
        seq_len.extend(data[DATA_INDEX_SENTENCE_LENS])
        outputs.extend(data[DATA_INDEX_POSE_OUTPUTS])
        sentence.extend(data[DATA_INDEX_TEXT_COMMANDS])

        sampled_audios = data[DATA_INDEX_SAMPLED_AUDIOS]
        sampled_rates = data[DATA_INDEX_SAMPLED_RATES]
        for i in range(len(sampled_rates)):
            inputs.append(compute_mel(
                sampled_audio=sampled_audios[i],
                sample_rate=sampled_rates[i],
                n_fft=n_fft,
                hop_length=hop_length,
                win_length=win_length,
                n_mels=n_mels,
            ))

    # Disabled heatmap generation, kept for reference (uncomment to re-generate heatmaps).
    # It relies on `mulnorm` (scipy.stats.multivariate_normal) and writes to ./data/train_heatmap/.
    #
    # prev_output = [0, 0]
    # for i in range(len(outputs)):
    #     if prev_output[0] != outputs[i][0] or prev_output[1] != outputs[i][1]:
    #         tmp_heatmap = np.zeros((250, 250))
    #         tmp_output = outputs[i].astype(int)
    #         print('generate heatmap at ({}, {})'.format(tmp_output[0], tmp_output[1]))
    #
    #         tmp_cov = [[500, 0], [0, 500]]
    #         mvn = mulnorm([tmp_output[1], tmp_output[0]], tmp_cov)
    #
    #         for k in range(tmp_heatmap.shape[0]):
    #             for kk in range(tmp_heatmap.shape[1]):
    #                 tmp_heatmap[k, kk] = mvn.pdf([k, kk])
    #
    #         tmp_heatmap = tmp_heatmap / np.max(tmp_heatmap)
    #         np.savez('./data/train_heatmap/{}_{}_{}.npz'.format(img_idx[i], tmp_output[0], tmp_output[1]), tmp_heatmap)
    #
    #     prev_output = outputs[i]

    # Add a trailing axis so the per-sample scalars become column vectors.
    img_idx = np.asarray(img_idx)[:, np.newaxis]
    seq_len = np.asarray(seq_len)[:, np.newaxis]
    # NOTE(review): this assumes every mel feature has the same shape. Clips of
    # different lengths would make a ragged array, which recent NumPy rejects.
    # Confirm the audio is fixed-length upstream.
    inputs = np.asarray(inputs)
    outputs = np.asarray(outputs)
    print('img_idx shape: {}'.format(np.shape(img_idx)))
    print('seq_len shape: {}'.format(np.shape(seq_len)))
    print('inputs shape: {}'.format(np.shape(inputs)))
    print('outputs shape: {}'.format(np.shape(outputs)))
    print('sentence shape: {}'.format(len(sentence)))

    # Save preprocessed data
    file_name = join(relative_save_data_directory_path, 'preprocessed4HGN_speech2pickup.npz')
    np.savez(file_name, img_idx=img_idx, seq_len=seq_len, inputs=inputs, outputs=outputs, sentence=sentence)

In [7]:
# Save data_v1.2

# Read the pickled batches from data_v1.1 and write the preprocessed archive
# into data_v1.2_single_channel. The save directory name matters here:
# preprocess_speech2pickup takes the second-to-last '_'-separated token of
# this path ('single') as the mel feature type.
relative_data_directory_path = '/content/drive/MyDrive/Speech2Pickup/data_v1.1'
relative_save_data_directory_path = '/content/drive/MyDrive/Speech2Pickup/data_v1.2_single_channel'
preprocess_speech2pickup(relative_data_directory_path=relative_data_directory_path, relative_save_data_directory_path=relative_save_data_directory_path)

Processing senEM_preprocessed_17.pkl ..
Processing senEM_preprocessed_12.pkl ..
Processing senEM_preprocessed_25.pkl ..
Processing senEM_preprocessed_21.pkl ..
Processing senEM_preprocessed_36.pkl ..
Processing senEM_preprocessed_40.pkl ..
Processing senEM_preprocessed_29.pkl ..
Processing senEM_preprocessed_37.pkl ..
Processing senEM_preprocessed_6.pkl ..
Processing senEM_preprocessed_8.pkl ..
Processing senEM_preprocessed_5.pkl ..
Processing senEM_preprocessed_14.pkl ..
Processing senEM_preprocessed_28.pkl ..
Processing senEM_preprocessed_24.pkl ..
Processing senEM_preprocessed_10.pkl ..
Processing senEM_preprocessed_33.pkl ..
Processing senEM_preprocessed_7.pkl ..
Processing senEM_preprocessed_27.pkl ..
Processing senEM_preprocessed_4.pkl ..
Processing senEM_preprocessed_2.pkl ..
Processing senEM_preprocessed_22.pkl ..
Processing senEM_preprocessed_32.pkl ..
Processing senEM_preprocessed_20.pkl ..
Processing senEM_preprocessed_18.pkl ..
Processing senEM_preprocessed_34.pkl ..
Proces

In [12]:
# Sanity check: reopen the saved archive and print the largest image index
# plus the first entry of every other stored array.
# NOTE(review): np.load on an .npz returns a lazily-read archive that is left
# open here. Close it (or use a `with` block) once `data` is no longer needed.
# NOTE(review): printing data['inputs'][0] dumps a whole feature array;
# printing its shape may be enough. Confirm what this check is meant to show.
data = np.load('/content/drive/MyDrive/Speech2Pickup/data_v1.2_single_channel/preprocessed4HGN_speech2pickup.npz')
print(np.max(data['img_idx']))
print(data['seq_len'][0])
print(data['inputs'][0])
print(data['outputs'][0])
print(data['sentence'][0])

477
