In [None]:
import os
import glob
import librosa
import random
import numpy as np
import librosa.display
import matplotlib.pyplot as plt
import time
from pydub import AudioSegment
from scipy.special import comb
%matplotlib inline


# Directory layout: every path hangs off the notebook's current working directory.
work_dir = os.getcwd()
source_dir = os.path.join(work_dir, "audio_data")  # searched for input *.wav files
sample_dir = os.path.join(work_dir, "samples")     # generated *.wav clips are exported here
fdomain_dir = os.path.join(work_dir, "fdomain")    # STFT arrays are saved here as *.npy

# Only the two output directories are created on demand; source_dir is not created here.
for _out_dir in (sample_dir, fdomain_dir):
    if not os.path.exists(_out_dir):
        os.mkdir(_out_dir)


In [None]:
#
# Utilities to organize and generate samples
#

def OneHotEncoding(dim, val):
    """Turn a 1-D array of integer class indices into one-hot rows.

    Args:
        dim: number of columns in the result (one per class).
        val: integer numpy array; ``val[r]`` is the column set to 1 in row ``r``.

    Returns:
        A float array of shape ``(val.shape[0], dim)`` that is zero everywhere
        except for a single 1 per row.
    """
    n_rows = val.shape[0]
    encoded = np.zeros((n_rows, dim))
    row_idx = np.arange(n_rows)
    # Fancy indexing pairs row r with column val[r].
    encoded[row_idx, val] = 1
    return encoded

def LoadAudioFile(wave_names, sounds):
    """Load every ``*.wav`` file found in ``source_dir`` with pydub.

    Files are visited in sorted path order. For each file the decoded
    ``AudioSegment`` is appended to ``sounds`` and the base file name up to its
    first ``.`` is appended to ``wave_names``, so the two lists stay
    index-aligned. Both lists are modified in place.

    Args:
        wave_names: list that receives one name per loaded file.
        sounds: list that receives one ``AudioSegment`` per loaded file.

    Returns:
        The longest clip duration in milliseconds, or 0 when no file matched.
    """
    max_length = 0
    for f in sorted(glob.glob(os.path.join(source_dir, "*.wav"))):
        sound = AudioSegment.from_file(f, format="wav")
        sounds.append(sound)
        name = os.path.basename(f)
        wave_names.append(name.split('.', 1)[0])
        # len() of a pydub AudioSegment is its duration in milliseconds, not a
        # sample count, so the log line reports the unit accordingly.
        print("wave: %s loaded %d ms" % (name, len(sound)))
        max_length = max(max_length, len(sound))
    return max_length

# generate wave sample
def GenerateRandomMixedSamples(wave_names_in, sounds_in, num, length = 6000):
    """Export random mixes for every non-empty combination of source clips.

    All existing ``*.wav`` files in ``sample_dir`` are deleted first. For each
    of the ``2**n - 1`` non-empty subsets of the ``n`` sources, ``num`` mixes
    are written to ``sample_dir`` as ``<name1>-<name2>-..._<j>.wav``.

    Each mix overlays the selected clips (looped) onto ``2 * length`` ms of
    silence at a random offset, keeps the last ``length`` ms, and applies a
    random integer gain in ``[-10, 10)``.

    Args:
        wave_names_in: names of the source clips (used to build file names).
        sounds_in: pydub ``AudioSegment`` objects, index-aligned with the names.
        num: number of mixes to export per combination.
        length: duration of each exported mix in milliseconds.

    Raises:
        ValueError: from ``random.randrange`` if a selected clip has length 0.
    """
    for stale in glob.glob(os.path.join(sample_dir, "*.wav")):
        os.remove(stale)

    assert len(wave_names_in) == len(sounds_in)
    src_wav_cnt = len(wave_names_in)

    # Explode the mixing combinations: bit k of i says whether source k is in the mix.
    total_combi = 2 ** src_wav_cnt
    for i in range(1, total_combi):
        chosen = [k for k in range(src_wav_cnt) if i & (1 << k)]
        # Name the mix from the function's own arguments, not the notebook globals.
        name = "-".join(wave_names_in[k] for k in chosen)
        for j in range(num):
            mix = AudioSegment.silent(duration=length * 2)
            for k in chosen:
                start = random.randrange(0, len(sounds_in[k]))
                mix = mix.overlay(sounds_in[k], position=start, loop=True)
            # Keep only the tail of the mix.
            mix = mix[-length:]
            # randrange needs integer bounds; float bounds are rejected by newer Pythons.
            gain = random.randrange(-10, 10)
            mix = mix.apply_gain(gain)
            mix.export(os.path.join(sample_dir, name) + "_" + str(j) + ".wav", format="wav")
    return

# from time domain samples to generate frequency domain samples
def GenerateFrequencyDomainSample(source_dir, dest_dir):
    """Convert every ``*.wav`` in ``source_dir`` into a saved STFT array.

    Existing ``*.npy`` files in ``dest_dir`` are deleted first. Each wav file
    is loaded with ``librosa.load``, transformed with ``librosa.stft`` and
    stored via ``np.save`` under ``dest_dir`` using the wav file's base name
    (``np.save`` appends the ``.npy`` extension). The output path and array
    shape are printed per file, followed by the total elapsed time.

    Args:
        source_dir: directory searched for ``*.wav`` input files.
        dest_dir: directory that receives the ``*.npy`` output files.
    """
    # Clear results of any previous run.
    for stale in glob.glob(os.path.join(dest_dir, "*.npy")):
        os.remove(stale)

    # time.clock() no longer exists in current Python; perf_counter() is the
    # supported high-resolution timer.
    start = time.perf_counter()
    for file_name in sorted(glob.glob(os.path.join(source_dir, "*.wav"))):
        y, _ = librosa.load(file_name)
        D = librosa.stft(y)
        fdomain_file_name = os.path.join(dest_dir, os.path.splitext(os.path.basename(file_name))[0])
        np.save(fdomain_file_name, D)
        print(fdomain_file_name, D.shape)
    # Apply the format with % so the elapsed time is actually rendered.
    print("time elapse: %.2f s" % (time.perf_counter() - start))
    
def LoadFrequencyDomainSamples(directory):
    """Read every ``*.npy`` file in ``directory`` as a flattened sample.

    Files are processed in sorted path order and the shape of each loaded
    array is printed. The label of a file is derived from the first character
    of its base name: its code point minus that of ``'0'``, minus one (so a
    name starting with ``'1'`` gets label 0).

    Args:
        directory: directory searched for ``*.npy`` files.

    Returns:
        ``(X, y)`` where ``X`` is a list of 1-D arrays and ``y`` the list of
        matching integer labels.
    """
    features, labels = [], []
    for path in sorted(glob.glob(os.path.join(directory, "*.npy"))):
        spectrum = np.load(path)
        print(spectrum.shape)
        leading_char = os.path.basename(path)[0]
        labels.append(ord(leading_char) - ord('0') - 1)
        features.append(spectrum.flatten())
    return features, labels
    
# plot frequency domain samples
def PlotFrequencyDomainSamples(directory, number_to_plot):
    """Plot log-power spectrograms of the first saved STFT arrays.

    The ``*.npy`` files in ``directory`` are taken in sorted path order and at
    most ``number_to_plot`` of them are drawn, one subplot per file, stacked
    vertically in a single figure.

    Args:
        directory: directory searched for ``*.npy`` files.
        number_to_plot: upper bound on the number of files to draw.
    """
    files = sorted(glob.glob(os.path.join(directory, "*.npy")))
    file_cnt = min(len(files), number_to_plot)
    plt.figure(figsize=(40,60))
    for i in range(file_cnt):
        file_name = files[i]
        D_loaded = np.load(file_name)
        # Square the magnitude, not the complex value, to get a real power spectrogram.
        power = np.abs(D_loaded) ** 2
        # librosa.logamplitude was removed from newer librosa releases in favour
        # of power_to_db; fall back to the old name only when the new one is absent.
        if hasattr(librosa, "power_to_db"):
            log_power_loaded = librosa.power_to_db(power, ref=np.max)
        else:
            log_power_loaded = librosa.logamplitude(power, ref_power=np.max)
        plt.subplot(file_cnt, 1, i + 1)
        librosa.display.specshow(log_power_loaded, x_axis="time", y_axis='log')
        # Label each panel so the figure can be read on its own.
        plt.title(os.path.basename(file_name))
    plt.show()
    
# generate num_samples with random start time and amplitude
def GenerateSample(wave_names_in, sounds_in, num_samples, length):
    """Export random fixed-length excerpts of each source clip.

    All existing ``*.wav`` files in ``sample_dir`` are deleted first. For every
    source clip, ``num_samples`` excerpts of ``length`` ms are cut from a
    looped copy of the clip at a random start offset, given a random integer
    gain in ``[-10, 10)`` and exported to ``sample_dir`` as ``<name>-<j>.wav``.

    Args:
        wave_names_in: names of the source clips (used to build file names).
        sounds_in: pydub ``AudioSegment`` objects, index-aligned with the names.
        num_samples: number of excerpts to export per source clip.
        length: duration of each excerpt in milliseconds.

    Raises:
        ValueError: if a source clip is empty (it cannot be looped to length).
    """
    for stale in glob.glob(os.path.join(sample_dir, "*.wav")):
        os.remove(stale)

    assert len(wave_names_in) == len(sounds_in)
    for wave_name, s in zip(wave_names_in, sounds_in):
        audio_len = len(s)
        if audio_len == 0:
            # Appending an empty clip never grows the loop below.
            raise ValueError("cannot sample from empty audio clip: %s" % wave_name)

        # The start offset can be as large as audio_len - 1, so the looped copy
        # must span audio_len + length for every excerpt to be full-length.
        repeat_audio = s
        while len(repeat_audio) < audio_len + length:
            repeat_audio = repeat_audio + s

        for j in range(num_samples):
            start = random.randrange(0, audio_len)
            end = start + length
            new_name = os.path.join(sample_dir, wave_name + "-" + str(j) + ".wav")
            sj = repeat_audio[start:end]
            # randrange needs integer bounds; float bounds are rejected by newer Pythons.
            gain = random.randrange(-10, 10)
            sj = sj.apply_gain(gain)
            sj.export(new_name, format="wav")
    return

def PrepareData(directory, num_classes=7):
    """Load the saved samples of ``directory`` as a feature matrix and one-hot labels.

    Args:
        directory: directory of ``*.npy`` files, read by
            ``LoadFrequencyDomainSamples``.
        num_classes: width of the one-hot encoding; defaults to 7, the value
            previously hard-coded here.

    Returns:
        ``(x, y)``: ``x`` is ``np.array`` of the flattened samples and ``y`` the
        one-hot label matrix with ``num_classes`` columns.
    """
    sample, label = LoadFrequencyDomainSamples(directory)
    assert len(sample) == len(label)
    x = np.array(sample)
    # An explicit integer dtype keeps an empty label list usable as an index array.
    y = OneHotEncoding(num_classes, np.array(label, dtype=int))
    print(x.shape, y.shape, y)
    return x, y
  

#not used
def ExplodeCombination(wave_names_in, sounds_in):
    """Overlay every non-empty combination of the given clips.

    For each of the ``2**n - 1`` non-empty subsets of the ``n`` inputs, the
    selected clips are overlaid onto silence as long as the longest input clip.
    Only the arguments are used; no notebook-level variables are read.

    Args:
        wave_names_in: names of the source clips.
        sounds_in: pydub ``AudioSegment`` objects, index-aligned with the names.

    Returns:
        ``(wave_names_out, samples_out)``: the ``-``-joined names of each
        combination and the matching mixed ``AudioSegment`` objects.
    """
    assert len(wave_names_in) == len(sounds_in)
    src_cnt = len(wave_names_in)
    # Duration of the silent base track; 0 when there are no inputs.
    longest = max((len(s) for s in sounds_in), default=0)

    wave_names_out = []
    samples_out = []
    # Bit j of i says whether source j takes part in combination i.
    for i in range(1, 2 ** src_cnt):
        chosen = [j for j in range(src_cnt) if i & (1 << j)]
        mix = AudioSegment.silent(duration=longest)
        for j in chosen:
            mix = mix.overlay(sounds_in[j])
        wave_names_out.append("-".join(wave_names_in[j] for j in chosen))
        samples_out.append(mix)
    return wave_names_out, samples_out


In [None]:
# Build the single-source ("7 class") sample set.
random.seed()  # no argument: seeded from OS randomness or the current time

# LoadAudioFile fills both lists in place; entries share the same index.
wave_names, sounds = [], []
max_length = 0
max_length = LoadAudioFile(wave_names, sounds)
print("max_length=%d" % max_length)

# Two random 3000 ms excerpts per source clip, exported to sample_dir.
GenerateSample(wave_names, sounds, 2, 3000)

# Turn every exported clip into a saved STFT array under fdomain_dir.
GenerateFrequencyDomainSample(sample_dir, fdomain_dir)

In [None]:
# load 7 class samples, convert to one-hot encoding
full_data, full_label = PrepareData(fdomain_dir)
print("full_data", full_data.shape, "full_label", full_label.shape)

num_total = full_data.shape[0]
num_dim = full_data.shape[1]
assert num_total == full_label.shape[0]

# separate 65% training set, 15% validation set, 20% testing set
num_train = int(num_total * .65)
num_val = int(num_total * .15)
num_test = num_total - num_train - num_val

# Samples are loaded in sorted file-name order, and the label comes from the
# first character of the file name, so they arrive grouped by class. Slicing
# them directly would give each subset different classes; split a shuffled
# index instead. The fixed seed keeps the split identical between runs.
shuffled_idx = np.random.RandomState(0).permutation(num_total)
train_idx = shuffled_idx[:num_train]
val_idx = shuffled_idx[num_train:num_train + num_val]
test_idx = shuffled_idx[num_train + num_val:]
assert len(train_idx) + len(val_idx) + len(test_idx) == num_total

training_set = full_data[train_idx]
validation_set = full_data[val_idx]
test_set = full_data[test_idx]

print("training_set", training_set.shape, "validation_set", validation_set.shape, "test_set", test_set.shape)

# Labels are indexed with the same permutation so they stay aligned with the data.
training_label = full_label[train_idx]
validation_label = full_label[val_idx]
test_label = full_label[test_idx]

print("training_label", training_label.shape, "validation_label", validation_label.shape, "testing_label", test_label.shape)



In [None]:
# Generate the "real" sample set: random mixes of every non-empty source combination.
random.seed()  # no argument: seeded from OS randomness or the current time

# LoadAudioFile fills both lists in place; entries share the same index.
wave_names, sounds = [], []
max_length = 0
max_length = LoadAudioFile(wave_names, sounds)
print("max_length=%d" % max_length)

# Two mixes per combination; this first deletes the *.wav files already in sample_dir.
GenerateRandomMixedSamples(wave_names, sounds, 2)

# Regenerate the STFT arrays from the new clips (existing *.npy files are deleted first).
GenerateFrequencyDomainSample(sample_dir, fdomain_dir)

# Visual check of the first ten spectrograms.
PlotFrequencyDomainSamples(fdomain_dir, 10)