In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/d/bryanmariscal/activates/activate3.wav
/kaggle/input/d/bryanmariscal/activates/activate5.wav
/kaggle/input/d/bryanmariscal/activates/activate2.wav
/kaggle/input/d/bryanmariscal/activates/activate1.wav
/kaggle/input/d/bryanmariscal/activates/activate4.wav
/kaggle/input/d/bryanmariscal/activates/activate0.wav
/kaggle/input/background-noise/traffic.wav
/kaggle/input/background-noise/silence.wav
/kaggle/input/background-noise/rain.wav
/kaggle/input/background-noise/cafe.wav
/kaggle/input/background-noise/airplane_overhead.wav
/kaggle/input/negatives/negative3.wav
/kaggle/input/negatives/negative5.wav
/kaggle/input/negatives/negative1.wav
/kaggle/input/negatives/negative2.wav
/kaggle/input/negatives/negative4.wav
/kaggle/input/negatives/negative0.wav


In [2]:
import numpy as np
from pydub import AudioSegment
import random
import sys
import io
import os
import glob
import IPython
import scipy.signal as sig
from scipy.io.wavfile import read
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib.ticker import AutoMinorLocator,LogLocator
get_ipython().magic('matplotlib inline')

In [12]:
def create_timeSeries(file,decimate=True):
    samplerate, data = read(file)
    if decimate:
        data = sig.decimate(data,q=4,ftype='iir',zero_phase=False) #downsample by factor of 4. 
        samplerate /= 4 #samplerate becomes 11025 Hz (from 44100 Hz)
    t = np.linspace(0,len(data)/samplerate,len(data),endpoint=False)
    data = np.column_stack((t,data))

    return data

In [5]:
def create_spectrogram(data,nps=256,noverlap=128,plot=True):
    fsample = 1/data[1,0]
    n = int(np.ceil(np.log2(nps)/np.log2(2))) #zero pad to closest power of 2
    zpd = 2**n    
    f,t,Sxx = sig.spectrogram(data[:,1],fsample,window='hann',nperseg=nps,noverlap=noverlap,nfft=zpd)
    if plot:
        tmax = np.ceil(np.log10(np.max(Sxx)))
        tmin = np.log10(10**-3)
        lvls = np.logspace(tmin,tmax,100)
        fig, ax = plt.subplots()
        cont = ax.contourf(t,f,Sxx,norm=LogNorm(),levels=lvls)
        ax.set_ylim(0,5000)
        ax.set_xlabel('Time [seconds]')
        ax.set_ylabel('Frequency [Hz]')
        axins = inset_axes(ax,
                        width="2.5%", # width = 10% of parent_bbox width
                        height="100%", # height : 50%
                        loc=6,
                        bbox_to_anchor=(1.01, 0., 1, 1),
                        bbox_transform=ax.transAxes,
                        borderpad=0,
                    )
        cbar = plt.colorbar(cont,cax=axins,ticks=LogLocator(subs=range(10)))
    return f,t,Sxx


In [6]:
def create_10sec_background_samples(background_files):
    for file in background_files:
        sound = AudioSegment.from_file(base_directory + file)
        if file == 'silence.wav':
            sound.export(file[0:-4] + '.wav', format="wav")
        else:
            sound_length = len(sound)
            segment_start = np.random.randint(low=0,high=sound_length - 10000)
            segment_end = segment_start + 10000
            sound_processed = sound[segment_start:segment_end]
            sound_processed.export(file[0:-4] + '.wav', format="wav")
    return
    

In [8]:
# base_directory = '/kaggle/input/background-noise/'
# background_files = [f for f in os.listdir(base_directory) if not f.startswith('.')]
# create_10sec_background_samples(background_files)


In [7]:
def load_audio():
    background_files = [f for f in os.listdir('/kaggle/input/background-noise/') if not f.startswith('_')]
    background = []
    for file in background_files:
        print(file)
        background.append(AudioSegment.from_file('/kaggle/input/background-noise/' + file))
    
    positive_files = os.listdir('/kaggle/input/d/bryanmariscal/activates/')  
    positives = []
    for file in positive_files:
        print(file)
        positives.append(AudioSegment.from_file('/kaggle/input/d/bryanmariscal/activates/' + file))
        
    negative_files = os.listdir('/kaggle/input/negatives/')  
    negatives = []
    for file in negative_files:
        print(file)
        negatives.append(AudioSegment.from_file('/kaggle/input/negatives/' + file))
    return background,positives,negatives

In [8]:
backgrounds,activates,negatives = load_audio()

traffic.wav
silence.wav
rain.wav
cafe.wav
airplane_overhead.wav
activate3.wav
activate5.wav
activate2.wav
activate1.wav
activate4.wav
activate0.wav
negative3.wav
negative5.wav
negative1.wav
negative2.wav
negative4.wav
negative0.wav


In [9]:
def get_random_time_segment(segment_ms):
    """
    Gets a random time segment of duration segment_ms in a 10,000 ms audio clip.
    
    Arguments:
    segment_ms -- the duration of the audio clip in ms ("ms" stands for "milliseconds")
    
    Returns:
    segment_time -- a tuple of (segment_start, segment_end) in ms
    """
    
    segment_start = np.random.randint(low=0, high=10000-segment_ms)   # Make sure segment doesn't run past the 10sec background 
    segment_end = segment_start + segment_ms - 1
    
    return (segment_start, segment_end)

def is_overlapping(segment_time, previous_segments):
    import timeit
    """
    Checks if the time of a segment overlaps with the times of existing segments.
    
    Arguments:
    segment_time -- a tuple of (segment_start, segment_end) for the new segment
    previous_segments -- a list of tuples of (segment_start, segment_end) for the existing segments
    
    Returns:
    True if the time segment overlaps with any of the existing segments, False otherwise
    """
    
    segment_start, segment_end = segment_time
    
    ### START CODE HERE ### (≈ 4 lines)
    # Step 1: Initialize overlap as a "False" flag. (≈ 1 line)
    overlap = False
    
    # Step 2: loop over the previous_segments start and end times.
    # Compare start/end times and set the flag to True if there is an overlap (≈ 3 lines)
    for previous_start, previous_end in previous_segments:
        if segment_start <= previous_end and segment_end >= previous_start:
            overlap = True
    ### END CODE HERE ###

    return overlap

def insert_audio_clip(background, audio_clip, previous_segments):
    """
    Insert a new audio segment over the background noise at a random time step, ensuring that the 
    audio segment does not overlap with existing segments.
    
    Arguments:
    background -- a 10 second background audio recording.  
    audio_clip -- the audio clip to be inserted/overlaid. 
    previous_segments -- times where audio segments have already been placed
    
    Returns:
    new_background -- the updated background audio
    """
    
    # Get the duration of the audio clip in ms
    segment_ms = len(audio_clip)
    
    ### START CODE HERE ### 
    # Step 1: Use one of the helper functions to pick a random time segment onto which to insert 
    # the new audio clip. (≈ 1 line)
    segment_time = get_random_time_segment(segment_ms)
    
    # Step 2: Check if the new segment_time overlaps with one of the previous_segments. If so, keep 
    # picking new segment_time at random until it doesn't overlap. (≈ 2 lines)
    while is_overlapping(segment_time, previous_segments):
        segment_time = get_random_time_segment(segment_ms)

    # Step 3: Append the new segment_time to the list of previous_segments (≈ 1 line)
    previous_segments.append(segment_time)
    ### END CODE HERE ###
    
    # Step 4: Superpose audio segment and background
    new_background = background.overlay(audio_clip, position = segment_time[0])
    
    return new_background, segment_time

def insert_ones(y, segment_end_ms, Ty):
    """
    Update the label vector y. The labels of the 25 output steps strictly after the end of the segment 
    should be set to 1. By strictly we mean that the label of segment_end_y should be 0 while, the
    50 following labels should be ones.
    
    
    Arguments:
    y -- numpy array of shape (1, Ty), the labels of the training example
    segment_end_ms -- the end time of the segment in ms
    
    Returns:
    y -- updated labels
    """
    
    # duration of the background (in terms of spectrogram time-steps)
    segment_end_y = int(segment_end_ms * Ty / 10000.0)
    
    # Add 1 to the correct index in the background label (y)
    ### START CODE HERE ### (≈ 3 lines)
    for i in range(segment_end_y+1, segment_end_y+25+1):
        if i < Ty:
            y[0, i] = 1
    ### END CODE HERE ###
    
    return y

def match_target_amplitude(sound, target_dBFS):
    change_in_dBFS = target_dBFS - sound.dBFS
    return sound.apply_gain(change_in_dBFS)

def create_training_example(background, activates, negatives, Ty=212, file_name=''):
    """
    Creates a training example with a given background, activates, and negatives.
    
    Arguments:
    background -- a 10 second background audio recording
    activates -- a list of audio segments of the word "activate"
    negatives -- a list of audio segments of random words that are not "activate"
    
    Returns:
    x -- the spectrogram of the training example
    y -- the label at each time step of the spectrogram
    """
    
    # Set the random seed
    #np.random.seed(18)
    
    # Make background quieter
    background = background - 20

    ### START CODE HERE ###
    # Step 1: Initialize y (label vector) of zeros (≈ 1 line)
    y = np.zeros((1,Ty))

    # Step 2: Initialize segment times as an empty list (≈ 1 line)
    previous_segments = []
    ### END CODE HERE ###
    
    # Select 0-1 random "activate" audio clips from the entire list of "activates" recordings
    number_of_activates = np.random.randint(0, 4)
    random_indices = np.random.randint(len(activates), size=number_of_activates)
    random_activates = [activates[i] for i in random_indices]
    
    ### START CODE HERE ### (≈ 3 lines)
    # Step 3: Loop over randomly selected "activate" clips and insert in background
    for random_activate in random_activates:
        # Insert the audio clip on the background
        background, segment_time = insert_audio_clip(background, random_activate, previous_segments)
        # Retrieve segment_start and segment_end from segment_time
        segment_start, segment_end = segment_time[0],segment_time[1]
        # Insert labels in "y"
        y = insert_ones(y, segment_end, Ty)
    ### END CODE HERE ###

    # Select 0-2 random negatives audio recordings from the entire list of "negatives" recordings
    number_of_negatives = np.random.randint(0, 4)
    #number_of_negatives = 0
    random_indices = np.random.randint(len(negatives), size=number_of_negatives)
    random_negatives = [negatives[i] for i in random_indices]

    ### START CODE HERE ### (≈ 2 lines)
    # Step 4: Loop over randomly selected negative clips and insert in background
    for random_negative in random_negatives:
        # Insert the audio clip on the background 
        background, _ = insert_audio_clip(background, random_negative, previous_segments)
    ### END CODE HERE ###
    
    # Standardize the volume of the audio clip 
    background = match_target_amplitude(background, -20.0)

    # Export new training example 
    file_handle = background.export("/kaggle/working/synthesized/train" + file_name + ".wav", format="wav")
    #print("File was saved in your directory.")
    
    # Get and plot spectrogram of the new recording (background with superposition of positive and negatives)
    #X is size (# times, # freq)
    data = create_timeSeries('/kaggle/working/synthesized/train'+ file_name +'.wav')
    f,t,x = create_spectrogram(data,nps=256,noverlap=128,plot=False)    
    #x = graph_spectrogram(wav_file='/kaggle/working/synthesized/train'+ file_name +'.wav')
    x = x.T
    
    return x, y




In [10]:
def create_multiple_training_examples(backgrounds, activates, negatives, n, Ty=212,nTime=860,nFreq=129):
    X = np.zeros((n,nTime,nFreq)) #Function of spectrogram input parameters. Can tweak these and assess performance effects.
    Y = np.zeros((n,1,Ty))
    for i in range(n):
        if i%100 == 0:
            print('Iteration '+str(i))
        random_index = np.random.randint(0, len(backgrounds))
        background = backgrounds[random_index]
        x, y = create_training_example(background, activates, negatives, Ty=212, file_name=str(i))        
        X[i,:,:] = x
        Y[i,:,:] = y
    return X,Y

    

In [13]:
!mkdir /kaggle/working/synthesized/
X,Y = create_multiple_training_examples(backgrounds, activates, negatives, n=1000)


mkdir: cannot create directory ‘/kaggle/working/synthesized/’: File exists
Iteration 0
44100
11025.0
(129, 860)
44100
11025.0
(129, 860)
44100
11025.0
(129, 860)
44100
11025.0
(129, 860)
44100
11025.0
(129, 860)
44100
11025.0
(129, 860)
44100
11025.0
(129, 860)
44100
11025.0
(129, 860)
44100
11025.0
(129, 860)
44100
11025.0
(129, 860)


In [None]:
X.shape

In [None]:
Y.shape

In [None]:

np.save('X.npy',X)

In [None]:
np.save('Y.npy',Y)