In [1]:
from load_music import get_song_names, load_song
from segment import segment_onset
# import extract_features
import librosa
import numpy as np
import IPython.display as ipd

# Give a directory, get all the songs that are valid in there

In [2]:
songs = get_song_names(directory="/Users/benjamindykstra/Music/iTunes/Against Me!/")

In [3]:
songs

['/Users/benjamindykstra/Music/iTunes/Against Me!/As The Eternal Cowboy/01 T.S.R. (This Shit Rules).m4a',
 '/Users/benjamindykstra/Music/iTunes/Against Me!/As The Eternal Cowboy/02 Cliche Guevara.m4a',
 '/Users/benjamindykstra/Music/iTunes/Against Me!/As The Eternal Cowboy/03 Mutiny On The Electronic Bay.m4a',
 '/Users/benjamindykstra/Music/iTunes/Against Me!/As The Eternal Cowboy/04 Sink, Florida, Sink.m4a',
 '/Users/benjamindykstra/Music/iTunes/Against Me!/As The Eternal Cowboy/05 Slurring The Rhythms.m4a',
 '/Users/benjamindykstra/Music/iTunes/Against Me!/As The Eternal Cowboy/06 Rice And Bread.m4a',
 '/Users/benjamindykstra/Music/iTunes/Against Me!/As The Eternal Cowboy/07 A Brief Yet Triumphant Intermissi.m4a',
 '/Users/benjamindykstra/Music/iTunes/Against Me!/As The Eternal Cowboy/08 Unsubstantiated Rumors Are Good E.m4a',
 '/Users/benjamindykstra/Music/iTunes/Against Me!/As The Eternal Cowboy/09 You Look Like I Need A Drink.m4a',
 '/Users/benjamindykstra/Music/iTunes/Against Me!

# given a list of song file paths, load each song and segment it according to its onset. Shape of final list is (number of songs, ). For each song, shape is (number of segments in song, ). For each segment, shape is (number of audio samples in segment)

In [4]:
accum = []
for song in songs:
    signal, sr = load_song(song)
    onset_segmented = segment_onset(signal, sr, 512)
    print onset_segmented['shape']
    accum.append(onset_segmented['segmented'])
    
all_songs = np.array(accum)
print 'shape of all songs: {}'.format(all_songs.shape)

(161,)
(201,)
(141,)
(508,)
(453,)
(333,)
(349,)
(422,)
(269,)
(655,)
(671,)
(609,)
(820,)
(656,)
(770,)
(522,)
(407,)
(472,)
shape of all songs: (18,)


sanity check that if we concatenate all the segments in a song, it should be equal in length to the original signal

In [6]:
original_signal, sr = load_song(songs[-1])
onset_segmented = segment_onset(original_signal)
data = onset_segmented['segmented']
print data.shape, original_signal.shape

print "difference in shapes: {}".format(original_signal.shape[0] - np.concatenate(data).shape[0])

(472,) (3863748,)
difference in shapes: 0


concatenate the first 10 segments of the first song and play it

In [10]:
first_ten = np.concatenate(all_songs[0][:10])

In [11]:
ipd.Audio(first_ten, rate=sr)

In [12]:
ipd.Audio(np.concatenate(all_songs[0][:4]), rate=sr)

In [13]:
ipd.Audio(np.concatenate(all_songs[0][:5]), rate=sr)

In [14]:
ipd.Audio(np.concatenate(all_songs[0][4:6]), rate=sr)

In [86]:
def mfcc(segment, sr=22050, n_mfcc=20):
    components = librosa.feature.mfcc(y=segment,sr=sr, n_mfcc=n_mfcc ) # 
    return np.mean(components[1:], axis=1)

In [80]:
def zero_crossing_rate(segment):
    rate_vector = librosa.feature.zero_crossing_rate(segment+ 0.0001, frame_length=len(segment))[0] # returns array with shape (1,x)
    return np.mean(rate_vector)

In [120]:
def avg_spectral_centroid(segment):
    '''
    Indicate at which frequency the energy is centered on. Like a weighted mean, weighting avg frequency by the energy.
    '''
    centroid = librosa.feature.spectral_centroid(segment+0.01, sr=sr, n_fft=len(segment))[0]
    return np.mean(centroid)

In [130]:
def avg_spectral_contrast(segment, sr=22050, n_bands=6):
    '''
    considers the spectral peak, the spectral valley, and their difference in each frequency subband
    
    columns correspond to a spectral band
    
    average contrast : np.ndarray [shape=(n_bands + 1)]
    each row of spectral contrast values corresponds to a given
    octave-based frequency, take average across bands
    
    '''
    contr = librosa.feature.spectral_contrast(segment, sr=sr, n_fft=len(segment) n_bands=n_bands)
    return np.mean(contr, axis=1) # take average across bands

In [52]:
def get_features(segment):
    if len(segment) != 0:
        feature_tuple = (avg_energy(segment), avg_mfcc(segment), zero_crossing_rate(segment), avg_spectral_centroid(segment), avg_spectral_contrast(segment))
        all_features = np.concatenate([feat if type(feat) is np.ndarray else np.array([feat]) for feat in feature_tuple])
        return all_features
    return np.zeros((29,))


def avg_energy(segment):
    if len(segment) != 0:
        energy = librosa.feature.rmse(y=segment, frame_length = len(segment))[0]
        # returns (1,t) array, get first element
        return np.mean(energy)

    
def avg_mfcc(segment, sr=22050, n_mfcc=20):
    '''
    Get the average Mel-frequency cepstral coefficients for a segment
    The very first MFCC, the 0th coefficient, does not convey information relevant to the overall shape of the spectrum. 
    It only conveys a constant offset, i.e. adding a constant value to the entire spectrum. We discard it.
    BE SURE TO NORMALIZE
    
    Parameters:
        segment: numpy array, a time series of audio data
        sr: int, sampling rate, default 22050
        n_mfcc: int, the number of cepstral coefficients to return, default 20.
    Returns:
        numpy array of shape (n_mfcc - 1,)
    '''
    if (len(segment) != 0):
        components = librosa.feature.mfcc(y=segment,sr=sr, n_mfcc=n_mfcc ) # return shape (n_mfcc, # frames)

        return np.mean(components[1:], axis=1)


def zero_crossing_rate(segment):
    '''
    Get average zero crossing rate for a segment. Add a small constant to the signal to negate small amount of noise near silent
    periods.
    
    Parameters:
        segment: numpy array, a time series of audio data
    Returns:
        float, average zero crossing rate for the given segment
    '''
   

    rate_vector = librosa.feature.zero_crossing_rate(segment+ 0.0001, frame_length=len(segment))[0] # returns array with shape (1,x)
    return np.mean(rate_vector)


def avg_spectral_centroid(segment, sr=22050):
    '''
    Indicate at which frequency the energy is centered on. Like a weighted mean, weighting avg frequency by the energy.
    Add small constant to audio signal to discard noise from silence
    Parameters:
        segment: numpy array, a time series of audio data
        sr: int, sampling rate
    Returns:
        float, the average frequency which the energy is centered on.
    '''
    centroid = librosa.feature.spectral_centroid(segment+0.01, sr=sr)[0]
    return np.mean(centroid)


def avg_spectral_contrast(segment, sr=22050, n_bands=6):
    '''
    considers the spectral peak, the spectral valley, and their difference in each frequency subband
    
    columns correspond to a spectral band
    
    average contrast : np.ndarray [shape=(n_bands + 1)]
    each row of spectral contrast values corresponds to a given
    octave-based frequency, take average across bands
    
    '''
    contr = librosa.feature.spectral_contrast(segment, sr=sr, n_bands=n_bands)
    return np.mean(contr, axis=1) # take average across bands

    

In [45]:
song = all_songs[-1]

In [53]:
features = []
i = 0
for segment in song:
    print "segment #: {}, shape: {}".format(i, segment.shape)
    feature_vector = get_features(segment)
    features.append(feature_vector)
    i += 1
all_feature_vector = np.array(features)
n_seg = all_feature_vector.shape[0]
feature_length = all_feature_vector[0].shape[0]
all_feature_vector = np.reshape(all_feature_vector, (all_feature_vector.shape[0], fea))

segment #: 0, shape: (1536,)
segment #: 1, shape: (5632,)
segment #: 2, shape: (5632,)
segment #: 3, shape: (6144,)
segment #: 4, shape: (5120,)
segment #: 5, shape: (6144,)
segment #: 6, shape: (4096,)
segment #: 7, shape: (7680,)
segment #: 8, shape: (4608,)
segment #: 9, shape: (2560,)
segment #: 10, shape: (9728,)
segment #: 11, shape: (5632,)
segment #: 12, shape: (5632,)
segment #: 13, shape: (6144,)
segment #: 14, shape: (4096,)
segment #: 15, shape: (7168,)
segment #: 16, shape: (4608,)
segment #: 17, shape: (6656,)
segment #: 18, shape: (6144,)
segment #: 19, shape: (5120,)
segment #: 20, shape: (6656,)
segment #: 21, shape: (4096,)
segment #: 22, shape: (7168,)
segment #: 23, shape: (4096,)
segment #: 24, shape: (7168,)
segment #: 25, shape: (5120,)
segment #: 26, shape: (6656,)
segment #: 27, shape: (4608,)
segment #: 28, shape: (6144,)
segment #: 29, shape: (4608,)
segment #: 30, shape: (6656,)
segment #: 31, shape: (6144,)
segment #: 32, shape: (5120,)
segment #: 33, shape

segment #: 274, shape: (5632,)
segment #: 275, shape: (3072,)
segment #: 276, shape: (13312,)
segment #: 277, shape: (6144,)
segment #: 278, shape: (6656,)
segment #: 279, shape: (16384,)
segment #: 280, shape: (3072,)
segment #: 281, shape: (5632,)
segment #: 282, shape: (14336,)
segment #: 283, shape: (23552,)
segment #: 284, shape: (22016,)
segment #: 285, shape: (12288,)
segment #: 286, shape: (10752,)
segment #: 287, shape: (12288,)
segment #: 288, shape: (10240,)
segment #: 289, shape: (12800,)
segment #: 290, shape: (10752,)
segment #: 291, shape: (11776,)
segment #: 292, shape: (10752,)
segment #: 293, shape: (12800,)
segment #: 294, shape: (10752,)
segment #: 295, shape: (22528,)
segment #: 296, shape: (12288,)
segment #: 297, shape: (5632,)
segment #: 298, shape: (6144,)
segment #: 299, shape: (5120,)
segment #: 300, shape: (5120,)
segment #: 301, shape: (6656,)
segment #: 302, shape: (5120,)
segment #: 303, shape: (12288,)
segment #: 304, shape: (11264,)
segment #: 305, shap

In [56]:
np.reshape(all_feature_vector, (all_feature_vector.shape[0], all_feature_vector[0].shape[0]))

(472, 29)

In [28]:
feature_tuple= (avg_energy(song[0]), np.array([1,2,3]))
concat = [feat if type(feat) is np.ndarray else np.array([feat]) for feat in feature_tuple]
print concat
np.concatenate(concat)

[array([1.345684e-06], dtype=float32), array([1, 2, 3])]


array([1.34568404e-06, 1.00000000e+00, 2.00000000e+00, 3.00000000e+00])

()

In [79]:
zero_crossing_rate(song[50])

(10,)


0.04490017361111111

In [81]:
zero_crossing_rate(song[50])

(10,)


0.052490234375

In [121]:
avg_spectral_centroid(song[50][:2048])

(5,)
[1748.16032803 1760.62026969 1701.48853693 1742.9214373  1786.03957456]


1747.84602930271

In [100]:
cent.shape

()

In [101]:
cent

1693.2882637712785

In [96]:
avg_spectral_centroid(song[50]).shape

(10,)

In [116]:
song[50][:2048]

array([-0.07319131, -0.1618235 , -0.13289955, ..., -0.08992862,
       -0.10405801, -0.09187029], dtype=float32)

In [129]:
np.mean(avg_spectral_contrast(song[50]), axis = 1)

(7, 10)


array([18.06604353, 18.43214227, 21.66997491, 16.20400406, 19.27938096,
       21.40340924, 29.17569621])

In [133]:
avg_spectral_contrast(song[50])

(7, 10)


array([20.57025634, 20.47953106, 20.78916248, 18.45730515, 19.12388027,
       21.65153773, 22.16584386])

In [None]:
np.concatenate()