In [38]:
import time
import os
import ast

import IPython.display as ipd
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd

import keras
from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape, BatchNormalization, Dropout

from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler, LabelBinarizer
from sklearn.linear_model import LogisticRegression

from sklearn import linear_model


import matplotlib.pyplot as plt
import itertools

import utils
from utils import plot_confusion_matrix

### Constants

In [39]:
AUDIO_DIR = "..\\fma_small"
META_DIR = "..\\fma_metadata"
SUBSET = 'small'

### Load data

In [40]:
# Load metadata to memory
def load_meta_data(): 
    tracks_all   = utils.load(META_DIR + '\\tracks.csv')
    features_all = utils.load(META_DIR + '\\features.csv')
    echonest_all = utils.load(META_DIR + '\\echonest.csv')

    #genres = utils.load(META_DIR + 'genres.csv')

    np.testing.assert_array_equal(features_all.index, tracks_all.index)
    assert echonest_all.index.isin(tracks_all.index).all()
    
    
    return tracks_all, features_all, echonest_all

In [41]:
# Choose Subset
def choose_small_subset(tracks_all, features_all, echonest_all):
    subset = tracks_all.index[tracks_all['set', 'subset'] <= 'small']

    assert subset.isin(tracks_all.index).all()
    assert subset.isin(features_all.index).all()
    
    tracks = tracks_all.loc[subset]
    features = features_all.loc[subset]

    return tracks, features
    

In [42]:
tracks_all, features_all, echonest_all = load_meta_data()
tracks, features =  choose_small_subset(tracks_all, features_all, echonest_all)

  'category', categories=SUBSETS, ordered=True)


In [43]:
tracks.shape, features.shape

((8000, 52), (8000, 518))

### Split Train Val Test

In [44]:
# Splitting into Train, Validation, Test
train_index = tracks.index[tracks['set', 'split'] == 'training']
val_index   = tracks.index[tracks['set', 'split'] == 'validation']
test_index  = tracks.index[tracks['set', 'split'] == 'test']


print('{} training examples'.format(len(train_index)))
print('{} validation examples'.format(len(val_index)))
print('{} testing examples'.format(len(test_index)))

6400 training examples
800 validation examples
800 testing examples


In [45]:
X = features.values
Y = tracks['track']['genre_top']

In [46]:
Xtrain = features.loc[train_index].values
Xval  = features.loc[val_index].values
Xtest  = features.loc[test_index].values

Ytrain = tracks.loc[train_index]['track']['genre_top'].values
Yval = tracks.loc[val_index]['track']['genre_top'].values
Ytest = tracks.loc[test_index]['track']['genre_top'].values

In [47]:
classes = list(set(Y))

### Logistic Regression Baseline (Works Well)

In [48]:
Xtrain = features.loc[train_index].values
Xval  = features.loc[val_index].values
Xtest  = features.loc[test_index].values

Ytrain = tracks.loc[train_index]['track']['genre_top']
Yval = tracks.loc[val_index]['track']['genre_top']
Ytest = tracks.loc[test_index]['track']['genre_top']

In [49]:
logreg = linear_model.LogisticRegression(verbose=2, max_iter=40)

In [50]:
logreg.fit(Xtrain, Ytrain)

[LibLinear]

KeyboardInterrupt: 

In [None]:
YtrainHat = logreg.predict(Xtrain)
np.mean(Ytrain == YtrainHat)

In [None]:
YtestHat = logreg.predict(Xtest)
np.mean(Ytest == YtestHat)

In [None]:
YvalHat = logreg.predict(Xval)
np.mean(Yval == YvalHat)

In [None]:
cnf_matrix = confusion_matrix(Ytest, YtestHat)
plot_confusion_matrix(cnf_matrix, classes)
plt.show()

### Deep Learning Model No Audio (broken model predicts all same genre)

In [None]:
features = 518
genres = 8

In [None]:
labelBinarizer = LabelBinarizer()
ohTrain = labelBinarizer.fit_transform(X=Ytrain)
ohVal  = labelBinarizer.fit_transform(X=Yval)

In [60]:
def init_env_and_tfboard(model_name = "Model"):
    from datetime import datetime
    now = datetime.now()

    keras.backend.clear_session()
    TFboard = keras.callbacks.TensorBoard(log_dir='./logs/' + model_name + '_' + now.strftime("%Y%m%d-%H%M%S")  +'/', histogram_freq = 0)
    return TFboard

In [41]:
from keras.models import Sequential

TFBoard = init_env_and_tfboard()

model = Sequential([
    Dense(256, input_shape=(features,), 
          kernel_initializer='random_uniform', bias_initializer='zeros'),
    BatchNormalization(),
    Activation('relu'),
    Dense(128),
    Dropout(0.8),
    Activation('relu'),
    Dense(genres),
    Activation('softmax'),
])    
    
optimizer = keras.optimizers.Adam(lr=0.01)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
#model.compile(optimizer, loss='mean_squared_error', metrics=['accuracy','categorical_accuracy'])

In [44]:
features_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106574 entries, 2 to 155320
Columns: 518 entries, (chroma_cens, kurtosis, 01) to (zcr, std, 01)
dtypes: float64(518)
memory usage: 427.0 MB


In [42]:
params = {
        "callbacks": [TFBoard],
        "validation_data": (Xval, ohVal)
}
history = model.fit(x = Xtrain, y = ohTrain , epochs=100, **params, verbose = 0)

In [50]:
labelBinarizer.inverse_transform(model.predict(Xtrain))

array(['Electronic', 'Electronic', 'Electronic', ..., 'Electronic',
       'Electronic', 'Electronic'],
      dtype='<U13')

In [None]:
from collections import Counter
Counter(Ytrain)

### Deep Learning Model Using Audio

In [49]:
trainIDs = tracks.loc[train_index]["track"].index.values
valIDs  = tracks.loc[val_index]["track"].index.values
testIDs  = tracks.loc[test_index]["track"].index.values

In [50]:
labels_onehot = MultiLabelBinarizer().fit_transform(tracks['track', 'genre_top'])
labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)

In [None]:
# Sanitation Test, Just making sure it works
utils.FfmpegLoader().load(utils.get_audio_path(AUDIO_DIR, 2))
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, utils.FfmpegLoader())
SampleLoader(trainIDs, batch_size=2).__next__()[0].shape

In [66]:
# testing librosa
librosaLoader = utils.LibrosaLoader()
t2 = librosaLoader.load(utils.get_audio_path(AUDIO_DIR, 2))
t5 = librosaLoader.load(utils.get_audio_path(AUDIO_DIR, 5))

In [None]:
params = {
    
}

In [None]:
def basic_fully_connected(loader, labels_onehot):
    model = keras.models.Sequential()
    model.add(Dense(units=1000, input_shape=loader.shape))
    model.add(Activation("relu"))
    model.add(Dense(units=100))
    model.add(Activation("relu"))
    model.add(Dense(units=labels_onehot.shape[1]))
    model.add(Activation("softmax"))
    
    optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True)
    model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model


In [68]:
loader = utils.LibrosaLoader()
librosaBatchLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
print('Dimensionality: {}'.format(loader.shape))

Dimensionality: (1321967,)


In [None]:
keras.backend.clear_session()

model = basic_fully_connected(loader, labels_onehot)
model.fit_generator(SampleLoader(train, batch_size=64), train.size/100, epochs=2, **params)

In [None]:
loss = model.evaluate_generator(SampleLoader(val, batch_size=64), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=64), test.size, **params)

In [79]:
import _pickle as pickle
import pandas as pd

df = pd.read_pickle("../test_samples/2test.p")


In [70]:
with open("../test_samples/2test.p",'rb') as infile:
    df = pickle.load(infile)

In [85]:
df["raw_songs"].values[0].shape

(330780,)

### CNN using the 128x128 patches


In [3]:
SPEC_DIR = "..\\spectrogram\\"
batch_size = 32

In [34]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense

def cnn_model(input_shape=(105,105,3), output = 8):
    
    model = Sequential()
    model.add(Conv2D(32, (3, 3), input_shape=input_shape))
    model.add(Activation('relu'))    
    model.add(MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Flatten())
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(output))
    model.add(Activation('sigmoid'))
    
    model.compile(loss='categorical_crossentropy',  
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    
    return model
    

In [19]:

from keras.preprocessing.image import ImageDataGenerator

# this is the augmentation configuration we will use for training
train_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
        SPEC_DIR + 'train',  
        target_size=(105, 105),
        batch_size=batch_size)

val_generator = test_datagen.flow_from_directory(
        SPEC_DIR + 'val',
        target_size=(105, 105),
        batch_size=batch_size)


Found 63970 images belonging to 8 classes.
Found 8000 images belonging to 8 classes.


In [35]:
model = cnn_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_14 (Conv2D)           (None, 103, 103, 32)      896       
_________________________________________________________________
activation_12 (Activation)   (None, 103, 103, 32)      0         
Total params: 896
Trainable params: 896
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_14 (Conv2D)           (None, 103, 103, 32)      896       
_________________________________________________________________
activation_12 (Activation)   (None, 103, 103, 32)      0         
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 51, 51, 32)        0         
_________________________________________________________________
conv2d_15 (C

In [66]:
TFBoard = init_env_and_tfboard("cnn")
model = cnn_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 103, 103, 32)      896       
_________________________________________________________________
activation_1 (Activation)    (None, 103, 103, 32)      0         
Total params: 896
Trainable params: 896
Non-trainable params: 0
_________________________________________________________________


In [68]:
model.load_weights('cnn_try_1.h5')

In [69]:
params = {
        "callbacks": [TFBoard],
        "verbose": 2
}

model.fit_generator(train_generator, 
                    steps_per_epoch = 10000 / batch_size, 
                    epochs = 50,
                    validation_data=val_generator,
                    validation_steps=800 // batch_size,
                    **params)
             

Epoch 1/50
 - 106s - loss: 1.5624 - acc: 0.4530 - val_loss: 1.4716 - val_acc: 0.4800
Epoch 2/50
 - 104s - loss: 1.5468 - acc: 0.4612 - val_loss: 1.5132 - val_acc: 0.4750
Epoch 3/50
 - 104s - loss: 1.5450 - acc: 0.4639 - val_loss: 1.4463 - val_acc: 0.5112
Epoch 4/50
 - 110s - loss: 1.5277 - acc: 0.4661 - val_loss: 1.5467 - val_acc: 0.5012
Epoch 5/50
 - 106s - loss: 1.5157 - acc: 0.4729 - val_loss: 1.4235 - val_acc: 0.5312
Epoch 6/50
 - 105s - loss: 1.5292 - acc: 0.4636 - val_loss: 1.5892 - val_acc: 0.4537
Epoch 7/50
 - 105s - loss: 1.4913 - acc: 0.4816 - val_loss: 1.5378 - val_acc: 0.4625
Epoch 8/50
 - 105s - loss: 1.5133 - acc: 0.4792 - val_loss: 1.4347 - val_acc: 0.4988
Epoch 9/50
 - 107s - loss: 1.4908 - acc: 0.4860 - val_loss: 1.4504 - val_acc: 0.4975
Epoch 10/50
 - 108s - loss: 1.4941 - acc: 0.4883 - val_loss: 1.4687 - val_acc: 0.5162
Epoch 11/50
 - 108s - loss: 1.4931 - acc: 0.4893 - val_loss: 1.5494 - val_acc: 0.4850
Epoch 12/50
 - 110s - loss: 1.4777 - acc: 0.4862 - val_loss: 1.

<keras.callbacks.History at 0x1f515d7d208>

In [65]:
model.save_weights('cnn_try_2.h5')