In [1]:
import pandas
import time
import numpy as np
import pdb
from tqdm import tqdm
import librosa

# Audio transformation

In [2]:
# Directory containing the song preview audio clips. `_dataset_to_array`
# builds each clip path by plain string concatenation with the row's
# `preview_file` value, so the trailing slash is required.
song_preview_dir = './dataset/song_preview/'

# importing song data (currently disabled)
# song_df = pandas.read_csv('./dataset/deep_learning/song_data.csv', sep='\t')
# genre_df = pandas.read_csv('./dataset/deep_learning/genre_song_pair.csv', sep='\t')
# mood_df = pandas.read_csv('./dataset/deep_learning/mood_song_pair.csv', sep='\t')
# tempo_df = pandas.read_csv('./dataset/deep_learning/tempo_song_pair.csv', sep='\t')
# song_df.head()

# DL part

In [3]:
from keras import backend as K
from keras.layers import Input, Dense
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D
from keras.layers.convolutional import MaxPooling2D, ZeroPadding2D
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import ELU
from keras.utils.data_utils import get_file
from keras.layers import Input, Dense

import os.path

Using Theano backend.


In [4]:
def buildModelCNN(input_tensor=None, include_top=True):
    '''Build the MusicTaggerCNN architecture with randomly initialised weights.

    The network applies a frequency-axis batch normalisation followed by
    five Convolution2D -> BatchNormalization -> ELU -> MaxPooling2D blocks
    to a mel-spectrogram of shape (1, 96, 1366), i.e.
    (channel, mel bins, time frames). The result is flattened and,
    optionally, passed to a 50-unit sigmoid output layer.

    No pre-trained weights are loaded by this function.

    Side effect: calls `K.set_image_dim_ordering('th')`, which switches
    Keras to channels-first ordering for the whole process, because the
    axis indices used below are only valid for that ordering.

    For preparing mel-spectrogram input, see `_compute_melgram` in this
    notebook. You will need to install
    [Librosa](http://librosa.github.io/librosa/) to use it.

    # Arguments
        input_tensor: optional tensor to use as the model input. A Keras
            tensor (i.e. output of `layers.Input()`) is used as-is; any
            other backend tensor is wrapped in an `Input` layer. When
            None, a fresh `Input` of shape (1, 96, 1366) is created.
        include_top: whether to include the 1 fully-connected
            layer (output layer) at the top of the network.
            If False, the network outputs the flattened result of the
            last pooling layer (64 values for the fixed input shape:
            the five poolings reduce 96 x 1366 to 1 x 1 over 64 channels).

    # Returns
        A Keras model instance (not compiled).
    '''
    # Every axis index below assumes channels-first data, so force that
    # ordering regardless of what ~/.keras/keras.json specifies.
    K.set_image_dim_ordering('th')

    # (channel, mel bins, time frames)
    input_shape = (1, 96, 1366)

    if input_tensor is None:
        melgram_input = Input(shape=input_shape)
    elif not K.is_keras_tensor(input_tensor):
        # Raw backend tensor: wrap it so that it carries Keras metadata.
        melgram_input = Input(tensor=input_tensor, shape=input_shape)
    else:
        melgram_input = input_tensor

    # Axis indices for channels-first ordering.
    channel_axis = 1
    freq_axis = 2

    # Input block: normalise along the mel-bin axis.
    x = BatchNormalization(axis=freq_axis, name='bn_0_freq')(melgram_input)

    # (number of filters, pooling window) for conv blocks 1..5. Layer names
    # (conv1/bn1/pool1, ...) are kept stable so that weights saved with
    # `save_weights` can still be matched by name.
    conv_blocks = [
        (64, (2, 4)),
        (128, (2, 4)),
        (128, (2, 4)),
        (128, (3, 5)),
        (64, (4, 4)),
    ]
    for block_no, (n_filters, pool_size) in enumerate(conv_blocks, 1):
        x = Convolution2D(n_filters, 3, 3, border_mode='same',
                          name='conv%d' % block_no)(x)
        x = BatchNormalization(axis=channel_axis, mode=0,
                               name='bn%d' % block_no)(x)
        x = ELU()(x)
        x = MaxPooling2D(pool_size=pool_size, name='pool%d' % block_no)(x)

    # Output
    x = Flatten()(x)
    if include_top:
        # One independent sigmoid unit per label.
        x = Dense(50, activation='sigmoid', name='output')(x)

    # Create model
    return Model(melgram_input, x)


In [5]:
# for converting audio file to melgram data
def _compute_melgram(audio_path):
    '''Load an audio file and return its log-scaled mel-spectrogram.

    The signal is loaded at 12 kHz and forced to exactly 29.12 seconds:
    shorter signals are zero-padded at the end, longer ones are cropped
    around their centre. The result is returned in a shape of
    (1, 1, 96, 1366), where 96 == #mel-bins and 1366 == #time frames.

    parameters
    ----------
    audio_path: path for the audio file.
                Any format supported by audioread will work.
    More info: http://librosa.github.io/librosa/generated/librosa.core.load.html#librosa.core.load

    '''

    # mel-spectrogram parameters
    SR = 12000
    N_FFT = 512
    N_MELS = 96
    HOP_LEN = 256
    DURA = 29.12  # seconds; chosen so that the spectrogram has 1366 frames

    signal, _sr = librosa.load(audio_path, sr=SR)  # whole signal
    n_have = signal.shape[0]
    n_want = int(DURA*SR)

    if n_have < n_want:
        # Too short: append silence up to the target length.
        silence = np.zeros((n_want - n_have,))
        signal = np.hstack((signal, silence))
    elif n_have > n_want:
        # Too long: keep the central n_want samples.
        start = int((n_have-n_want)/2)
        stop = int((n_have+n_want)/2)
        signal = signal[start:stop]

    mel = librosa.feature.melspectrogram(y=signal, sr=SR, hop_length=HOP_LEN,
                                         n_fft=N_FFT, n_mels=N_MELS)
    # NOTE(review): the mel spectrogram is squared again before the dB
    # conversion; kept as-is so feature values stay unchanged - confirm
    # this is intended.
    log_mel = librosa.power_to_db(mel**2, ref=1.0)

    # Prepend the sample and channel axes.
    return log_mel[np.newaxis, np.newaxis, :]

In [6]:
def _dataset_to_array():
    '''Turn up to 1000 randomly chosen songs into feature and label arrays.

    Reads the tab-separated label map, shuffles its rows, keeps at most
    the first 1000, computes the mel-spectrogram of every row's
    `preview_file` (looked up under `song_preview_dir`) and saves both
    arrays to 'song_features-1000.npy' and 'song_labels-1000.npy' in the
    current working directory.

    The row selection uses NumPy's global random state, so it is only
    reproducible if that state was seeded by the caller.

    # Returns
        features: float64 array of shape (n_songs, 1, 96, 1366).
        label_matrix: 2-D array with one row per song, in the same order
            as `features`, holding every CSV column except `track_id`,
            `song_id`, `title` and `preview_file`.
    '''
    df = pandas.read_csv('./dataset/deep_learning/label_map(boolean_mood).csv', sep='\t')
    df = df.reindex(np.random.permutation(df.index)).head(1000)

    # `drop` already returns a new frame, so no defensive copy is needed;
    # `.values` replaces the deprecated `as_matrix()`.
    label_matrix = df.drop(['track_id', 'song_id', 'title', 'preview_file'], axis=1).values

    # Allocate the output once and fill it in place. Concatenating onto a
    # growing array copied every previously computed spectrogram again on
    # each iteration.
    n_songs = len(df)
    features = np.zeros((n_songs, 1, 96, 1366))

    # `iterrows()` is a generator, so tqdm needs `total` to show progress.
    rows = tqdm(df.iterrows(), "converting song audio file to features", total=n_songs)
    for position, (_, row) in enumerate(rows):
        melgram = _compute_melgram(song_preview_dir+row['preview_file'])
        features[position] = melgram[0]

    np.save('song_features-1000.npy', features)
    np.save('song_labels-1000.npy', label_matrix)
    return features, label_matrix

In [7]:
# Instantiate the CNN with its default arguments, i.e. including the
# 50-unit sigmoid output layer. The model still has to be compiled.
model = buildModelCNN()



### Train

In [8]:
import keras
from sklearn.model_selection import train_test_split
split_ratio = 0.8
random_state = 7

# Cached train/test split, in the order (X_train, Y_train, X_test, Y_test).
split_files = ('1000x_train.npy', '1000y_train.npy', '1000x_test.npy', '1000y_test.npy')

if all(os.path.isfile(name) for name in split_files):
    # A complete split is already on disk: reuse it as-is.
    X_train, Y_train, X_test, Y_test = [np.load(name) for name in split_files]
else:
    # Fall back to the cached full dataset, or rebuild it from the audio
    # files when that cache is missing as well.
    feature_path = 'song_features-1000.npy'
    label_path = 'song_labels-1000.npy'
    have_dataset = os.path.isfile(feature_path) and os.path.isfile(label_path)
    if have_dataset:
        X = np.load(feature_path)
        Y = np.load(label_path)
    else:
        X, Y = _dataset_to_array()

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y,
        test_size=(1 - split_ratio),
        random_state=random_state,
        shuffle=True)

    # Persist the split so later runs train and validate on the same rows.
    for name, array in zip(split_files, (X_train, Y_train, X_test, Y_test)):
        np.save(name, array)

In [9]:
# Sanity check: training features should be (n_samples, 1, 96, 1366).
print(X_train.shape)

(800, 1, 96, 1366)


In [10]:
# melgrams = np.zeros((0, 1, 96, 1366))
# a = _compute_melgram('/home/capt4ce/projects/major_project/dataset/song_preview/TRAAAAW128F429D538-mzm.jmksdiul.aac.p.m4a')
# print a.shape
# melgrams = np.concatenate((melgrams, a), axis=0)
# a = _compute_melgram('/home/capt4ce/projects/major_project/dataset/song_preview/TRAAABD128F429CF47-mzm.maejowgk.aac.p.m4a')
# print a.shape
# melgrams = np.concatenate((melgrams, a), axis=0)
# melgrams.shape
# print(Y_train)

In [11]:
model_path = 'DLModel.h5'
channel = 1
epochs = 20#50
batch_size = 10
verbose = 1

# The network ends in a sigmoid layer, i.e. one independent probability per
# label, so the matching loss is binary cross-entropy. Categorical
# cross-entropy assumes exactly one active class per sample. With this loss
# the 'accuracy' metric is also evaluated per label instead of comparing
# argmax positions.
model.compile(loss=keras.losses.binary_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])
model.fit(X_train, Y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=verbose,
          validation_data=(X_test, Y_test))

# Only the weights are stored: rebuild the architecture with
# buildModelCNN() before calling load_weights(model_path).
# model.save(model_path)
model.save_weights(model_path)

Train on 800 samples, validate on 200 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
