In [1]:
# Mounting google drive
from google.colab import drive

drive.mount('/content/gdrive')



Mounted at /content/gdrive


In [2]:
# Imports

import numpy as np
import librosa
from sklearn.model_selection import train_test_split
import os
import math
import json
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

## **Extracting and storing MFCCs from each audio signal**

In [3]:
# Defining constants

dataset = '/content/gdrive/MyDrive/Data/genres_original'
sample_rate = 22050
duration = 30 #secs
samples_per_track = sample_rate * duration




In [None]:
# Funtion to extract mfccs from each segment of each audio file and store them in a json file

def mfcc_extract(dataset, num_mfcc = 13, n_fft = 2048, hop_length = 512, num_segments = 5):

  data = { "classes":[], "mfccs":[],"labels":[] }

  samples_per_segment = int(samples_per_track/num_segments)
  num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

  for i,(root,dirs,files) in enumerate(os.walk(dataset)):

    if root is not dataset:
      
      # Extracting class name from directory path and storing it in the dictionary
      label = root.split('/')[-1]
      data['classes'].append(label)

      # Extracting MFCCs from each audio file segment
      for f in files:

        file_path = os.path.join(root,f)
        signal,sr = librosa.load(file_path,sr = sample_rate)

        for s in range(num_segments):

          # Find starting and ending points of each segment in the audio file
          start = samples_per_segment * s
          end = start + samples_per_segment

          # Extract mfcc for current segment 
          mfcc = librosa.feature.mfcc(signal[start:end], sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length).T

          # Ensuring that shape of mfccs is fixed to avoid problems when feeding the data into the network
          if len(mfcc) == num_mfcc_vectors_per_segment:
              data["mfccs"].append(mfcc.tolist())
              data["labels"].append(i-1)
  
  with open('/content/gdrive/MyDrive/Data/mfcc.json','w') as f:
    json.dump(data, f)



mfcc_extract(dataset)

### **Artificial Neural Network for Music genre classification**

In [4]:
# Loading the data from the mfcc.json file

with open('/content/gdrive/MyDrive/Data/mfcc.json','r') as f:
  data = json.load(f)

# Removing unwanted classes 
data['classes'].remove('.ipynb_checkpoints')
print(data['classes'])
np.unique(data['labels'])


['rock', 'blues', 'reggae', 'disco', 'pop', 'hiphop', 'metal', 'country', 'classical', 'jazz']


array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
# Splitting data into train and test sets

X = np.array(data['mfccs'])
y = np.array(data['labels'])

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

#Printing shapes for sanity check
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)





(3495, 259, 13)
(1498, 259, 13)
(3495,)
(1498,)


In [None]:
# Building the model

model = keras.Sequential([
        
        keras.layers.Flatten(input_shape=(X.shape[1], X.shape[2])),
        keras.layers.Dense(512, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
        keras.layers.Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
        keras.layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
        keras.layers.Dense(10, activation='softmax')
])

model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_9 (Flatten)          (None, 3367)              0         
_________________________________________________________________
dense_39 (Dense)             (None, 512)               1724416   
_________________________________________________________________
dense_40 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_41 (Dense)             (None, 64)                16448     
_________________________________________________________________
dense_42 (Dense)             (None, 10)                650       
Total params: 1,872,842
Trainable params: 1,872,842
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training the model

model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7efd44a1c490>

## **CNN for Music genre classification**

In [None]:
# Obtaining the inputs and outputs

X = np.array(data['mfccs'])
y = np.array(data['labels'])

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3)

# Adding another dimension to the inputs as CNNs accept inputs of shape [w,h,num_channels]
X_train = X_train[... , np.newaxis]
X_test = X_test[... , np.newaxis]

# Sanity check
print(X_test.shape)
print(X_train.shape)
print(y_test.shape)
print(y_train.shape)


(1498, 259, 13, 1)
(3495, 259, 13, 1)
(1498,)
(3495,)


In [None]:
# Building the CNN model

cnn_model = keras.Sequential([
                              
            keras.layers.Conv2D(32,[3,3],activation='relu',input_shape = [X_train.shape[1],X_train.shape[2],X_train.shape[3]]),
            keras.layers.MaxPooling2D([3,3],strides=(2,2),padding='same'),
            keras.layers.BatchNormalization(),

            keras.layers.Conv2D(64,[2,2],activation='relu'),
            keras.layers.MaxPooling2D([3,3],strides=[2,2],padding='same'),
            keras.layers.BatchNormalization(),

            keras.layers.Conv2D(64,[2,2],activation='relu'),
            keras.layers.MaxPooling2D([2,2],strides=[2,2],padding='same'),
            keras.layers.BatchNormalization(),

            keras.layers.Flatten(),
            

            keras.layers.Dense(64,activation='relu',kernel_regularizer= regularizers.l2(1e-4)),
            keras.layers.Dropout(0.4),

            keras.layers.Dense(10,activation='softmax')

])
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
cnn_model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

cnn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 257, 11, 32)       320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 129, 6, 32)        0         
_________________________________________________________________
batch_normalization (BatchNo (None, 129, 6, 32)        128       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 128, 5, 64)        8256      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 64, 3, 64)         0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 64, 3, 64)         256       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 63, 2, 64)         1

In [None]:
# Training the model 

cnn_model.fit(X_train,y_train,validation_split=0.2,batch_size=32,epochs=15) 

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fbb90438f10>

In [None]:
# Evaluating the CNN model on the test set

test_accuracy = cnn_model.evaluate(X_test,y_test,verbose=2)

47/47 - 1s - loss: 1.3535 - accuracy: 0.6662


# **LSTM for Music genre classification**

In [5]:
# Gathering inputs and splitting data

X = np.array(data['mfccs'])
y = np.array(data['labels'])

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

#Sanity check
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3495, 259, 13)
(1498, 259, 13)
(3495,)
(1498,)


In [6]:
# Building the LSTM model

lstm_model = keras.Sequential([
                      
              keras.layers.LSTM(64,input_shape=[X_train.shape[1],X_train.shape[2]],return_sequences=True),
              
              keras.layers.LSTM(64),

              keras.layers.Dense(64,activation='relu'),
              keras.layers.Dropout(0.3),

              keras.layers.Dense(10,activation='softmax')
])

lstm_model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

lstm_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 259, 64)           19968     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                650       
Total params: 57,802
Trainable params: 57,802
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Training the LSTM model
lstm_model.fit(X_train,y_train,validation_split=0.2,epochs=15,batch_size=32)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f57f7845690>

In [8]:
# Evaluating the LSTM model on the test set
lstm_model.evaluate(X_test,y_test)



[1.2447290420532227, 0.5500667691230774]