In [5]:
# Case Study: Classify songs into different genres
# Objective: Model a classifier to classify songs into different genres.
# Dataset: GTZAN dataset
# Preprocessing the data: Before training the classification model, we have to transform
#                           raw data from audio samples into more meaningful representations.
#                           The audio clips need to be converted from .au format to .wav format
#                           to make it compatible with python's wave module for reading audio
#                           files. If needed to convert, use sox input.au output.wav. 
#                           link: https://www.stefaanlippens.net/audio_conversion_cheat_sheet/
# Classification:
#   Feature Extraction: We need to extract meaningful features from audio files. To classify 
#                       audio clips, we will choose five features, i.e. Mel-Frequency Cepstral
#                       Coefficients, Spectral Centroid, Zero Crossing Rate, Chroma Frequencies,
#                       Spectral Roll-off. All of the features are then appended into a .csv file
#                       so that classification algorithms can be used.
#   Classification: Once the features have been extracted, we can use existing classification
#                   algorithms to classify the songs into different genres. You can either use the
#                   spectrogram images directly for classification or you can extract the features
#                   and use the classification models on them. Using a CNN model (on the spectrogram
#                   images) gives a better accuracy.
# Source: https://gist.github.com/parulnith/7f8c174e6ac099e86f0495d3d9a4c01e

# Importing libraries
# Feature extraction and data preprocessing
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from PIL import Image
import pathlib
import csv

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

#Keras
import keras

import warnings
warnings.filterwarnings('ignore')

In [8]:
# Extracting music and features

# Extracting the Spectrogram for every audio. All the files get converted into their respective
# spectrograms. We can easily extract features from them.
# Render one spectrogram image per clip into img_data/<genre>/.
cmap = plt.get_cmap('inferno')

plt.figure(figsize=(10,10))
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
for genre in genres:
    # Make sure the per-genre output folder exists before any image is saved.
    image_dir = pathlib.Path(f'img_data/{genre}')
    image_dir.mkdir(parents=True, exist_ok=True)
    for clip_name in os.listdir(f'./genres/{genre}'):
        # Only the first 5 seconds of each clip are loaded, mixed down to mono.
        signal, sample_rate = librosa.load(f'./genres/{genre}/{clip_name}', mono=True, duration=5)
        plt.specgram(signal, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap,
                     sides='default', mode='default', scale='dB')
        plt.axis('off')
        # Drop the last three characters (the extension) and every remaining dot
        # from the file name to form the image name.
        image_stem = clip_name[:-3].replace(".", "")
        plt.savefig(f'img_data/{genre}/{image_stem}.png')
        # Clear the shared figure so the next clip starts from an empty canvas.
        plt.clf()

<Figure size 720x720 with 0 Axes>

In [10]:
# Extracting features from the audio clips
# We will extract (one CSV column per value):
#   Mel-frequency cepstral coefficients (MFCC) (20 in number)
#   Spectral Centroid
#   Zero Crossing Rate
#   Chroma Frequencies
#   Spectral Roll-off
#   Root-Mean-Square (RMS) energy
#   Spectral Bandwidth

# Column names of the feature table: the file name, six summary features,
# twenty MFCC columns (mfcc1 .. mfcc20) and finally the genre label.
summary_columns = ['filename', 'chroma_stft', 'rms', 'spectral_centroid',
                   'spectral_bandwidth', 'rolloff', 'zero_crossing_rate']
mfcc_columns = [f'mfcc{idx}' for idx in range(1, 21)]
header = summary_columns + mfcc_columns + ['label']
print(header)

['filename', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff', 'zero_crossing_rate', 'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4', 'mfcc5', 'mfcc6', 'mfcc7', 'mfcc8', 'mfcc9', 'mfcc10', 'mfcc11', 'mfcc12', 'mfcc13', 'mfcc14', 'mfcc15', 'mfcc16', 'mfcc17', 'mfcc18', 'mfcc19', 'mfcc20', 'label']


In [12]:
# Writing data to a csv file

# Compute one row of summary features per clip and write all rows to data.csv.
#
# The file is opened exactly once (instead of being reopened in append mode for
# every clip) and each row is handed to the csv writer as a list. Building the
# row as a list rather than a space-joined string that is split again keeps a
# file name containing whitespace in a single column.
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
with open('data.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)
    for g in genres:
        for filename in os.listdir(f'./genres/{g}'):
            songname = f'./genres/{g}/{filename}'
            # Only the first 30 seconds of each clip are analysed, mixed down to mono.
            y, sr = librosa.load(songname, mono=True, duration=30)
            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
            rms = librosa.feature.rms(y=y)
            spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
            spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
            rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            zcr = librosa.feature.zero_crossing_rate(y)
            mfcc = librosa.feature.mfcc(y=y, sr=sr)
            # Each feature array is reduced to its mean; the values are formatted
            # exactly as before so the numbers written to the CSV do not change.
            row = [filename]
            row += [f'{np.mean(feature)}'
                    for feature in (chroma_stft, rms, spec_cent, spec_bw, rolloff, zcr)]
            # The MFCC matrix contributes one mean per row (mfcc1 .. mfccN columns).
            row += [f'{np.mean(coefficient)}' for coefficient in mfcc]
            row.append(g)
            writer.writerow(row)

In [21]:
# Analysing the data in Pandas:
# load the feature table written to data.csv and preview its first five rows.
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,filename,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,blues.00093.wav,0.37769,0.065906,569.930721,995.407125,927.427725,0.021701,-350.436188,169.545746,31.82037,...,1.82169,-5.970891,-5.259567,-0.229211,-1.77685,-3.713751,0.181591,2.07239,-2.896225,blues
1,blues.00087.wav,0.336773,0.158098,1442.190271,1870.534155,3083.414688,0.050889,-155.504929,125.638863,1.596553,...,-0.792893,-7.748057,0.413548,-7.030262,3.997679,-6.256611,0.958227,2.019821,-5.742188,blues
2,blues.00050.wav,0.40086,0.18238,1945.848425,2082.246626,4175.874749,0.085806,-82.979019,107.052124,-25.320452,...,12.539581,-9.762303,2.562253,-6.300853,2.996785,-8.718454,-0.326581,-2.980347,0.712601,blues
3,blues.00044.wav,0.390212,0.136276,2279.124558,2375.10212,5198.360233,0.09257,-109.509285,86.922409,-8.607986,...,11.087481,-5.085794,3.97636,-12.859742,12.343859,0.026216,-0.741568,-5.12662,3.303442,blues
4,blues.00078.wav,0.414188,0.258052,2333.685108,2227.425609,4942.811778,0.123863,-2.524338,101.252716,-33.924385,...,12.506608,-13.368823,6.112817,-9.06589,5.033774,-11.330277,3.166534,-4.567591,-4.033622,blues


In [22]:
# (rows, columns) of the loaded feature table
data.shape

(1000, 28)

In [23]:
# The file name only identifies the clip and is not a model input, so remove it.
data = data.drop(columns=['filename'])

In [25]:
# Re-inspect the table after the filename column was dropped.
# Only the last expression of a cell is rendered, so the bare data.shape
# below is evaluated but produces no visible output; data.head() is shown.
data.shape
data.head()

Unnamed: 0,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,0.37769,0.065906,569.930721,995.407125,927.427725,0.021701,-350.436188,169.545746,31.82037,16.682835,...,1.82169,-5.970891,-5.259567,-0.229211,-1.77685,-3.713751,0.181591,2.07239,-2.896225,blues
1,0.336773,0.158098,1442.190271,1870.534155,3083.414688,0.050889,-155.504929,125.638863,1.596553,45.80452,...,-0.792893,-7.748057,0.413548,-7.030262,3.997679,-6.256611,0.958227,2.019821,-5.742188,blues
2,0.40086,0.18238,1945.848425,2082.246626,4175.874749,0.085806,-82.979019,107.052124,-25.320452,57.124989,...,12.539581,-9.762303,2.562253,-6.300853,2.996785,-8.718454,-0.326581,-2.980347,0.712601,blues
3,0.390212,0.136276,2279.124558,2375.10212,5198.360233,0.09257,-109.509285,86.922409,-8.607986,64.49456,...,11.087481,-5.085794,3.97636,-12.859742,12.343859,0.026216,-0.741568,-5.12662,3.303442,blues
4,0.414188,0.258052,2333.685108,2227.425609,4942.811778,0.123863,-2.524338,101.252716,-33.924385,41.516888,...,12.506608,-13.368823,6.112817,-9.06589,5.033774,-11.330277,3.166534,-4.567591,-4.033622,blues


In [27]:
# Encoding the Labels

# The genre name sits in the last column; map each distinct name to an
# integer class id that the network can be trained on.
encoder = LabelEncoder()
genre_list = data.iloc[:, -1]
y = encoder.fit_transform(genre_list)

In [29]:
# Scaling the Feature Columns

# Every column except the last one (the label) is a numeric feature.
feature_matrix = np.array(data.iloc[:, :-1], dtype=float)

# Standardize each feature column.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so statistics of the test rows influence the scaling of the training rows.
scaler = StandardScaler()
X = scaler.fit_transform(feature_matrix)

In [30]:
# Dividing data into training and Testing set

# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split is identical after a kernel
# restart; stratify=y keeps the genre proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [33]:
# Number of training labels (the part left after test_size=0.2 is held out)
len(y_train)

800

In [32]:
# Number of held-out test labels (test_size=0.2 of all rows)
len(y_test)

200

In [34]:
# Look at one standardized training row (one value per feature column)
X_train[10]

array([ 0.14450761, -0.5242746 ,  1.03846878,  1.11969883,  1.12002066,
        0.27079125,  0.13040426, -1.01642597,  1.92883368, -0.10208166,
       -0.17273638,  0.61266362, -0.0686508 ,  0.0114033 , -0.00765423,
       -0.68509775,  0.2422508 , -1.16213042, -0.24049213, -0.40162617,
        0.38951325, -0.6641459 ,  0.67908619,  0.11503152,  0.21058462,
       -0.01974234])

In [35]:
# Classification with Keras
# Relu - Applies the rectified linear unit activation function.
# With default values, this returns the standard ReLU activation: max(x, 0), the element-wise maximum of 0 and the input tensor.
# https://keras.io/api/layers/activations/
# Softmax - The softmax function, also known as softargmax or normalized exponential function, is a generalization of the logistic
# function to multiple dimensions. It is used in multinomial logistic regression and is often used as the last activation function of a neural
# network to normalize the output of a network to a probability distribution over predicted output classes, based on Luce's choice axiom.
# https://en.wikipedia.org/wiki/Softmax_function

# Building our Network
from keras import models
from keras import layers

# Fully connected classifier: three ReLU hidden layers of decreasing width
# followed by a 10-way softmax output. The input width is taken from the
# number of feature columns in X_train.
model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax'),
])

In [36]:
# Configure the model for training with a loss and metrics.
# optimizer: Adam - https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam
# loss: sparse categorical cross-entropy between the labels and the predictions.
#   Use it when there are two or more label classes and the labels are
#   provided as integers.
# metrics: accuracy, i.e. how often predictions equal labels.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [37]:
# Train the model
# X_train    = input data, NumPy array
# y_train    = target data, NumPy array consistent with X_train
# epochs     = integer; an epoch is one pass over the entire training data.
#              Training runs until the epoch with index `epochs` is reached
#              (20 here), not for `epochs` additional iterations.
# batch_size = integer or None; number of samples per gradient update (128 here).
# The return value is kept in `history`.
history = model.fit(X_train, y_train, epochs=20, batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [38]:
# Evaluate the trained model on the held-out test split.
# evaluate returns the loss value and the metric values for the model in test
# mode; computation is done in batches.
# X_test     = input data
# y_test     = target data
# batch_size = number of samples per batch of computation; not passed here,
#              so it defaults to 32.
test_loss, test_acc = model.evaluate(X_test, y_test)

# Test accuracy is 0.6400, which hints at overfitting (compare it with the training accuracy reported during fit)



In [39]:
# Validating our approach

# Hold out the first 200 training rows as a validation set; the remaining
# rows form the reduced training set used for fitting.
n_val = 200
x_val, partial_x_train = X_train[:n_val], X_train[n_val:]
y_val, partial_y_train = y_train[:n_val], y_train[n_val:]

In [40]:
# Rebuild the network with an extra 512-unit layer and train it for 30 epochs
# on the reduced training set, checking the held-out validation rows after
# every epoch.
model = models.Sequential([
    layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax'),
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.fit(
    x=partial_x_train,
    y=partial_y_train,
    validation_data=(x_val, y_val),
    batch_size=512,
    epochs=30,
)

# Score the freshly trained network on the test split: loss, then accuracy.
results = model.evaluate(X_test, y_test)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [41]:
# Test loss followed by test accuracy, as returned by model.evaluate above
results

[1.1720991134643555, 0.6000000238418579]

In [None]:
# Predictions on Test Data
# predict generates output predictions for the input samples; computation is
# done in batches.
# X_test     - input samples, NumPy array
# batch_size - number of samples per batch; not passed here, so it defaults to 32.
# NOTE(review): the last layer is a 10-unit softmax, so each prediction row
# presumably holds ten class probabilities - apply np.argmax(..., axis=1) to
# obtain class ids. TODO confirm against the actual output shape.
predictions = model.predict(X_test)