# Music Classification via deep neural network
#### **Overview:** This notebook analyzes music audio files labeled by genre and develops a deep neural network that predicts a file's genre from numerical features extracted from its audio.
## Contents
### **Generate Features**
#### **30 second segments**
#### **3 second segments**
### **Modeling the Data** (Run from here if you have the CSV file(s))
#### **30 second segments**
##### Read and process the data
##### Split into training and test data
##### Set up the model and metrics analysis 
##### Train the model
##### Results
#### **3 second segments**
##### Split into training and test data
##### Set up the model
##### Train the model
##### Results

In [None]:
import os
from glob import glob

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Generate Features

## 30 second segments

In [None]:
# Feature-extraction settings.
num_segment = 1
num_mfcc = 20
sample_rate = 22050
n_fft = 2048
hop_length = 512

# Features stored as a "<name>_mean" / "<name>_var" pair, in CSV column order.
_summary_features = [
    "chroma_stft",
    "rms",
    "spectral_centroid",
    "spectral_bandwidth",
    "rolloff",
    "zero_crossing_rate",
    "harmony",
    "perceptr",
]

# Full column layout: filename, summary pairs, tempo, one mean/var pair per
# MFCC coefficient (mfcc1 .. mfcc<num_mfcc>), and finally the label.
_columns = (
    ["filename"]
    + [f"{name}_{stat}" for name in _summary_features for stat in ("mean", "var")]
    + ["tempo"]
    + [f"mfcc{i}_{stat}" for i in range(1, num_mfcc + 1) for stat in ("mean", "var")]
    + ["label"]
)

# Column name -> list of values; one entry is appended per extracted row.
my_csv = {column: [] for column in _columns}
# The 3-second table needs its OWN lists. A shallow ``my_csv.copy()`` would
# share the list objects, so rows appended to one table would show up in the
# other as well.
my_3_csv = {column: [] for column in _columns}

In [None]:
# Locate the dataset. Audio files are expected two levels down
# (<dataset_path>/<genre>/<file>); the top-level entries give the genre names.
dataset_path = "genres"
audio_files = glob(dataset_path + "/*/*")
# Entries whose path contains ".mf" are not genres and are left out.
# os.path.basename is used so the name is extracted correctly whatever path
# separator glob returns.
genre = [
    os.path.basename(entry)
    for entry in glob(dataset_path + "/*")
    if ".mf" not in entry
]
# Count only what survived the filter, so the number matches the list.
n_genres = len(genre)
print(genre)

In [None]:
# Build one feature row per audio file (the whole clip) and collect it in
# ``my_csv``.
chroma_hop_length = 512  # hop size used for the chromagram only
genre = ""
for f in sorted(audio_files):
    # The parent directory names the genre; the file name identifies the clip.
    genre_dir, fname = os.path.split(f)
    label = os.path.basename(genre_dir)
    if genre != label:
        genre = label
        print("Procesassing " + genre + "...")
    try:
        y, sr = librosa.load(f, sr=sample_rate)
    except Exception as err:
        # Unreadable files are still skipped, but the skip is reported, and
        # KeyboardInterrupt is no longer swallowed as a bare ``except`` would.
        print("Skipping " + fname + ": " + str(err))
        continue

    # Harmonic and percussive components of the signal (stored under the
    # "harmony" and "perceptr" columns).
    harmony, perceptr = librosa.effects.hpss(y=y)

    # Signals summarised by their mean and variance, keyed by column prefix.
    summaries = {
        "chroma_stft": librosa.feature.chroma_stft(
            y=y, sr=sample_rate, hop_length=chroma_hop_length
        ),
        "rms": librosa.feature.rms(y=y),
        "spectral_centroid": librosa.feature.spectral_centroid(y=y, sr=sample_rate),
        "spectral_bandwidth": librosa.feature.spectral_bandwidth(y=y, sr=sample_rate),
        "rolloff": librosa.feature.spectral_rolloff(y=y, sr=sample_rate),
        "zero_crossing_rate": librosa.feature.zero_crossing_rate(y=y),
        "harmony": harmony,
        "perceptr": perceptr,
    }

    row = {"filename": fname, "label": label}
    for prefix, values in summaries.items():
        row[prefix + "_mean"] = values.mean()
        row[prefix + "_var"] = values.var()

    # Tempo. Depending on the librosa version this may come back as a scalar
    # or as a one-element array; store a plain float either way so the CSV
    # column stays numeric.
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    row["tempo"] = float(np.ravel(tempo)[0])

    # MFCCs: transpose so that each column holds one coefficient over time.
    mfcc = librosa.feature.mfcc(
        y=y, sr=sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length
    ).T
    for x in range(num_mfcc):
        coefficient = mfcc[:, x]
        row["mfcc" + str(x + 1) + "_mean"] = coefficient.mean()
        row["mfcc" + str(x + 1) + "_var"] = coefficient.var()

    # Append only after every feature was computed, so a failure part-way
    # through a file cannot leave the columns with different lengths.
    for column, value in row.items():
        my_csv[column].append(value)
    print(fname)

In [None]:
# Turn the collected per-clip columns into a table and write it to disk
# without the row index.
df = pd.DataFrame.from_dict(my_csv)
df.to_csv("myfeatures.csv", index=False)

## 3 second segments

In [None]:
# Settings for the 3-second pass: every clip is cut into ``num_segment``
# pieces of ``samples_per_segment`` samples each.
num_mfcc = 20
sample_rate = 22050
n_fft = 2048
hop_length = 512
num_segment = 10
# Segment length in samples, based on a nominal clip length of 30 seconds.
samples_per_segment = int(sample_rate * 30 / num_segment)
dataset_path = "genres"
audio_files = glob(dataset_path + "/*/*")
# Same genre discovery as in the 30-second section: top-level entries whose
# path contains ".mf" are not genres and are left out.
genre = [
    os.path.basename(entry)
    for entry in glob(dataset_path + "/*")
    if ".mf" not in entry
]
n_genres = len(genre)

In [None]:
# Cut every clip into ``num_segment`` consecutive pieces and build one
# feature row per piece in ``my_3_csv``.
num_segment = 10

samples_per_segment = int(sample_rate * 30 / num_segment)
chroma_hop_length = 512  # hop size used for the chromagram only
genre = ""
for f in sorted(audio_files):
    # The parent directory names the genre; the file name identifies the clip.
    genre_dir, fname = os.path.split(f)
    label = os.path.basename(genre_dir)
    if genre != label:
        genre = label
        print("Procesassing " + genre + "...")
    try:
        y, sr = librosa.load(f, sr=sample_rate)
    except Exception as err:
        # Unreadable files are still skipped, but the skip is reported, and
        # KeyboardInterrupt is no longer swallowed as a bare ``except`` would.
        print("Skipping " + fname + ": " + str(err))
        continue

    for n in range(num_segment):
        y_seg = y[samples_per_segment * n : samples_per_segment * (n + 1)]
        if y_seg.size == 0:
            # The clip is shorter than expected and this slice holds no
            # samples, so there is nothing to describe.
            print("Skipping empty segment " + str(n) + " of " + fname)
            continue

        # Harmonic and percussive components of the segment (stored under
        # the "harmony" and "perceptr" columns).
        harmony, perceptr = librosa.effects.hpss(y=y_seg)

        # Signals summarised by their mean and variance, keyed by column prefix.
        summaries = {
            "chroma_stft": librosa.feature.chroma_stft(
                y=y_seg, sr=sample_rate, hop_length=chroma_hop_length
            ),
            "rms": librosa.feature.rms(y=y_seg),
            "spectral_centroid": librosa.feature.spectral_centroid(
                y=y_seg, sr=sample_rate
            ),
            "spectral_bandwidth": librosa.feature.spectral_bandwidth(
                y=y_seg, sr=sample_rate
            ),
            "rolloff": librosa.feature.spectral_rolloff(y=y_seg, sr=sample_rate),
            "zero_crossing_rate": librosa.feature.zero_crossing_rate(y=y_seg),
            "harmony": harmony,
            "perceptr": perceptr,
        }

        # Segment file name: first two dot-separated parts of the clip name,
        # then the segment index.
        fseg_name = ".".join(fname.split(".")[:2]) + f".{n}.wav"
        row = {"filename": fseg_name, "label": label}
        for prefix, values in summaries.items():
            row[prefix + "_mean"] = values.mean()
            row[prefix + "_var"] = values.var()

        # Tempo. Depending on the librosa version this may come back as a
        # scalar or as a one-element array; store a plain float either way.
        tempo, _ = librosa.beat.beat_track(y=y_seg, sr=sample_rate)
        row["tempo"] = float(np.ravel(tempo)[0])

        # MFCCs: transpose so that each column holds one coefficient over time.
        mfcc = librosa.feature.mfcc(
            y=y_seg, sr=sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length
        ).T
        for x in range(num_mfcc):
            coefficient = mfcc[:, x]
            row["mfcc" + str(x + 1) + "_mean"] = coefficient.mean()
            row["mfcc" + str(x + 1) + "_var"] = coefficient.var()

        # Append only after every feature was computed, so a failure part-way
        # through a segment cannot leave the columns with different lengths.
        for column, value in row.items():
            my_3_csv[column].append(value)
    print(fname)

In [None]:
# Turn the collected per-segment columns into a table and write it to disk
# without the row index.
df = pd.DataFrame.from_dict(my_3_csv)
df.to_csv("myfeatures_3_sec.csv", index=False)

In [None]:
# Show the (rows, columns) dimensions of the current feature table.
df.shape

In [None]:
# Preview the first rows of the current feature table.
df.head()

# Modeling the data (Run from here if you have the CSV file(s))

## 30 second segments

### Read and process the data

In [None]:
# Load the per-clip features. ``df_orig`` keeps the table exactly as read
# (including the "filename" and "label" columns), while ``df`` is a separate
# copy that the following cells transform.
df_orig = pd.read_csv("myfeatures.csv", index_col=None)
df = df_orig.copy()
df

In [None]:
# Count missing values over the whole table and report the outcome either
# way. Staying silent when NaNs exist would look the same as not having run
# the check at all.
nan_count = int(df.isna().sum().sum())
if nan_count == 0:
    print("No NaNs to clean.")
else:
    print("Found", nan_count, "NaN value(s); clean them before modeling.")

In [None]:
# Keep only the numeric feature columns and standardise each one to zero mean
# and unit standard deviation.
# NOTE(review): the statistics are computed over every row, i.e. before the
# train/test split made later in the notebook.
df = df.drop(["filename", "label"], axis=1)
column_std = df.std()
# A constant column has a standard deviation of 0, and 0/0 would fill it with
# NaN. Dividing by 1 instead leaves such a column at 0 after centering.
df = (df - df.mean()) / column_std.replace(0, 1)
df

In [None]:
# Append the genre as an integer-coded "label" column, taken from the
# untouched copy of the table.
df = df.assign(label=LabelEncoder().fit_transform(df_orig["label"]))
df

In [None]:
# Average every (normalised) feature within each encoded label.
means = df.groupby(by="label").mean()
means

In [None]:
# Bar chart of the first feature's mean for each genre.
# ``means`` is ordered by encoded label, and LabelEncoder numbers the classes
# in sorted order. The tick names are therefore taken in sorted order too,
# rather than in order of first appearance in the CSV, so each bar carries the
# right genre name even if the rows are not sorted by genre.
genre_names = np.unique(df_orig["label"])
plt.bar(x=genre_names, height=means.iloc[:, 0])
plt.xticks(rotation="vertical")
plt.ylabel(means.columns[0] + " (normalized)")
plt.show()

In [None]:
# Split the table into a feature matrix (every column but the last) and a
# target vector (the last column, i.e. the encoded label).
feature_columns = df.columns[:-1]
target_column = df.columns[-1]
X = np.array(df[feature_columns])
y = np.array(df[target_column])

In [None]:
# Report the dimensions of the feature matrix and the target vector.
for caption, array in (("X shape:", X), ("y shape:", y)):
    print(caption, array.shape)

### Split into training and test data

In [None]:
# Hold out 30% of the rows for testing. Stratifying on ``y`` keeps the class
# proportions equal in both parts, and the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
for caption, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)

### Set up model and metrics analysis

In [None]:
def generate_model(input_shape, num_classes=10):
    """Build the fully connected classifier and print its summary.

    The network stacks four ReLU layers (512, 256, 128 and 64 units), each
    followed by 20% dropout, and ends in a softmax layer with one unit per
    class.

    Args:
        input_shape: Shape of a single input sample, e.g. ``(n_features,)``.
        num_classes: Number of output classes. Defaults to 10, the value that
            was previously hard-coded.

    Returns:
        The uncompiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential(
        [
            tf.keras.layers.Dense(512, activation="relu", input_shape=input_shape),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(num_classes, activation="softmax"),
        ]
    )
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model

In [None]:
def train_model(model, epochs, optimizer, batch_size=128):
    """Compile ``model`` and fit it on the current train/test split.

    The data is not passed in: the function reads the module-level
    ``X_train``, ``y_train``, ``X_test`` and ``y_test`` at call time, and the
    test part doubles as the validation data.

    Args:
        model: Keras model to compile and train (modified in place).
        epochs: Number of training epochs.
        optimizer: Optimizer name or instance handed to ``model.compile``.
        batch_size: Mini-batch size. Defaults to 128, the value that was
            previously hard-coded.

    Returns:
        Whatever ``model.fit`` returns (the training history).
    """
    model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        # Passed as a list: newer Keras versions reject a bare string here,
        # while older ones accept both forms.
        metrics=["accuracy"],
    )
    return model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
    )

In [None]:
def plot_validate(history):
    """Print the highest validation accuracy and plot all recorded metrics.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch values and includes a ``"val_accuracy"`` entry.
    """
    recorded = history.history
    best_val_accuracy = max(recorded["val_accuracy"])
    print("Max validation accuracy:", round(best_val_accuracy, 3))

    # One line per recorded metric, with the epoch index on the x axis.
    curves = pd.DataFrame(recorded)
    curves.plot()
    plt.xlabel("Epoch")
    plt.ylabel("Model Performance")
    plt.grid()
    plt.show()

In [None]:
# Each sample is one flat feature vector, so the input shape is the training
# matrix's shape without its leading (row) dimension.
input_shape = X_train.shape[1:]
model = generate_model(input_shape=input_shape)

### Train the model

In [None]:
# Train for 100 epochs with the Adam optimizer and keep the returned history
# for plotting.
model_history = train_model(model, 100, "adam")

### Results

In [None]:
# Score the trained model on the held-out rows.
test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=128)
print("Test loss:", round(test_loss, 3))
# This is the accuracy of the model as it stands after the last epoch. No
# best-epoch weights are restored anywhere, so it is not labelled "best".
print("Test accuracy:", round(test_acc, 3))

In [None]:
# Print the best validation accuracy and plot the training curves.
plot_validate(model_history)

## 3 second segments

### Read and process the data

In [None]:
# Load the per-segment features. ``df_orig`` keeps the table exactly as read
# (including the "filename" and "label" columns), while ``df`` is a separate
# copy that the following cells transform.
df_orig = pd.read_csv("myfeatures_3_sec.csv", index_col=None)
df = df_orig.copy()
df

In [None]:
# Count missing values over the whole table and report the outcome either
# way. Staying silent when NaNs exist would look the same as not having run
# the check at all.
nan_count = int(df.isna().sum().sum())
if nan_count == 0:
    print("No NaNs to clean.")
else:
    print("Found", nan_count, "NaN value(s); clean them before modeling.")

In [None]:
# Keep only the numeric feature columns and standardise each one to zero mean
# and unit standard deviation.
# NOTE(review): the statistics are computed over every row, i.e. before the
# train/test split made later in the notebook.
df = df.drop(["filename", "label"], axis=1)
column_std = df.std()
# A constant column has a standard deviation of 0, and 0/0 would fill it with
# NaN. Dividing by 1 instead leaves such a column at 0 after centering.
df = (df - df.mean()) / column_std.replace(0, 1)
df

In [None]:
# Append the genre as an integer-coded "label" column, taken from the
# untouched copy of the table.
df = df.assign(label=LabelEncoder().fit_transform(df_orig["label"]))
df

In [None]:
# Average every (normalised) feature within each encoded label.
means = df.groupby(by="label").mean()
means

In [None]:
# Bar chart of the first feature's mean for each genre.
# ``means`` is ordered by encoded label, and LabelEncoder numbers the classes
# in sorted order. The tick names are therefore taken in sorted order too,
# rather than in order of first appearance in the CSV, so each bar carries the
# right genre name even if the rows are not sorted by genre.
genre_names = np.unique(df_orig["label"])
plt.bar(x=genre_names, height=means.iloc[:, 0])
plt.xticks(rotation="vertical")
plt.ylabel(means.columns[0] + " (normalized)")
plt.show()

In [None]:
# Split the table into a feature matrix (every column but the last) and a
# target vector (the last column, i.e. the encoded label).
feature_columns = df.columns[:-1]
target_column = df.columns[-1]
X = np.array(df[feature_columns])
y = np.array(df[target_column])

In [None]:
# Report the dimensions of the feature matrix and the target vector.
for caption, array in (("X shape:", X), ("y shape:", y)):
    print(caption, array.shape)

### Split into training and test data

In [None]:
# Hold out 30% of the rows for testing. Stratifying on ``y`` keeps the class
# proportions equal in both parts, and the fixed seed makes the split
# repeatable.
# NOTE(review): rows here are individual segments and the split is made per
# row, so segments cut from the same recording can presumably end up on both
# sides. Confirm whether a per-recording split is wanted.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
for caption, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)

### Set up model

In [None]:
# Each sample is one flat feature vector, so the input shape is the training
# matrix's shape without its leading (row) dimension.
input_shape = X_train.shape[1:]
model = generate_model(input_shape=input_shape)

### Train the model

In [None]:
# Train for 300 epochs with the Adam optimizer and keep the returned history
# for plotting.
model_history = train_model(model, 300, "adam")

### Results

In [None]:
# Score the trained model on the held-out rows.
test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=128)
print("Test loss:", round(test_loss, 3))
# This is the accuracy of the model as it stands after the last epoch. No
# best-epoch weights are restored anywhere, so it is not labelled "best".
print("Test accuracy:", round(test_acc, 3))

In [None]:
# Print the best validation accuracy and plot the training curves.
plot_validate(model_history)