In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Load the feature table from the read-only Kaggle input directory into `df`,
# then show its first rows as the cell output.
df = pd.read_csv('../input/bangla-music-dataset/dataset.csv')
df.head()

In [None]:
# Display the column labels of the loaded frame.
df.columns

## Definitions of features:

**1. Zero crossing:** A zero-crossing is a point where the sign of a mathematical function changes (e.g. from positive to negative), represented by an intercept of the axis (zero value) in the graph of the function.

**2. Spectral Centroid:** The spectral centroid indicates where the center of mass of the spectrum is located.

**3. Spectral Rolloff:** Spectral rolloff is the frequency below which a specified percentage of the total spectral energy, e.g. 85%, lies.

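**4. Spectral Bandwidth:** For an audio signal, the spectral bandwidth measures how widely the spectrum is spread around its spectral centroid (for example, as the magnitude-weighted standard deviation of the frequencies about the centroid). The commonly quoted optics definition, the width of a band of light at one-half the peak maximum (full width at half maximum), describes a related idea for light rather than this audio feature.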

**5. Chroma Frequency:** The chroma feature is a descriptor, which represents the tonal content of a musical audio signal in a condensed form. Therefore chroma features can be considered as important prerequisite for high-level semantic analysis, like chord recognition or harmonic similarity estimation.

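**6. RMSE:** Root Mean Square Energy, i.e. the root-mean-square value of the signal amplitude within a frame, which is a measure of loudness. (The abbreviation is often expanded as Root Mean Square Error, but for an audio feature it most likely denotes RMS energy.)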

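**7. Delta:** In audio feature extraction, a delta feature is a local estimate of the derivative (frame-to-frame change) of another feature over time; this is presumably what the `delta` column holds. The name is unrelated to the "Delta chord", an alternative name for the major seventh chord, which is a seventh chord in which the third is a major third above the root and the seventh is a major seventh above the root.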

**8. MelSpectrogram:** A mel spectrogram is an acoustic time-frequency representation of a sound: the power spectral density $P(f, t)$. It is sampled at a number of points around equally spaced times $t_i$ and frequencies $f_j$ (on a Mel frequency scale).

**9. Tempo:** The speed at which a passage of music is or should be played.

**10. MFCC:** MFCCs (Mel-frequency cepstral coefficients) are a well-known timbre-texture (spectral) feature. They are the highest-performing individual feature used in speech recognition and can also be examined for modeling music.

The definitions have been collected from [wikipedia](http://www.wikipedia.org/), [researchgate](http://researchgate.net) and *google* search results.

## Feature Preprocessing:

In [None]:
# Print the frame summary (column dtypes and non-null counts) to check for missing values.
df.info()

There are no missing values in the dataframe. Now let us find the labels.

In [None]:
# Distinct values currently stored in the 'label' column.
df['label'].unique()

There are 6 labels here. Now we shall convert these labels into the integer codes 1 to 6.

In [None]:
# Encode the genre names in 'label' as the integer codes 1-6.
#
# The cell is safe to re-run: values that are already codes are left as they
# are. A plain `.map(dict)` would turn every already-encoded value into NaN on
# a second execution, because the integer codes are not keys of the dict.
label_codes = {'adhunik':1, 'band':2, 'hiphop':3, 'nazrul':4, 'palligeeti':5, 'rabindra':6}

# Fail loudly on anything that is neither a known genre name nor a known code,
# instead of silently producing NaN / mixed-type labels for later cells.
unexpected_labels = set(df['label'].unique()) - set(label_codes) - set(label_codes.values())
if unexpected_labels:
    raise ValueError(f"Unexpected values in 'label': {sorted(unexpected_labels, key=str)}")

df['label'] = df['label'].map(lambda value: label_codes.get(value, value)).astype(int)

In [None]:
# Distinct values of 'label' again, to inspect the result of the mapping cell above.
df['label'].unique()

Except the track names, all the features are numerical. Let us have a look at the summary of all the features.

In [None]:
# Summary statistics of the frame's columns, shown as the cell output.
df.describe()

## Visualization:

For the sake of visualization and easy understanding, we will now make 6 separate dataframes according to the labels.

In [None]:
# One sub-frame per genre, selected by label code 1..6 and unpacked in that order.
(df_adhunik, df_band, df_hiphop,
 df_nazrul, df_palli, df_rabindra) = (df[df['label'] == code] for code in range(1, 7))

### Zero Crossing:

In [None]:
# 3x2 grid of per-genre histograms of 'zero_crossing'; all panels share the y-axis.
fig, axs = plt.subplots(nrows=3, ncols=2, sharey=True, tight_layout=True, figsize=(15, 15))
bins, alpha = 100, 0.8

# (genre frame, bar colour, panel title), listed in row-major panel order.
panels = (
    (df_adhunik, 'grey', 'Adhunik'),
    (df_band, 'red', 'Band'),
    (df_hiphop, 'teal', 'Hiphop'),
    (df_nazrul, 'blue', 'Nazrul'),
    (df_palli, 'purple', 'Palli'),
    (df_rabindra, 'orange', 'Rabindra'),
)
for ax, (genre_df, colour, title) in zip(axs.flat, panels):
    ax.hist(genre_df['zero_crossing'], bins=bins, alpha=alpha, color=colour)
    ax.title.set_text(title)

In [None]:
from matplotlib import cm

# Per-genre frames and their display names, listed in the same order so that
# position i of `data_list` corresponds to position i of `classes`.
data_list = [df_adhunik, df_band, df_hiphop, df_nazrul, df_palli, df_rabindra]
classes = np.array(['Adhunik', 'Band', 'Hiphop', 'Nazrul', 'Palligeeti', 'Rabindra'])

In [None]:
# Mean 'zero_crossing' per genre as a bar chart. Bar colours and the colorbar
# are driven by one shared normalisation, so the colorbar describes the bars.
col_name = 'zero_crossing'
print(col_name)
y = np.array([data[col_name].mean() for data in data_list])
for genre_mean in y:
    print(genre_mean)

# For non-negative means this is the value/max scaling; the lower bound only
# drops below 0 when some mean is negative.
norm = plt.Normalize(vmin=min(0.0, float(y.min())), vmax=float(y.max()))
colors = cm.hsv(norm(y))

# Draw on an explicit Axes and give the colorbar its own ScalarMappable.
# Borrowing the mappable from a scatter that is then wiped with plt.clf()
# leaves the colorbar without a live Axes (newer matplotlib can reject that)
# and scales it min..max instead of like the bars.
fig, ax = plt.subplots()
ax.bar(classes, y, color=colors, width=0.6)
ax.set_ylabel(f'mean {col_name}')
fig.colorbar(cm.ScalarMappable(norm=norm, cmap='hsv'), ax=ax)
plt.show()

### Spectral Centroid:

In [None]:
# 3x2 grid of per-genre histograms of 'spectral_centroid'; all panels share the y-axis.
fig, axs = plt.subplots(nrows=3, ncols=2, sharey=True, tight_layout=True, figsize=(15, 15))
bins, alpha = 80, 0.8

# (genre frame, bar colour, panel title), listed in row-major panel order.
panels = (
    (df_adhunik, 'grey', 'Adhunik'),
    (df_band, 'red', 'Band'),
    (df_hiphop, 'teal', 'Hiphop'),
    (df_nazrul, 'blue', 'Nazrul'),
    (df_palli, 'purple', 'Palli'),
    (df_rabindra, 'orange', 'Rabindra'),
)
for ax, (genre_df, colour, title) in zip(axs.flat, panels):
    ax.hist(genre_df['spectral_centroid'], bins=bins, alpha=alpha, color=colour)
    ax.title.set_text(title)

In [None]:
# Mean 'spectral_centroid' per genre as a bar chart. Bar colours and the
# colorbar are driven by one shared normalisation, so the colorbar describes the bars.
col_name = 'spectral_centroid'
print(col_name)
y = np.array([data[col_name].mean() for data in data_list])
for genre_mean in y:
    print(genre_mean)

# For non-negative means this is the value/max scaling; the lower bound only
# drops below 0 when some mean is negative.
norm = plt.Normalize(vmin=min(0.0, float(y.min())), vmax=float(y.max()))
colors = cm.hsv(norm(y))

# Draw on an explicit Axes and give the colorbar its own ScalarMappable.
# Borrowing the mappable from a scatter that is then wiped with plt.clf()
# leaves the colorbar without a live Axes (newer matplotlib can reject that)
# and scales it min..max instead of like the bars.
fig, ax = plt.subplots()
ax.bar(classes, y, color=colors, width=0.6)
ax.set_ylabel(f'mean {col_name}')
fig.colorbar(cm.ScalarMappable(norm=norm, cmap='hsv'), ax=ax)
plt.show()

### Spectral Rolloff:

In [None]:
# 3x2 grid of per-genre histograms of 'spectral_rolloff'; all panels share the y-axis.
fig, axs = plt.subplots(nrows=3, ncols=2, sharey=True, tight_layout=True, figsize=(15, 15))
bins, alpha = 80, 0.8

# (genre frame, bar colour, panel title), listed in row-major panel order.
panels = (
    (df_adhunik, 'grey', 'Adhunik'),
    (df_band, 'red', 'Band'),
    (df_hiphop, 'teal', 'Hiphop'),
    (df_nazrul, 'blue', 'Nazrul'),
    (df_palli, 'purple', 'Palli'),
    (df_rabindra, 'orange', 'Rabindra'),
)
for ax, (genre_df, colour, title) in zip(axs.flat, panels):
    ax.hist(genre_df['spectral_rolloff'], bins=bins, alpha=alpha, color=colour)
    ax.title.set_text(title)

In [None]:
# Mean 'spectral_rolloff' per genre as a bar chart. Bar colours and the
# colorbar are driven by one shared normalisation, so the colorbar describes the bars.
col_name = 'spectral_rolloff'
print(col_name)
y = np.array([data[col_name].mean() for data in data_list])
for genre_mean in y:
    print(genre_mean)

# For non-negative means this is the value/max scaling; the lower bound only
# drops below 0 when some mean is negative.
norm = plt.Normalize(vmin=min(0.0, float(y.min())), vmax=float(y.max()))
colors = cm.hsv(norm(y))

# Draw on an explicit Axes and give the colorbar its own ScalarMappable.
# Borrowing the mappable from a scatter that is then wiped with plt.clf()
# leaves the colorbar without a live Axes (newer matplotlib can reject that)
# and scales it min..max instead of like the bars.
fig, ax = plt.subplots()
ax.bar(classes, y, color=colors, width=0.6)
ax.set_ylabel(f'mean {col_name}')
fig.colorbar(cm.ScalarMappable(norm=norm, cmap='hsv'), ax=ax)
plt.show()

### Spectral Bandwidth:

In [None]:
# 3x2 grid of per-genre histograms of 'spectral_bandwidth'; all panels share the y-axis.
fig, axs = plt.subplots(nrows=3, ncols=2, sharey=True, tight_layout=True, figsize=(15, 15))
bins, alpha = 80, 0.8

# (genre frame, bar colour, panel title), listed in row-major panel order.
panels = (
    (df_adhunik, 'grey', 'Adhunik'),
    (df_band, 'red', 'Band'),
    (df_hiphop, 'teal', 'Hiphop'),
    (df_nazrul, 'blue', 'Nazrul'),
    (df_palli, 'purple', 'Palli'),
    (df_rabindra, 'orange', 'Rabindra'),
)
for ax, (genre_df, colour, title) in zip(axs.flat, panels):
    ax.hist(genre_df['spectral_bandwidth'], bins=bins, alpha=alpha, color=colour)
    ax.title.set_text(title)

In [None]:
# Mean 'spectral_bandwidth' per genre as a bar chart. Bar colours and the
# colorbar are driven by one shared normalisation, so the colorbar describes the bars.
col_name = 'spectral_bandwidth'
print(col_name)
y = np.array([data[col_name].mean() for data in data_list])
for genre_mean in y:
    print(genre_mean)

# For non-negative means this is the value/max scaling; the lower bound only
# drops below 0 when some mean is negative.
norm = plt.Normalize(vmin=min(0.0, float(y.min())), vmax=float(y.max()))
colors = cm.hsv(norm(y))

# Draw on an explicit Axes and give the colorbar its own ScalarMappable.
# Borrowing the mappable from a scatter that is then wiped with plt.clf()
# leaves the colorbar without a live Axes (newer matplotlib can reject that)
# and scales it min..max instead of like the bars.
fig, ax = plt.subplots()
ax.bar(classes, y, color=colors, width=0.6)
ax.set_ylabel(f'mean {col_name}')
fig.colorbar(cm.ScalarMappable(norm=norm, cmap='hsv'), ax=ax)
plt.show()

### Chroma Frequency:

In [None]:
# 3x2 grid of per-genre histograms of 'chroma_frequency'; all panels share the y-axis.
fig, axs = plt.subplots(nrows=3, ncols=2, sharey=True, tight_layout=True, figsize=(15, 15))
bins, alpha = 80, 0.8

# (genre frame, bar colour, panel title), listed in row-major panel order.
panels = (
    (df_adhunik, 'grey', 'Adhunik'),
    (df_band, 'red', 'Band'),
    (df_hiphop, 'teal', 'Hiphop'),
    (df_nazrul, 'blue', 'Nazrul'),
    (df_palli, 'purple', 'Palli'),
    (df_rabindra, 'orange', 'Rabindra'),
)
for ax, (genre_df, colour, title) in zip(axs.flat, panels):
    ax.hist(genre_df['chroma_frequency'], bins=bins, alpha=alpha, color=colour)
    ax.title.set_text(title)

In [None]:
# Mean 'chroma_frequency' per genre as a bar chart. Bar colours and the
# colorbar are driven by one shared normalisation, so the colorbar describes the bars.
col_name = 'chroma_frequency'
print(col_name)
y = np.array([data[col_name].mean() for data in data_list])
for genre_mean in y:
    print(genre_mean)

# For non-negative means this is the value/max scaling; the lower bound only
# drops below 0 when some mean is negative.
norm = plt.Normalize(vmin=min(0.0, float(y.min())), vmax=float(y.max()))
colors = cm.hsv(norm(y))

# Draw on an explicit Axes and give the colorbar its own ScalarMappable.
# Borrowing the mappable from a scatter that is then wiped with plt.clf()
# leaves the colorbar without a live Axes (newer matplotlib can reject that)
# and scales it min..max instead of like the bars.
fig, ax = plt.subplots()
ax.bar(classes, y, color=colors, width=0.6)
ax.set_ylabel(f'mean {col_name}')
fig.colorbar(cm.ScalarMappable(norm=norm, cmap='hsv'), ax=ax)
plt.show()

### RMSE:

In [None]:
# 3x2 grid of per-genre histograms of 'rmse'; all panels share the y-axis.
fig, axs = plt.subplots(nrows=3, ncols=2, sharey=True, tight_layout=True, figsize=(15, 15))
bins, alpha = 80, 0.8

# (genre frame, bar colour, panel title), listed in row-major panel order.
panels = (
    (df_adhunik, 'grey', 'Adhunik'),
    (df_band, 'red', 'Band'),
    (df_hiphop, 'teal', 'Hiphop'),
    (df_nazrul, 'blue', 'Nazrul'),
    (df_palli, 'purple', 'Palli'),
    (df_rabindra, 'orange', 'Rabindra'),
)
for ax, (genre_df, colour, title) in zip(axs.flat, panels):
    ax.hist(genre_df['rmse'], bins=bins, alpha=alpha, color=colour)
    ax.title.set_text(title)

In [None]:
# Mean 'rmse' per genre as a bar chart. Bar colours and the colorbar are
# driven by one shared normalisation, so the colorbar describes the bars.
col_name = 'rmse'
print(col_name)
y = np.array([data[col_name].mean() for data in data_list])
for genre_mean in y:
    print(genre_mean)

# For non-negative means this is the value/max scaling; the lower bound only
# drops below 0 when some mean is negative.
norm = plt.Normalize(vmin=min(0.0, float(y.min())), vmax=float(y.max()))
colors = cm.hsv(norm(y))

# Draw on an explicit Axes and give the colorbar its own ScalarMappable.
# Borrowing the mappable from a scatter that is then wiped with plt.clf()
# leaves the colorbar without a live Axes (newer matplotlib can reject that)
# and scales it min..max instead of like the bars.
fig, ax = plt.subplots()
ax.bar(classes, y, color=colors, width=0.6)
ax.set_ylabel(f'mean {col_name}')
fig.colorbar(cm.ScalarMappable(norm=norm, cmap='hsv'), ax=ax)
plt.show()

### Delta:

In [None]:
# 3x2 grid of per-genre histograms of 'delta'; all panels share the y-axis.
fig, axs = plt.subplots(nrows=3, ncols=2, sharey=True, tight_layout=True, figsize=(15, 15))
bins, alpha = 80, 0.8

# (genre frame, bar colour, panel title), listed in row-major panel order.
panels = (
    (df_adhunik, 'grey', 'Adhunik'),
    (df_band, 'red', 'Band'),
    (df_hiphop, 'teal', 'Hiphop'),
    (df_nazrul, 'blue', 'Nazrul'),
    (df_palli, 'purple', 'Palli'),
    (df_rabindra, 'orange', 'Rabindra'),
)
for ax, (genre_df, colour, title) in zip(axs.flat, panels):
    ax.hist(genre_df['delta'], bins=bins, alpha=alpha, color=colour)
    ax.title.set_text(title)

In [None]:
# Mean 'delta' per genre as a bar chart. Bar colours and the colorbar are
# driven by one shared normalisation, so the colorbar describes the bars.
col_name = 'delta'
print(col_name)
y = np.array([data[col_name].mean() for data in data_list])
for genre_mean in y:
    print(genre_mean)

# For non-negative means this is the value/max scaling; the lower bound only
# drops below 0 when some mean is negative.
norm = plt.Normalize(vmin=min(0.0, float(y.min())), vmax=float(y.max()))
colors = cm.hsv(norm(y))

# Draw on an explicit Axes and give the colorbar its own ScalarMappable.
# Borrowing the mappable from a scatter that is then wiped with plt.clf()
# leaves the colorbar without a live Axes (newer matplotlib can reject that)
# and scales it min..max instead of like the bars.
fig, ax = plt.subplots()
ax.bar(classes, y, color=colors, width=0.6)
ax.set_ylabel(f'mean {col_name}')
fig.colorbar(cm.ScalarMappable(norm=norm, cmap='hsv'), ax=ax)
plt.show()

### Melspectrogram:

In [None]:
# 3x2 grid of per-genre histograms of 'melspectogram'; all panels share the y-axis.
fig, axs = plt.subplots(nrows=3, ncols=2, sharey=True, tight_layout=True, figsize=(15, 15))
bins, alpha = 80, 0.8

# (genre frame, bar colour, panel title), listed in row-major panel order.
panels = (
    (df_adhunik, 'grey', 'Adhunik'),
    (df_band, 'red', 'Band'),
    (df_hiphop, 'teal', 'Hiphop'),
    (df_nazrul, 'blue', 'Nazrul'),
    (df_palli, 'purple', 'Palli'),
    (df_rabindra, 'orange', 'Rabindra'),
)
for ax, (genre_df, colour, title) in zip(axs.flat, panels):
    ax.hist(genre_df['melspectogram'], bins=bins, alpha=alpha, color=colour)
    ax.title.set_text(title)

In [None]:
# Mean 'melspectogram' per genre as a bar chart. Bar colours and the colorbar
# are driven by one shared normalisation, so the colorbar describes the bars.
col_name = 'melspectogram'
print(col_name)
y = np.array([data[col_name].mean() for data in data_list])
for genre_mean in y:
    print(genre_mean)

# For non-negative means this is the value/max scaling; the lower bound only
# drops below 0 when some mean is negative.
norm = plt.Normalize(vmin=min(0.0, float(y.min())), vmax=float(y.max()))
colors = cm.hsv(norm(y))

# Draw on an explicit Axes and give the colorbar its own ScalarMappable.
# Borrowing the mappable from a scatter that is then wiped with plt.clf()
# leaves the colorbar without a live Axes (newer matplotlib can reject that)
# and scales it min..max instead of like the bars.
fig, ax = plt.subplots()
ax.bar(classes, y, color=colors, width=0.6)
ax.set_ylabel(f'mean {col_name}')
fig.colorbar(cm.ScalarMappable(norm=norm, cmap='hsv'), ax=ax)
plt.show()

### Tempo:

In [None]:
# 3x2 grid of per-genre histograms of 'tempo'; all panels share the y-axis.
fig, axs = plt.subplots(nrows=3, ncols=2, sharey=True, tight_layout=True, figsize=(15, 15))
bins, alpha = 20, 0.8

# (genre frame, bar colour, panel title), listed in row-major panel order.
panels = (
    (df_adhunik, 'grey', 'Adhunik'),
    (df_band, 'red', 'Band'),
    (df_hiphop, 'teal', 'Hiphop'),
    (df_nazrul, 'blue', 'Nazrul'),
    (df_palli, 'purple', 'Palli'),
    (df_rabindra, 'orange', 'Rabindra'),
)
for ax, (genre_df, colour, title) in zip(axs.flat, panels):
    ax.hist(genre_df['tempo'], bins=bins, alpha=alpha, color=colour)
    ax.title.set_text(title)

In [None]:
# Mean 'tempo' per genre as a bar chart. Bar colours and the colorbar are
# driven by one shared normalisation, so the colorbar describes the bars.
col_name = 'tempo'
print(col_name)
y = np.array([data[col_name].mean() for data in data_list])
for genre_mean in y:
    print(genre_mean)

# For non-negative means this is the value/max scaling; the lower bound only
# drops below 0 when some mean is negative.
norm = plt.Normalize(vmin=min(0.0, float(y.min())), vmax=float(y.max()))
colors = cm.hsv(norm(y))

# Draw on an explicit Axes and give the colorbar its own ScalarMappable.
# Borrowing the mappable from a scatter that is then wiped with plt.clf()
# leaves the colorbar without a live Axes (newer matplotlib can reject that)
# and scales it min..max instead of like the bars.
fig, ax = plt.subplots()
ax.bar(classes, y, color=colors, width=0.6)
ax.set_ylabel(f'mean {col_name}')
fig.colorbar(cm.ScalarMappable(norm=norm, cmap='hsv'), ax=ax)
plt.show()

## Classification:

In [None]:
# df = df.sample(frac=1).reset_index(drop=True)
# df.head(20)
# np.random.seed(0)
# df = df.reindex(np.random.permutation(df.index))

In [None]:
# Feature matrix: every column except the track name and the target.
# Target vector: the encoded genre label.
X = df.loc[:, ~df.columns.isin(['file_name', 'label'])]
y = df['label']

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# 80/20 hold-out split, stratified on the label, with a fixed seed.
split_options = dict(train_size=0.8, random_state=20, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# NOTE(review): a GridSearchCV sweep over learning_rate / n_estimators /
# max_depth was sketched here earlier; n_estimators and max_depth below are
# outside that grid, so they are presumably hand-tuned. Confirm.
gbc_params = {'learning_rate': .1, 'n_estimators': 350, 'max_depth': 6, 'random_state': 20}
clf = GradientBoostingClassifier(**gbc_params)

# fit() returns the fitted estimator, so the test-set prediction is chained onto it.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Display names passed to the report, listed in ascending label-code order.
target_names = ['Adhunik', 'Band', 'Hiphop', 'Nazrul', 'Palli', 'Rabindra']

report = classification_report(y_test, y_pred, target_names=target_names)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix on the held-out set: rows are true genres, columns are
# predicted genres. `labels=clf.classes_` pins the row/column order to the
# classifier's class order, so it lines up with `target_names`.
con_mat = confusion_matrix(y_test, y_pred, labels=clf.classes_)

# Label both axes with genre names instead of anonymous 0..5 positions, which
# are easy to misread against the 1..6 label codes.
df_cm = pd.DataFrame(con_mat, index=target_names, columns=target_names)

# fmt='d' prints the integer counts as-is; the default annotation format
# switches to scientific notation for counts of 100 or more.
ax = sns.heatmap(df_cm, annot=True, fmt='d')
ax.set_xlabel('Predicted genre')
ax.set_ylabel('True genre')
ax.set_title('Confusion matrix (test set)')
plt.show()