In [14]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pydotplus
import seaborn as sns
import sklearn
import sys

from graphviz import Source
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.metrics import classification_report, plot_confusion_matrix, mean_squared_error
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Python ≥3.5 is required
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
assert sklearn.__version__ >= "0.20"

# To plot pretty figures
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
plt.style.use('fivethirtyeight')

# to make this notebook's output stable across runs
np.random.seed(42)

RESULTS_PATH = os.path.join('./results')


# Getting the data & setup


In [15]:
DATA_PATH = '../MLP2/input/'

# loading the audio file data
# we probably wont use this until we know how to neural network
audio_df = pd.read_csv(DATA_PATH + 'audio_data.csv')
audio_means_df = pd.read_csv(
    DATA_PATH + 'audio_means.csv')

# loading the .csv file data
song_dropped_df = pd.read_csv(
    DATA_PATH + 'dropped.csv')
song_filled_df = pd.read_csv(
    DATA_PATH + 'mean_filled.csv')


In [16]:
song_dropped_df = song_dropped_df.drop(
    columns=['artist_name', 'track_name', 'instance_id'])
song_filled_df = song_filled_df.drop(columns=['artist_name', 'track_name'])


In [17]:
keys = dict()
i = 0
for key in sorted(song_filled_df['key'].unique()):
    keys[key] = i
    i += 1

modes = dict({song_filled_df['mode'].unique()[0]: 0,
             song_filled_df['mode'].unique()[1]: 1})

song_dropped_df['key'] = song_dropped_df['key'].apply(lambda x: keys[x])
song_dropped_df['mode'] = song_dropped_df['mode'].apply(lambda x: modes[x])

song_filled_df['key'] = song_filled_df['key'].apply(lambda x: keys[x])
song_filled_df['mode'] = song_filled_df['mode'].apply(lambda x: modes[x])


In [18]:
# getting a list of all genre names
audio_genres = audio_df['genre'].unique()
dropped_genres = song_dropped_df['music_genre'].unique()
filled_genres = song_filled_df['music_genre'].unique()

# getting a list of all features in each data frame
audio_features = audio_means_df.columns.drop('genre')
dropped_features = song_dropped_df.columns.drop('music_genre')
filled_features = song_filled_df.columns.drop('music_genre')


In [19]:
# audio data with means
X_audio = audio_means_df.loc[:, audio_means_df.columns != 'genre']
y_audio = audio_means_df['genre']

# song data with dropped null values
X_dropped = song_dropped_df.loc[:, song_dropped_df.columns != 'music_genre']
y_dropped = song_dropped_df['music_genre']

# song data with mean filled null values
X_filled = song_filled_df.loc[:, song_filled_df.columns != 'music_genre']
y_filled = song_filled_df['music_genre']


creating the initial splits


In [20]:
X_audio_train, X_audio_test, y_audio_train, y_audio_test = train_test_split(
    X_audio, y_audio, test_size=0.20)

X_dropped_train, X_dropped_test, y_dropped_train, y_dropped_test = train_test_split(
    X_dropped, y_dropped, test_size=0.30)

X_filled_train, X_filled_test, y_filled_train, y_filled_test = train_test_split(
    X_filled, y_filled, test_size=0.30)


Separating data from labels


In [21]:
print(len(X_audio))
print(len(X_dropped))
print(len(X_filled))


1000
40560
50000


Some methods do not work with the string labels, so we mapped the genre titles for each data set to unique integers


In [22]:
y_audio_nums = y_audio.copy()
y_audio_nums.replace(audio_genres, [x for x in range(10)], inplace=True)

y_dropped_nums = y_dropped.copy()
y_dropped_nums.replace(dropped_genres, [x for x in range(10)], inplace=True)

y_filled_nums = y_filled.copy()
y_filled_nums.replace(filled_genres, [x for x in range(10)], inplace=True)


# Neural Network Modeling


## Prepararation


In [23]:
audio_results_metrics = pd.DataFrame()
dropped_results_metrics = pd.DataFrame()
filled_results_metrics = pd.DataFrame()


In [24]:
# parameters:   pipe - model pipeline
#               params - dictionary of parameters specific to the model
#               X,y - X and y for the data set that should be used for cross validating
#               label - label of the data set
def run_grid_search(pipe, params, X, y, label, cv=None, n_jobs=-1):
    clf_GSCV = GridSearchCV(pipe, params, cv=cv)
    clf_GSCV.fit(X, y)
    df = pd.DataFrame(clf_GSCV.cv_results_)
    df['label'] = label
    return df, clf_GSCV

In [25]:
# defining the scaler we will be using
sc = StandardScaler()


## Training


In [41]:
nnclf = MLPClassifier()

nnclf_pipe = Pipeline(steps=[('sc', sc),
                             ('nnclf', nnclf)])

n_components = list(range(1, X_audio.shape[1] + 1, 1))

solvers = ['lbfgs']
activation_functions = ['logistic', 'tanh', 'relu']
hidden_layer_units = [[10, 10], [20, 20], [50, 50], [100, 100]]
alpha_units = [0.01, 0.1, 1.0, 5.0]


parameters = dict(nnclf__solver=solvers,
                  nnclf__activation=activation_functions,
                  nnclf__hidden_layer_sizes=hidden_layer_units,
                  nnclf__alpha=alpha_units)

nnclf_results_df = pd.DataFrame()
nnclf_results_df = nnclf_results_df.append(
    run_grid_search(nnclf_pipe, parameters, X_audio, y_audio, 'Audio', cv=3)[0]).append(
    run_grid_search(nnclf_pipe, parameters, X_dropped, y_dropped, 'Dropped', cv=10)[0]).append(
    run_grid_search(nnclf_pipe, parameters, X_filled, y_filled, 'Filled', cv=10)[0])

nnclf_results_df.reset_index(inplace=True, drop=True)

nnclf_results_df


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_nnclf__activation,param_nnclf__alpha,param_nnclf__hidden_layer_sizes,param_nnclf__solver,params,split0_test_score,...,std_test_score,rank_test_score,label,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score
0,0.154680,0.014848,0.004821,0.002704,logistic,0.01,"[10, 10]",lbfgs,"{'nnclf__activation': 'logistic', 'nnclf__alph...",0.338323,...,0.014833,33,Audio,,,,,,,
1,0.346776,0.030782,0.002999,0.000820,logistic,0.01,"[20, 20]",lbfgs,"{'nnclf__activation': 'logistic', 'nnclf__alph...",0.326347,...,0.006547,37,Audio,,,,,,,
2,0.553742,0.018766,0.003331,0.000945,logistic,0.01,"[50, 50]",lbfgs,"{'nnclf__activation': 'logistic', 'nnclf__alph...",0.350299,...,0.006666,25,Audio,,,,,,,
3,1.363669,0.010338,0.002999,0.000004,logistic,0.01,"[100, 100]",lbfgs,"{'nnclf__activation': 'logistic', 'nnclf__alph...",0.335329,...,0.000475,36,Audio,,,,,,,
4,0.137335,0.004021,0.002362,0.000514,logistic,0.1,"[10, 10]",lbfgs,"{'nnclf__activation': 'logistic', 'nnclf__alph...",0.368263,...,0.009427,23,Audio,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,59.459744,0.714444,0.017208,0.000600,relu,1.0,"[100, 100]",lbfgs,"{'nnclf__activation': 'relu', 'nnclf__alpha': ...",0.569000,...,0.005926,12,Filled,0.5690,0.5664,0.5834,0.5644,0.5736,0.5606,0.5666
140,10.671598,0.170543,0.006803,0.000404,relu,5.0,"[10, 10]",lbfgs,"{'nnclf__activation': 'relu', 'nnclf__alpha': ...",0.551600,...,0.006934,48,Filled,0.5520,0.5334,0.5578,0.5480,0.5478,0.5400,0.5486
141,17.462101,0.228473,0.007695,0.000641,relu,5.0,"[20, 20]",lbfgs,"{'nnclf__activation': 'relu', 'nnclf__alpha': ...",0.563200,...,0.006380,31,Filled,0.5616,0.5642,0.5792,0.5624,0.5654,0.5588,0.5576
142,33.140697,0.428181,0.011103,0.000539,relu,5.0,"[50, 50]",lbfgs,"{'nnclf__activation': 'relu', 'nnclf__alpha': ...",0.575400,...,0.008282,14,Filled,0.5716,0.5674,0.5886,0.5634,0.5646,0.5588,0.5686


In [47]:
best_nnclf_audio = nnclf_results_df[(nnclf_results_df.rank_test_score == 1) &
                                    (nnclf_results_df.label == 'Audio')]

best_nnclf_dropped = nnclf_results_df[(nnclf_results_df.rank_test_score == 1) &
                                      (nnclf_results_df.label == 'Dropped')]

best_nnclf_filled = nnclf_results_df[(nnclf_results_df.rank_test_score == 1) &
                                     (nnclf_results_df.label == 'Filled')]


In [48]:
print('best audio parameters:\n',
      '\n--solver--\n', best_nnclf_audio.param_nnclf__solver.values,
      '\n--activation--\n', best_nnclf_audio.param_nnclf__activation.values,
      '\n--hidden_layer_sizes--\n', best_nnclf_audio.param_nnclf__hidden_layer_sizes.values,
      '\n--alpha--\n', best_nnclf_audio.param_nnclf__alpha.values)

print('\n\nbest dropped parameters:\n',
      '\n--solver--\n', best_nnclf_dropped.param_nnclf__solver.values,
      '\n--activation--\n', best_nnclf_dropped.param_nnclf__activation.values,
      '\n--hidden_layer_sizes--\n', best_nnclf_dropped.param_nnclf__hidden_layer_sizes.values,
      '\n--solver--\n', best_nnclf_dropped.param_nnclf__alpha.values)

print('\n\nbest filled parameters:\n',
      '\n--solver--\n', best_nnclf_filled.param_nnclf__solver.values,
      '\n--activation--\n', best_nnclf_filled.param_nnclf__activation.values,
      '\n--hidden_layer_sizes--\n', best_nnclf_filled.param_nnclf__hidden_layer_sizes.values,
      '\n--alpha--\n', best_nnclf_filled.param_nnclf__alpha.values)


best audio parameters:
 
--solver--
 ['lbfgs' 'lbfgs'] 
--activation--
 ['tanh' 'relu'] 
--hidden_layer_sizes--
 [list([20, 20]) list([100, 100])] 
--alpha--
 [5.0 5.0]


best dropped parameters:
 
--solver--
 ['lbfgs'] 
--activation--
 ['tanh'] 
--hidden_layer_sizes--
 [list([50, 50])] 
--solver--
 [5.0]


best filled parameters:
 
--solver--
 ['lbfgs'] 
--activation--
 ['tanh'] 
--hidden_layer_sizes--
 [list([50, 50])] 
--alpha--
 [5.0]


In [49]:
nnclf_pipe = Pipeline(steps=[('nnclf', nnclf)])

nnclf_audio = nnclf_pipe
nnclf_audio.set_params(**best_nnclf_audio.params.values[0])
nnclf_audio.fit(X_audio_train, y_audio_train)
y_pred_nnclf_audio = nnclf_audio.predict(X_audio_test)

nnclf_dropped = nnclf_pipe
nnclf_dropped.set_params(**best_nnclf_dropped.params.values[0])
nnclf_dropped.fit(X_dropped_train, y_dropped_train)
y_pred_nnclf_dropped = nnclf_dropped.predict(X_dropped_test)

nnclf_filled = nnclf_pipe
nnclf_filled.set_params(**best_nnclf_filled.params.values[0])
nnclf_filled.fit(X_filled_train, y_filled_train)
y_pred_nnclf_filled = nnclf_filled.predict(X_filled_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [50]:
nnclf_result_metrics_audio = pd.DataFrame(classification_report(
    y_audio_test, y_pred_nnclf_audio, output_dict=True))
nnclf_result_metrics_dropped = pd.DataFrame(classification_report(
    y_dropped_test, y_pred_nnclf_dropped, output_dict=True))
nnclf_result_metrics_filled = pd.DataFrame(classification_report(
    y_filled_test, y_pred_nnclf_filled, output_dict=True))

nnclf_result_metrics_audio['model_type'] = 'nnclf'
nnclf_result_metrics_dropped['model_type'] = 'nnclf'
nnclf_result_metrics_filled['model_type'] = 'nnclf'

audio_results_metrics = nnclf_result_metrics_audio
dropped_results_metrics = nnclf_result_metrics_dropped
filled_results_metrics = nnclf_result_metrics_filled

print('Accuracy for audio: {:.3f}'.format(
    nnclf_result_metrics_audio['accuracy'].iloc[0]))
print('Accuracy for dropped: {:.3f}'.format(
    nnclf_result_metrics_dropped['accuracy'].iloc[0]))
print('Accuracy for filled: {:.3f}'.format(
    nnclf_result_metrics_filled['accuracy'].iloc[0]))


Accuracy for audio: 0.315
Accuracy for dropped: 0.539
Accuracy for filled: 0.341


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
