In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.decomposition import PCA


In [2]:
'''
Nom des 519 features
''' 
features = pd.read_csv(filepath_or_buffer="features_head.csv", sep=",")
#print(features.columns)


'''
Croisement features/tracks du dataset train
''' 
traingenre = pd.read_csv(filepath_or_buffer="train_clean.csv", sep=",")
iter_csv = pd.read_csv(filepath_or_buffer="features_adapte.csv", sep=",", iterator=True, chunksize=10000)
datatrain = pd.concat([chunk for chunk in iter_csv])
data = pd.merge(traingenre, datatrain, on='track_id')

print(data.shape, traingenre.shape, datatrain.shape)
#print(X_train.shape)
#print(X_train.shape[1])
#print(y[1])

(3995, 520) (3995, 2) (106574, 519)


In [34]:
'''
Preparing Data
'''

X = data.drop('genre_id',axis=1)
y = data['genre_id'].values

#for i in range(len(y)):
#    y[i] -= 1
#y=to_categorical(y)

print(X.shape)
print(y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

(3995, 519)
(3995,)


In [35]:
'''
Multi-layer Perceptron Classifier
'''

clf = MLPClassifier(solver='lbfgs', 
                    alpha=0.001, 
                    hidden_layer_sizes=(100), 
                    random_state=1)
clf.fit(X_train, y_train)


MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=100, learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [36]:
## Evaluation of the Multi-layer Perceptron Classifier

predictions = clf.predict(X_test)
#print(predictions)
#print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           1       0.40      0.41      0.41       179
           2       0.22      0.38      0.28       161
           3       0.34      0.27      0.30       170
           4       0.11      0.05      0.07       166
           5       0.14      0.15      0.15       165
           6       0.11      0.11      0.11       169
           7       0.17      0.20      0.18       157
           8       0.10      0.07      0.08       152

    accuracy                           0.21      1319
   macro avg       0.20      0.20      0.20      1319
weighted avg       0.20      0.21      0.20      1319



In [37]:
'''
Nearest Neighbors Classifier
'''
print("PCA...")
pca = PCA(n_components=40).fit(X_train)
X_reduce = pca.transform(X_train)

print("Training KNN...")
model = KNeighborsClassifier(n_jobs=-1,n_neighbors=5,weights='uniform')
model.fit(X_reduce, y_train)


PCA...
Training KNN...


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

In [38]:
## Evaluation of the Nearest Neighbors Classifier

#predictions = model.predict(X_test)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
train_acc = model.score(X_reduce, y_train)
test_acc = model.score(pca.transform(X_test), y_test)

#print(predictions)
#print(y_test)
#print(confusion_matrix(y_test,predictions))
#print(classification_report(y_test,predictions))

print(f'Training accuracy: {train_acc * 100:.2f}%')
print(f'Test accuracy: {test_acc * 100:.2f}%')

(2676, 519)
(2676,)
(1319, 519)
(1319,)
Training accuracy: 62.07%
Test accuracy: 46.10%


In [39]:
'''
Linear Regression Classifier
'''

model = SGDClassifier(loss="log")
model.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [40]:
## Evaluation of the Linear Regression Classifier

train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)

print(f'Training accuracy: {train_acc * 100:.2f}%')
print(f'Test accuracy: {test_acc * 100:.2f}%')

Training accuracy: 17.15%
Test accuracy: 15.85%


In [41]:
'''
Logistic Regression Classifier
'''

model = LogisticRegression(random_state=0)
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
## Evaluation of the Logistic Regression Classifier

train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)

print(f'Training accuracy: {train_acc * 100:.2f}%')
print(f'Test accuracy: {test_acc * 100:.2f}%')

Training accuracy: 67.00%
Test accuracy: 57.16%


In [43]:
'''
Decision Tree Classifier
'''

model = DecisionTreeClassifier(max_depth = 10, random_state=42)
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [44]:
# Compute accuracy on training and test sets
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)

print(f'Training accuracy: {train_acc * 100:.2f}%')
print(f'Test accuracy: {test_acc * 100:.2f}%')

Training accuracy: 82.44%
Test accuracy: 38.82%


In [44]:
'''
XGBoost Classifier
'''

model = XGBClassifier(n_estimators=700,learning_rate=0.2, max_depth=5)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.2, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=None, n_estimators=700,
              n_jobs=1, nthread=None, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

In [45]:
## Evaluation of the XGBoost Classifier

train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)

print(f'Training accuracy: {train_acc * 100:.2f}%')
print(f'Test accuracy: {test_acc * 100:.2f}%')

Training accuracy: 100.00%
Test accuracy: 62.77%
