In [37]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn import preprocessing
import random

In [26]:
""" Loading the data """

# train: two CSV tables joined on their shared track_id column
train_clean = pd.read_csv("train_clean.csv", sep=",")
train_data = pd.read_csv("train_data.csv", sep=",")
train = train_clean.merge(train_data, on='track_id')
print(f"train data: {train.shape}")

# test: single CSV table
test = pd.read_csv("test_data.csv", sep=",")
print(f"test data: {test.shape}")

# ids: the submission template with its genre_id column removed
ids = pd.read_csv("submission.csv", sep=",").drop(columns=['genre_id'])
print(f"ids: {ids.shape}")

train data: (3995, 541)
test data: (4006, 540)
ids: (4008, 1)


In [27]:
""" Constructing the training and test sets """

# Features are every column except the label (genre_id) and the row key
# (track_id); the key is kept separately for the test set so predictions
# can be matched back to tracks later.
X_train = train.drop(columns=['genre_id', 'track_id'])
y_train = train['genre_id'].values
X_test = test.drop(columns=['track_id'])
test_id = test['track_id'].values

# Shape check. This must read the names assigned above (X_train / X_test):
# the lowercase x_train / x_test are never assigned in the visible notebook,
# so referencing them raises NameError on a fresh kernel.
print(f"x_train: {X_train.shape}, y_train: {y_train.shape}, x_test: {X_test.shape}, test_id: {test_id.shape}")

x_train: (3995, 539), y_train: (3995,), x_test: (4006, 539), test_id: (4006,)


In [28]:
""" Normalisation """

# Fit the scaler on the training features only, then apply that same fitted
# transformation to the test features.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [29]:
'''
XGBoost Classifier
'''

# Recent XGBoost releases require class labels to be exactly 0..n_classes-1.
# The genre ids presumably do not start at 0 (the output cell fills missing
# ids with values in 1..8), so encode them for fitting and decode the
# predictions back to the original genre ids afterwards.
label_encoder = preprocessing.LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

model = XGBClassifier(n_estimators=500, learning_rate=0.2, max_depth=5)
model.fit(X_train, y_train_encoded)

# Predict on the scaled X_test. The lowercase x_test is never assigned in the
# visible notebook: on a fresh kernel it raises NameError, and on a stale
# kernel it would silently be some other (possibly unscaled) array.
predictions = label_encoder.inverse_transform(model.predict(X_test))

In [39]:
''' Output '''

# Dedicated, seeded generator so the random fallback below produces the same
# submission file on every Restart & Run All.
FILL_SEED = 42
fill_rng = random.Random(FILL_SEED)

output = pd.DataFrame({'track_id': test_id, 'genre_id': predictions})
# Right join keeps every track_id listed in `ids`; ids that have no
# prediction end up with NaN in genre_id.
output = output.merge(ids, on='track_id', how='right')

# fill missing id with a random genre
# One candidate value in 1..8 is drawn per row; fillna only uses the
# candidates on rows where genre_id is actually missing.
random_genres = pd.Series(
    [fill_rng.randint(1, 8) for _ in range(len(output))],
    index=output.index,
)
output['genre_id'] = output['genre_id'].fillna(random_genres).astype(int)
output = output.set_index('track_id')

output.to_csv('submissionKaggle.csv')
