In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math

In [2]:
# Load data of features
songs = pd.read_csv("data_100_genre.csv", sep=',')

# Only get rows with complete preview_url
songs_complete = songs[songs[['preview_url']].notnull().all(1)]

# Keep necessary variables
songs_clean = songs_complete[['track_name', 'track_genre',
                     'key', 'loudness', 'mode', 'tempo', 'time_signature', 
                     'energy', 'danceability', 'speechiness', 'acousticness',
                     'instrumentalness', 'liveness', 'valence']]


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Encode outcome ordinally
enc = OrdinalEncoder()
songs_clean[['track_genre']] = enc.fit_transform(songs_clean[['track_genre']])

# Train-test split
train, test = train_test_split(songs_clean, test_size=0.25, random_state=5)

# Split outcome from predictors
train_y = train['track_genre']
train_X = train.drop(columns=['track_genre', 'track_name'])
train_names = train['track_name']
test_y = test['track_genre']
test_X = test.drop(columns=['track_genre', 'track_name'])
test_names = test['track_name']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_clean[['track_genre']] = enc.fit_transform(songs_clean[['track_genre']])


In [11]:
# Pipelines for numeric and categorical variables
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score 
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer, make_column_selector
import xgboost as xgb

## Seperate pipelines for numeric vs. cat vars
num_transformer = Pipeline([("scaler", MinMaxScaler())])
cat_transformer = Pipeline([("encoder", OneHotEncoder(sparse_output=False, 
                                                      handle_unknown='ignore'))])

## Column ids
cat_cols = ['key', 'mode']
num_cols = ['loudness', 'tempo', 'time_signature', 
            'energy', 'danceability', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence']

## Pre-processor
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

## Final pipeline
pipe = Pipeline([
  ('preprocessor', preprocessor),
  ('model', xgb.XGBClassifier(eval_metric='error'))
])

pipe.fit(train_X, train_y)


In [12]:
# Out-of-sample accuracy
np.average(cross_val_score(pipe, train_X, train_y, cv = 5))


0.3833333333333333

In [13]:
# Evaluate on test data
pipe.score(test_X, test_y)

0.48