# Simple base line model (for comparison with final model)

A simple base model will be built and tested.  
Feature engineering is not applied except simple one hot encoding for suitable categorical features with a low amount of unique values only; numerical features are also standardized.  
No Hyperparameter-Tuning yet.

In [13]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import validation_curve

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

In [14]:
# read data
df = pd.read_csv('data/spotify_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


### Train-Test-Split

In [15]:
# Train-Test-Split
df_train, df_test = train_test_split(df, test_size = 0.3, random_state = 42)

print('df_train: ', df_train.shape)
print('df_test: ', df_test.shape)

# Second Train-Test-Split for val/aim data
df_test, df_val = train_test_split(df_test, test_size=0.33, random_state = 42)

print('df_test: ', df_test.shape)
print('df_val: ', df_val.shape)

df_train:  (79800, 21)
df_test:  (34200, 21)
df_test:  (22914, 21)
df_val:  (11286, 21)


### Data cleaning

In [16]:
from clean_data_func import clean_data

#apply clean_data function on train data
df_train_cleaned = clean_data(df_train)
display(df_train_cleaned.head())

#apply clean_data function on test and val data
df_test_cleaned = clean_data(df_test)
df_val_cleaned = clean_data(df_val)


Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,popularity_cat
41996,7hUhmkALyQ8SX9mJs5XI3D,Love and Rockets,Love and Rockets,Motorcycle,22,211533,0,0.305,0.849,9,...,1,0.0549,5.8e-05,0.0567,0.464,0.32,141.793,4,goth,Low
76471,5x59U89ZnjZXuNAAlc8X1u,Filippa Giordano,Filippa Giordano,"Addio del passato - From ""La traviata""",22,196000,0,0.287,0.19,7,...,0,0.037,0.93,0.000356,0.0834,0.133,83.685,4,opera,Low
54809,70Vng5jLzoJLmeLu3ayBQq,Susumu Yokota,Symbol,Purple Rose Minuet,37,216506,0,0.583,0.509,1,...,1,0.0362,0.777,0.202,0.115,0.544,90.459,3,idm,Medium
16326,1cRfzLJapgtwJ61xszs37b,Franz Liszt;YUNDI,Relajación y siestas,"Liebeslied (Widmung), S. 566",0,218346,0,0.163,0.0368,8,...,1,0.0472,0.991,0.899,0.107,0.0387,69.442,3,classical,New
109799,47d5lYjbiMy0EdMRV8lRou,Scooter,Scooter Forever,The Darkside,27,173160,0,0.647,0.921,2,...,1,0.185,0.000939,0.371,0.131,0.171,137.981,4,techno,Low


In [17]:
# splitting train data into features and target without further feature engineering
features_to_drop = [
    'track_id',
    'artists',
    'album_name',
    'track_name',
    'track_genre',
    'popularity',
    'popularity_cat']

features_train = df_train_cleaned.drop(features_to_drop, axis = 1)
target_train = df_train_cleaned['popularity_cat']

# splitting test data into features and target
features_test = df_test_cleaned.drop(features_to_drop, axis = 1)
target_test = df_test_cleaned['popularity_cat']

# splitting val data into features and target
features_val = df_val_cleaned.drop(features_to_drop, axis = 1)
target_val = df_val_cleaned['popularity_cat']

In [29]:
# check features and target of train data
display(features_train.head(), features_train.shape)
display(target_train.head(), target_train.shape)

Unnamed: 0,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
41996,211533,0,0.305,0.849,9,-10.795,1,0.0549,5.8e-05,0.0567,0.464,0.32,141.793,4
76471,196000,0,0.287,0.19,7,-12.03,0,0.037,0.93,0.000356,0.0834,0.133,83.685,4
54809,216506,0,0.583,0.509,1,-9.661,1,0.0362,0.777,0.202,0.115,0.544,90.459,3
16326,218346,0,0.163,0.0368,8,-23.149,1,0.0472,0.991,0.899,0.107,0.0387,69.442,3
109799,173160,0,0.647,0.921,2,-7.294,1,0.185,0.000939,0.371,0.131,0.171,137.981,4


(62459, 14)

41996        Low
76471        Low
54809     Medium
16326        New
109799       Low
Name: popularity_cat, dtype: category
Categories (4, object): ['New' < 'Low' < 'Medium' < 'High']

(62459,)

### Data preparation and training

In [19]:
# getting columns easy for copy-paste
features_train.columns

Index(['duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature'],
      dtype='object')

In [20]:
## classifier DecisionTreeClassifier() can handle numerical (which are actual categorical) features well:
# - 'key' (0-11)
# - 'mode' (0-1)
# - 'time_signature' (0-4)
# - 'explicit' (0-1)

In [21]:
# defining simple base pipeline with DecisionTreeClassifier
pipeline_tree = Pipeline(steps=[('preprocessor', StandardScaler()),
    ('model', DecisionTreeClassifier(class_weight='balanced', random_state=42))
])

In [22]:
# training model
pipeline_tree.fit(features_train, target_train)

# predicting on test data
target_test_pred = pipeline_tree.predict(features_test)

# show metrics
#print('Accuracy: ', accuracy_score(target_test, target_test_pred))
#print('Precision: ', precision_score(target_test, target_test_pred, average='weighted'))
#print('Recall: ', recall_score(target_test, target_test_pred, average='weighted'))
#print('F1-Score: ', f1_score(target_test, target_test_pred, average='weighted'))
print('Confusion Matrix: \n', confusion_matrix(target_test, target_test_pred), '\n')
print('Classification Report: \n', classification_report(target_test, target_test_pred))

Confusion Matrix: 
 [[ 761  143  297   58]
 [ 285 3979 2576  609]
 [ 477 2277 6435  536]
 [  83  372  274 1295]] 

Classification Report: 
               precision    recall  f1-score   support

        High       0.47      0.60      0.53      1259
         Low       0.59      0.53      0.56      7449
      Medium       0.67      0.66      0.67      9725
         New       0.52      0.64      0.57      2024

    accuracy                           0.61     20457
   macro avg       0.56      0.61      0.58     20457
weighted avg       0.61      0.61      0.61     20457



In [23]:
# predicting on val data
target_val_pred = pipeline_tree.predict(features_val)

# show metrics
print('Confusion Matrix: \n', confusion_matrix(target_val, target_val_pred), '\n')
print('Classification Report: \n', classification_report(target_val, target_val_pred))

Confusion Matrix: 
 [[ 419   95  140   49]
 [ 160 2029 1358  326]
 [ 210 1151 3191  237]
 [  49  215  133  792]] 

Classification Report: 
               precision    recall  f1-score   support

        High       0.50      0.60      0.54       703
         Low       0.58      0.52      0.55      3873
      Medium       0.66      0.67      0.66      4789
         New       0.56      0.67      0.61      1189

    accuracy                           0.61     10554
   macro avg       0.58      0.61      0.59     10554
weighted avg       0.61      0.61      0.61     10554



In [24]:
# check cross validation score
cv_results = cross_val_score(estimator=pipeline_tree,
                            X=features_train,
                            y=target_train,
                            cv=5,
                            scoring='f1_weighted',
                            n_jobs=-1)
cv_results.mean()

np.float64(0.46195863237346824)