# Base model

The final base model is built and tested in this notebook. Feature engineering is applied; numerical features are standardized.  
No hyperparameter tuning yet.

In [1]:
# import modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# project-local helpers
from clean_data_func import clean_data
from feature_engineer_func import feature_engineer

In [2]:
# Load the raw Spotify track dataset and preview its first rows.
# NOTE(review): the preview appears to include an 'Unnamed: 0' column that
# looks like a saved row index -- presumably handled by clean_data; confirm.
DATA_PATH = 'data/spotify_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


### Train-Test-Split

In [3]:
# Train / test / validation split.
# Step 1 sets aside HOLDOUT_FRACTION of all rows; step 2 divides that hold-out
# into the test set and the validation set (VAL_FRACTION_OF_HOLDOUT of it).
# With 0.3 and 0.33 this is roughly 70 % train / 20 % test / 10 % validation.
HOLDOUT_FRACTION = 0.3
VAL_FRACTION_OF_HOLDOUT = 0.33
RANDOM_STATE = 42  # fixed seed so both splits are reproducible

df_train, df_holdout = train_test_split(
    df, test_size=HOLDOUT_FRACTION, random_state=RANDOM_STATE)

print('df_train: ', df_train.shape)
print('df_holdout: ', df_holdout.shape)

# The intermediate hold-out keeps its own name so that df_test only ever
# refers to the final test set (it used to be bound to the hold-out first and
# then overwritten, and both were printed under the same label).
df_test, df_val = train_test_split(
    df_holdout, test_size=VAL_FRACTION_OF_HOLDOUT, random_state=RANDOM_STATE)

print('df_test: ', df_test.shape)
print('df_val: ', df_val.shape)

df_train:  (79800, 21)
df_test:  (34200, 21)
df_test:  (22914, 21)
df_val:  (11286, 21)


### Data cleaning

In [4]:
# Run the shared cleaning step on each split separately
# (clean_data comes from the import cell at the top of the notebook).

# training split, with a preview of the cleaned result
df_train_cleaned = clean_data(df_train)
display(df_train_cleaned.head())

# test and validation splits
df_test_cleaned = clean_data(df_test)
df_val_cleaned = clean_data(df_val)


Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,popularity_cat
41996,7hUhmkALyQ8SX9mJs5XI3D,Love and Rockets,Love and Rockets,Motorcycle,22,211533,0,0.305,0.849,9,...,1,0.0549,5.8e-05,0.0567,0.464,0.32,141.793,4,goth,Low
76471,5x59U89ZnjZXuNAAlc8X1u,Filippa Giordano,Filippa Giordano,"Addio del passato - From ""La traviata""",22,196000,0,0.287,0.19,7,...,0,0.037,0.93,0.000356,0.0834,0.133,83.685,4,opera,Low
54809,70Vng5jLzoJLmeLu3ayBQq,Susumu Yokota,Symbol,Purple Rose Minuet,37,216506,0,0.583,0.509,1,...,1,0.0362,0.777,0.202,0.115,0.544,90.459,3,idm,Medium
16326,1cRfzLJapgtwJ61xszs37b,Franz Liszt;YUNDI,Relajación y siestas,"Liebeslied (Widmung), S. 566",0,218346,0,0.163,0.0368,8,...,1,0.0472,0.991,0.899,0.107,0.0387,69.442,3,classical,Unknown
109799,47d5lYjbiMy0EdMRV8lRou,Scooter,Scooter Forever,The Darkside,27,173160,0,0.647,0.921,2,...,1,0.185,0.000939,0.371,0.131,0.171,137.981,4,techno,Medium


### Feature Engineering

In [13]:
# Add the engineered columns to each cleaned split
# (feature_engineer comes from the import cell at the top of the notebook).
#
# NOTE(review): in the preview, artist_popularity / album_popularity track the
# row's own popularity (e.g. 22 -> 22.0 / 22.0, 0 -> 0.0 / 0.142857). If
# feature_engineer derives them from `popularity` inside the frame it is
# given, the test/val features are computed from their own labels (target
# leakage). TODO confirm, and if so fit these aggregates on the training
# split only and map them onto test/val.

# training split, with a preview of the engineered result
df_train_final = feature_engineer(df_train_cleaned)
display(df_train_final.head())

# test and validation splits
df_test_final = feature_engineer(df_test_cleaned)
df_val_final = feature_engineer(df_val_cleaned)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,time_signature,track_genre,popularity_cat,artist_popularity,album_popularity,track_name_length
41996,7hUhmkALyQ8SX9mJs5XI3D,Love and Rockets,Love and Rockets,Motorcycle,22,211533,0,0.305,0.849,9,...,0.0567,0.464,0.32,141.793,4,goth,Low,22.0,22.0,10
76471,5x59U89ZnjZXuNAAlc8X1u,Filippa Giordano,Filippa Giordano,"Addio del passato - From ""La traviata""",22,196000,0,0.287,0.19,7,...,0.000356,0.0834,0.133,83.685,4,opera,Low,23.0,21.0,38
54809,70Vng5jLzoJLmeLu3ayBQq,Susumu Yokota,Symbol,Purple Rose Minuet,37,216506,0,0.583,0.509,1,...,0.202,0.115,0.544,90.459,3,idm,Medium,20.882353,37.0,18
16326,1cRfzLJapgtwJ61xszs37b,Franz Liszt;YUNDI,Relajación y siestas,"Liebeslied (Widmung), S. 566",0,218346,0,0.163,0.0368,8,...,0.899,0.107,0.0387,69.442,3,classical,Unknown,0.0,0.142857,28
109799,47d5lYjbiMy0EdMRV8lRou,Scooter,Scooter Forever,The Darkside,27,173160,0,0.647,0.921,2,...,0.371,0.131,0.171,137.981,4,techno,Medium,23.848485,26.5,12


In [17]:
# Separate the model inputs from the target for every split
# (no further feature engineering happens here).
# Identifier and text columns as well as both popularity columns are removed
# from the inputs; 'popularity_cat' is the classification target.
features_to_drop = [
    'track_id', 'artists', 'album_name', 'track_name',
    'track_genre', 'popularity', 'popularity_cat',
]


def split_features_target(frame):
    """Return (features, target) for one split.

    Features are `frame` without the columns listed in `features_to_drop`;
    the target is the 'popularity_cat' column of `frame`.
    """
    return frame.drop(columns=features_to_drop), frame['popularity_cat']


features_train, target_train = split_features_target(df_train_final)
features_test, target_test = split_features_target(df_test_final)
features_val, target_val = split_features_target(df_val_final)

In [18]:
# Sanity check of the training inputs: first rows, then shape.
display(features_train.head())
display(features_train.shape)

# Same check for the training target.
display(target_train.head())
display(target_train.shape)

Unnamed: 0,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artist_popularity,album_popularity,track_name_length
41996,211533,0,0.305,0.849,9,-10.795,1,0.0549,5.8e-05,0.0567,0.464,0.32,141.793,4,22.0,22.0,10
76471,196000,0,0.287,0.19,7,-12.03,0,0.037,0.93,0.000356,0.0834,0.133,83.685,4,23.0,21.0,38
54809,216506,0,0.583,0.509,1,-9.661,1,0.0362,0.777,0.202,0.115,0.544,90.459,3,20.882353,37.0,18
16326,218346,0,0.163,0.0368,8,-23.149,1,0.0472,0.991,0.899,0.107,0.0387,69.442,3,0.0,0.142857,28
109799,173160,0,0.647,0.921,2,-7.294,1,0.185,0.000939,0.371,0.131,0.171,137.981,4,23.848485,26.5,12


(62459, 17)

41996         Low
76471         Low
54809      Medium
16326     Unknown
109799     Medium
Name: popularity_cat, dtype: category
Categories (4, object): ['Unknown' < 'Low' < 'Medium' < 'High']

(62459,)

### Data preparation and training

In [19]:
# List the feature column names (handy for copy-pasting into the
# preprocessing / pipeline definitions further down).
features_train.columns

Index(['duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'artist_popularity',
       'album_popularity', 'track_name_length'],
      dtype='object')

In [None]:
#### TODO: Do the following 4 numerically encoded (but actually categorical) features need one-hot encoding? Does that depend on the model? Or should they just be put into the pipeline unchanged?
# - 'key' (0-11)
# - 'mode' (0-1)
# - 'time_signature' (0-4)
# - 'explicit' (0-1)

In [9]:
# defining pipelines to test with different models
# models of interest: classifiers that support class_weight='balanced', e.g. DecisionTreeClassifier, RandomForestClassifier, LogisticRegression; others?

# old simple version, #### to adjust!
# pipeline_tree = Pipeline(steps=[('preprocessor', StandardScaler()),
#    ('model', DecisionTreeClassifier(class_weight='balanced', random_state=42))
#])

In [None]:
# training model - #### to adjust!
#pipeline_tree.fit(features_train, target_train)

# predicting on test data
#target_test_pred = pipeline_tree.predict(features_test)

# show metrics
#print('Accuracy: ', accuracy_score(target_test, target_test_pred))
#print('Precision: ', precision_score(target_test, target_test_pred, average='weighted'))
#print('Recall: ', recall_score(target_test, target_test_pred, average='weighted'))
#print('F1-Score: ', f1_score(target_test, target_test_pred, average='weighted'))
#print('Confusion Matrix: \n', confusion_matrix(target_test, target_test_pred), '\n')
#print('Classification Report: \n', classification_report(target_test, target_test_pred))

NameError: name 'target_test_pred' is not defined

In [None]:
# predicting on val data - #### to adjust!
#target_val_pred = pipeline_tree.predict(features_val)

# show metrics
#print('Confusion Matrix: \n', confusion_matrix(target_val, target_val_pred), '\n')
#print('Classification Report: \n', classification_report(target_val, target_val_pred))

In [None]:
# check cross validation score - #### to adjust!
#cv_results = cross_val_score(estimator=pipeline_tree,
#                            X=features_train,
#                            y=target_train,
#                            cv=5,
#                            scoring='f1_weighted',
#                            n_jobs=-1)
#cv_results.mean()