# Base model

The final base model is built and tested in this notebook. Feature engineering is applied, and numerical features are standardized.  
No hyperparameter tuning is done yet.

In [1]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import validation_curve

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

from src.features.data_prep_for_model import clean_data, feature_engineer, prep_data_for_model

In [2]:
# Load the raw Spotify track data and preview the first rows
DATASET_PATH = 'data/spotify_dataset.csv'

df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [None]:
# prep_data_for_model returns six objects; unpack them in the order
# train -> test -> val, each as a (features, target) pair
(
    features_train, target_train,
    features_test, target_test,
    features_val, target_val,
) = prep_data_for_model(df)

features_train.head()

Unnamed: 0,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,tracks_per_artist,track_name_length,album_name_length
41996,211533,0,0.305,0.849,9,-10.795,1,0.0549,5.8e-05,0.0567,0.464,0.32,141.793,4,1,10,16
76471,196000,0,0.287,0.19,7,-12.03,0,0.037,0.93,0.000356,0.0834,0.133,83.685,4,7,38,16
54809,216506,0,0.583,0.509,1,-9.661,1,0.0362,0.777,0.202,0.115,0.544,90.459,3,17,18,6
16326,218346,0,0.163,0.0368,8,-23.149,1,0.0472,0.991,0.899,0.107,0.0387,69.442,3,1,28,20
109799,173160,0,0.647,0.921,2,-7.294,1,0.185,0.000939,0.371,0.131,0.171,137.981,4,99,12,15


### Data preparation and training

In [None]:
# Print the available columns (handy for copy-paste)
print(features_train.columns)

# Columns that are one-hot encoded in the pipeline
CAT_COLS = ['key', 'time_signature']

# Everything else is treated as a numeric feature, in original column order.
# NOTE(review): any identifier/index-like column still present in
# features_train would end up here as a numeric feature - confirm that
# prep_data_for_model has already dropped such columns.
_categorical = set(CAT_COLS)
NUM_COLS = [name for name in features_train.columns if name not in _categorical]

print(CAT_COLS)
print(NUM_COLS)

In [None]:
# Pipelines to test with different models.
# Models of interest: classifiers with a "balanced" class-weight option,
# e.g. DecisionTreeClassifier, RandomForestClassifier, LogisticRegression, others?

# Preprocessing: standardize numeric columns, one-hot encode categorical ones.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros
# instead of raising at predict time.
numeric_step = ('num', StandardScaler(), NUM_COLS)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), CAT_COLS)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Pipeline #1: random forest with class weights balanced against class frequency
random_forest = RandomForestClassifier(class_weight='balanced', random_state=42)
pipeline_rfc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', random_forest),
])

In [None]:
# Fit pipeline #1 on the training split
pipeline_rfc.fit(features_train, target_train)

# Predict the held-out test split
target_test_pred = pipeline_rfc.predict(features_test)

# Report test-set metrics
test_confusion = confusion_matrix(target_test, target_test_pred)
test_report = classification_report(target_test, target_test_pred)
print('Confusion Matrix: \n', test_confusion, '\n')
print('Classification Report: \n', test_report)

In [None]:
# Predict the validation split with the already fitted pipeline
target_val_pred = pipeline_rfc.predict(features_val)

# Report validation-set metrics
val_confusion = confusion_matrix(target_val, target_val_pred)
val_report = classification_report(target_val, target_val_pred)
print('Confusion Matrix: \n', val_confusion, '\n')
print('Classification Report: \n', val_report)

In [None]:
# Persist the validation-set classification report of this final model as CSV
# so it can be compared directly with the simple baseline model's report.
val_report_dict = classification_report(target_val, target_val_pred, output_dict=True)

# One row per class / summary entry, one column per metric
final_model_classification_report = pd.DataFrame(val_report_dict).T

# Suffix the metric columns so they stay distinguishable next to the baseline's
final_model_classification_report.columns = [
    'precision_final',
    'recall_final',
    'f1_score_final',
    'support_final',
]

final_model_classification_report.to_csv('classification_reports/final_model_classification_report.csv')

In [None]:
# Compare the per-class validation metrics of this model with the simple
# baseline model, side by side.

# The averaged summary rows are dropped by label rather than by hard-coded CSV
# line numbers, so the comparison stays correct when the number of classes changes.
SUMMARY_ROWS = ['macro avg', 'weighted avg']

simple_baseline_report = (
    pd.read_csv('classification_reports/simple_model_classification_report.csv', index_col=0)
    .drop(index=SUMMARY_ROWS, errors='ignore')
)
final_model_report = (
    pd.read_csv('classification_reports/final_model_classification_report.csv', index_col=0)
    .drop(index=SUMMARY_ROWS, errors='ignore')
)

# Both reports are indexed by the same row labels, so concatenating along the
# columns aligns the metrics of both models per row
reports_combined = pd.concat([simple_baseline_report, final_model_report], axis=1)

# NOTE: this changes the float display format for the rest of the session
pd.options.display.float_format = '{:.2f}'.format

reports_combined

In [None]:
# 5-fold cross-validation of the full pipeline on the training split,
# scored with the weighted F1; the cell shows the mean over the folds
cv_results = cross_val_score(
    pipeline_rfc,
    features_train,
    target_train,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,  # use all available cores
)
np.mean(cv_results)

### Model interpretation

In [None]:
# Inspect which features the fitted random forest relies on most

# pull the fitted steps out of the pipeline
model = pipeline_rfc.named_steps['classifier']
preprocessor = pipeline_rfc.named_steps['preprocessor']

# rebuild the feature names in ColumnTransformer output order:
# first the numeric columns, then the one-hot encoded categorical columns
num_transformer, cat_transformer = preprocessor.transformers_[:2]
num_features = num_transformer[2]
cat_features = cat_transformer[1].get_feature_names_out(cat_transformer[2])
all_features = np.concatenate([num_features, cat_features])

# importances as reported by the classifier
importances = model.feature_importances_

# one row per feature, most important first
feature_importances = pd.DataFrame({'feature': all_features, 'importance': importances})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# plot the top N; reversed so the most important feature ends up at the top
# of the horizontal bar chart
top_n = 20
top_features = feature_importances.head(top_n).iloc[::-1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel("Feature Importance")
ax.set_title(f"Top {top_n} features of final model")
fig.tight_layout()


In [None]:
# Compute the learning curve of the final model (could take some time: the
# estimator is refitted for every training-set size and every CV fold).
#
# The full pipeline is passed instead of a bare RandomForestClassifier so that
# the curve describes the same model (scaling + one-hot encoding + classifier)
# that is evaluated and cross-validated in this notebook. learning_curve works
# on clones of the estimator, so the already fitted pipeline_rfc is not modified.
train_sizes, train_scores, test_scores = learning_curve(estimator=pipeline_rfc,
                                                        X=features_train,
                                                        y=target_train,
                                                        cv=5,
                                                        scoring='f1_weighted',
                                                        n_jobs=-1,
                                                        train_sizes=np.linspace(0.1, 1.0, 5))

# Average the per-fold scores for each training-set size
train_sizes_lc = train_sizes
train_mean_lc = train_scores.mean(axis=1)
test_mean_lc = test_scores.mean(axis=1)

In [None]:
# Plot mean train vs. cross-validation score against the training-set size
fig_lc, ax = plt.subplots(figsize=(6,4))

curves = [
    (train_mean_lc, "train", 'red'),
    (test_mean_lc, "validation", 'blue'),
]
for mean_scores, curve_label, curve_color in curves:
    ax.plot(train_sizes_lc, mean_scores, label=curve_label, color=curve_color)

ax.set(
    title="Learning Curve",
    xlabel="Training Set Size",
    ylabel="F1-Score (weighted)",
)
ax.legend(loc="best")
fig_lc;