# Base model

Final base model will be built and tested. Feature engineering is applied; numerical features are standardized.  
No hyperparameter tuning has been performed yet.

In [None]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import validation_curve

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

In [None]:
# Load the dataset from the CSV file and preview its first rows
df = pd.read_csv(filepath_or_buffer='data/spotify_dataset.csv')
df.head()

### Train-Test-Split

In [None]:
# First split: 70 % of the rows go to training, the remaining 30 % are held out
df_train, df_holdout = train_test_split(df, test_size=0.3, random_state=42)

print('df_train: ', df_train.shape)
print('df_test: ', df_holdout.shape)

# Second split: the held-out rows are divided into test data and validation data
# (33 % of the held-out rows become the validation set)
df_test, df_val = train_test_split(df_holdout, test_size=0.33, random_state=42)

print('df_test: ', df_test.shape)
print('df_val: ', df_val.shape)

### Data cleaning

In [None]:
from src.features.clean_data_func import clean_data

# Clean the training split first and preview the result
df_train_cleaned = clean_data(df_train)
display(df_train_cleaned.head())

# Run the same cleaning function over the test split, then the validation split
df_test_cleaned, df_val_cleaned = (clean_data(split) for split in (df_test, df_val))


### Feature Engineering

In [None]:
from src.features.feature_engineer_func import feature_engineer

# Engineer features on the cleaned training split first and preview the result
df_train_final = feature_engineer(df_train_cleaned)
display(df_train_final.head())

# Run the same function over the cleaned test split, then the cleaned validation split
df_test_final, df_val_final = (
    feature_engineer(split) for split in (df_test_cleaned, df_val_cleaned)
)


In [None]:
# Columns removed from the model inputs (this includes the target 'popularity_cat');
# no further feature engineering happens in this step
features_to_drop = [
    'track_id',
    'artists',
    'album_name',
    'track_name',
    'track_genre',
    'popularity',
    'popularity_cat',
]


def _split_features_target(frame):
    """Return (features, target): frame without features_to_drop, and its 'popularity_cat' column."""
    return frame.drop(columns=features_to_drop), frame['popularity_cat']


# apply the same split to the train, test and validation data
features_train, target_train = _split_features_target(df_train_final)
features_test, target_test = _split_features_target(df_test_final)
features_val, target_val = _split_features_target(df_val_final)

In [None]:
# Show the first rows and the shape of the training features, then of the training target
for train_part in (features_train, target_train):
    display(train_part.head(), train_part.shape)

### Data preparation and training

In [None]:
# print all feature column names so they can be copied into the lists below
print(features_train.columns)

# categorical columns (one-hot encoded in the pipeline)
CAT_COLS = ['key', 'time_signature']

# every other feature column is handled as a numerical column, in its original order
NUM_COLS = [name for name in features_train.columns if name not in CAT_COLS]

# one list per line
print(CAT_COLS, NUM_COLS, sep='\n')

In [None]:
# Pipelines to test with different models.
# Models of interest: classifiers with a "balanced weight" parameter, e.g.
# DecisionTreeClassifier, RandomForestClassifier, LogisticRegression, others?

# Preprocessing: standardize the numerical columns, one-hot encode the categorical ones.
# Categories not seen during fit are ignored at transform time (handle_unknown='ignore').
numeric_transformer = ('num', StandardScaler(), NUM_COLS)
categorical_transformer = ('cat', OneHotEncoder(handle_unknown='ignore'), CAT_COLS)
preprocessor = ColumnTransformer(
    transformers=[numeric_transformer, categorical_transformer]
)

# Pipeline #1: preprocessing followed by a class-weight-balanced random forest
random_forest = RandomForestClassifier(class_weight='balanced', random_state=42)
pipeline_rfc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', random_forest),
])

In [None]:
# Train model #1 on the training data and predict the test data in one chain
# (Pipeline.fit returns the fitted pipeline itself)
target_test_pred = pipeline_rfc.fit(features_train, target_train).predict(features_test)

# Test-set metrics: confusion matrix first, then the per-class report
test_confusion = confusion_matrix(target_test, target_test_pred)
print('Confusion Matrix: \n', test_confusion, '\n')

test_report = classification_report(target_test, target_test_pred)
print('Classification Report: \n', test_report)

In [None]:
# Predict the validation data with the already fitted pipeline
target_val_pred = pipeline_rfc.predict(features_val)

# Validation-set metrics: confusion matrix first, then the per-class report
val_confusion = confusion_matrix(target_val, target_val_pred)
print('Confusion Matrix: \n', val_confusion, '\n')

val_report = classification_report(target_val, target_val_pred)
print('Classification Report: \n', val_report)

In [None]:
# Save the validation-set classification report of this model in the results folder of src,
# so it can be loaded again for a direct comparison with the simple baseline model.
final_model_classification_report = pd.DataFrame(
    classification_report(target_val, target_val_pred, output_dict=True)
).transpose()

# Rename by column label rather than by position: a positional assignment would silently
# mislabel the metrics if the column order of the report ever differed.
final_model_classification_report = final_model_classification_report.rename(columns={
    'precision': 'precision_final',
    'recall': 'recall_final',
    'f1-score': 'f1_score_final',
    'support': 'support_final',
})
final_model_classification_report.to_csv('src/results/final_model_classification_report.csv')

In [None]:
# Compare this model's classification report with the one of the simple baseline model.
# The 'macro avg' and 'weighted avg' summary rows are dropped by label instead of by
# hard-coded file row numbers, so the right rows are removed for any number of classes.
# NOTE(review): assumes the baseline CSV uses the same row labels as the report written
# in this notebook; labels that are not present are ignored.
SUMMARY_ROWS = ['macro avg', 'weighted avg']

simple_baseline_report = pd.read_csv(
    'src/results/simple_model_classification_report.csv', index_col=0
).drop(index=SUMMARY_ROWS, errors='ignore')
final_model_report = pd.read_csv(
    'src/results/final_model_classification_report.csv', index_col=0
).drop(index=SUMMARY_ROWS, errors='ignore')

# put both reports side by side, aligned on the row labels
reports_combined = pd.concat([simple_baseline_report, final_model_report], axis=1)

# show floats with two decimals (this changes the pandas display option globally)
pd.options.display.float_format = '{:.2f}'.format

reports_combined

In [None]:
# 5-fold cross-validation of the whole pipeline on the training data,
# scored with the weighted F1 score and run on all available cores
cv_results = cross_val_score(
    pipeline_rfc,
    features_train,
    target_train,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
)

# mean score over the folds
cv_results.mean()

### Model interpretation

In [None]:
# Inspect feature_importances_ of the fitted classifier

# fitted steps of the pipeline
preprocessor = pipeline_rfc.named_steps['preprocessor']
model = pipeline_rfc.named_steps['classifier']

# Rebuild the feature names in the order the ColumnTransformer outputs them:
# the numerical columns first, then the one-hot encoded category columns.
# Each fitted entry is a (name, transformer, columns) tuple.
num_entry, cat_entry = preprocessor.transformers_[:2]
num_features = num_entry[2]
cat_features = cat_entry[1].get_feature_names_out(cat_entry[2])
all_features = np.concatenate([num_features, cat_features])

# importance of each transformed feature
importances = model.feature_importances_

# one row per feature, most important first
feature_importances = (
    pd.DataFrame({'feature': all_features, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# plot the top N; the rows are reversed so the most important feature is drawn at the top
top_n = 20
top_features = feature_importances.head(top_n).iloc[::-1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel("Feature Importance")
ax.set_title(f"Top {top_n} features of final model")
plt.tight_layout()


In [None]:
# Compute the learning curve (could take some time).
# The full pipeline is used as estimator, so the curve describes the same model that is
# fitted, cross-validated and reported in this notebook (scaling + one-hot encoding +
# random forest) instead of a bare random forest on unprocessed features.
# learning_curve clones the estimator for each fit, so the fitted pipeline_rfc is not changed.
train_sizes, train_scores, test_scores = learning_curve(estimator=pipeline_rfc,
                                                        X=features_train,
                                                        y=target_train,
                                                        cv=5,
                                                        scoring='f1_weighted',
                                                        n_jobs=-1,
                                                        train_sizes=np.linspace(0.1, 1.0, 5))

# average the scores over the cross-validation folds for each training set size
train_sizes_lc = train_sizes
train_mean_lc = train_scores.mean(axis=1)
test_mean_lc = test_scores.mean(axis=1)

In [None]:
# Plot the mean training and cross-validation scores against the training set size
fig_lc, ax = plt.subplots(figsize=(6, 4))
ax.plot(train_sizes_lc, train_mean_lc, color='red', label="train")
ax.plot(train_sizes_lc, test_mean_lc, color='blue', label="validation")

ax.set(
    title="Learning Curve",
    xlabel="Training Set Size",
    ylabel="F1-Score (weighted)",
)
ax.legend(loc="best")
fig_lc;