# Base model

The final base model is built and tested in this notebook. Feature engineering is applied and numerical features are standardized.  
No hyperparameter tuning is performed yet.

In [None]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import validation_curve

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

from src.features.data_prep_for_model import clean_data, feature_engineer, prep_data_for_model, pipeline_classifier

In [None]:
# load the Spotify dataset from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer='data/spotify_dataset.csv')
df.head(n=5)

### Data preparation

In [None]:
# prep_data_for_model returns six objects: features and target for the
# train, test and validation sets, unpacked here in that order
(
    features_train, target_train,
    features_test, target_test,
    features_val, target_val,
) = prep_data_for_model(df)

# preview the training features
features_train.head(n=5)

In [None]:
# show the available columns so the names can be copied into the lists below
print(features_train.columns)

# categorical columns that the pipeline should one-hot encode
CAT_COLS = ['key', 'time_signature']

# every other column is treated as numerical; the original column order is kept
NUM_COLS = list(filter(lambda col: col not in CAT_COLS, features_train.columns))

# one list per output line
print(CAT_COLS, NUM_COLS, sep='\n')

### Model training
Models of interest: classifiers that support the `class_weight='balanced'` parameter, such as RandomForestClassifier and LogisticRegression.

In [None]:
# candidate model #1: RandomForestClassifier with balanced class weights,
# built through the project's pipeline helper (preprocessing + classifier)
pipeline_rfc = pipeline_classifier(
    cat_cols=CAT_COLS,
    num_cols=NUM_COLS,
    classifier=RandomForestClassifier,
    class_weight='balanced',
    random_state=42,
)

# fit on the training split, then predict the test and validation splits
pipeline_rfc.fit(features_train, target_train)
target_test_pred = pipeline_rfc.predict(features_test)
target_val_pred = pipeline_rfc.predict(features_val)

# print the metrics: test split first, validation split second
for target_true, target_pred in ((target_test, target_test_pred),
                                 (target_val, target_val_pred)):
    print('Confusion Matrix: \n', confusion_matrix(target_true, target_pred), '\n')
    print('Classification Report: \n', classification_report(target_true, target_pred))

# store the validation report in the classification_reports folder so it can
# be loaded later for a side-by-side comparison; columns carry the model suffix
rfc_model_classification_report = pd.DataFrame(
    classification_report(target_val, target_val_pred, output_dict=True)
).T
rfc_model_classification_report.columns = [
    'precision_rfc',
    'recall_rfc',
    'f1_score_rfc',
    'support_rfc',
]
rfc_model_classification_report.to_csv('classification_reports/rfc_model_classification_report.csv')

In [None]:
# candidate model #2: LogisticRegression with balanced class weights,
# built through the project's pipeline helper (preprocessing + classifier)
pipeline_log = pipeline_classifier(
    cat_cols=CAT_COLS,
    num_cols=NUM_COLS,
    classifier=LogisticRegression,
    max_iter=1000,
    C=0.5,
    class_weight='balanced',
    random_state=42,
)

# fit on the training split, then predict the test and validation splits
pipeline_log.fit(features_train, target_train)
target_test_pred = pipeline_log.predict(features_test)
target_val_pred = pipeline_log.predict(features_val)

# print the metrics: test split first, validation split second
for target_true, target_pred in ((target_test, target_test_pred),
                                 (target_val, target_val_pred)):
    print('Confusion Matrix: \n', confusion_matrix(target_true, target_pred), '\n')
    print('Classification Report: \n', classification_report(target_true, target_pred))

# store the validation report in the classification_reports folder so it can
# be loaded later for a side-by-side comparison; columns carry the model suffix
log_model_classification_report = pd.DataFrame(
    classification_report(target_val, target_val_pred, output_dict=True)
).T
log_model_classification_report.columns = [
    'precision_log',
    'recall_log',
    'f1_score_log',
    'support_log',
]
log_model_classification_report.to_csv('classification_reports/log_model_classification_report.csv')

In [None]:
# load the stored validation reports of all models; file lines 6 and 7 are
# skipped, which is intended to drop the macro avg and weighted avg rows
simple_baseline_report, rfc_model_report, log_model_report = (
    pd.read_csv(report_path, index_col=0, skiprows=[6, 7])
    for report_path in (
        'classification_reports/simple_model_classification_report.csv',
        'classification_reports/rfc_model_classification_report.csv',
        'classification_reports/log_model_classification_report.csv',
    )
)

# display floats with two decimals
pd.set_option('display.float_format', '{:.2f}'.format)

# place the reports side by side, aligned on the shared row index
reports_combined = pd.concat(
    [log_model_report, simple_baseline_report, rfc_model_report],
    axis='columns',
)

reports_combined

In [None]:
# 5-fold cross-validated weighted F1 of model #1 (RandomForestClassifier pipeline)
# on the training split, using all available cores
cv_results_rfc = cross_val_score(
    estimator=pipeline_rfc,
    X=features_train,
    y=target_train,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)

# average over the folds
np.mean(cv_results_rfc)

In [None]:
# 5-fold cross-validated weighted F1 of model #2 (LogisticRegression pipeline)
# on the training split, using all available cores
cv_results_log = cross_val_score(
    estimator=pipeline_log,
    X=features_train,
    y=target_train,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)

# average over the folds
np.mean(cv_results_log)

### Model interpretation (only for the favored random forest model, no hyperparameter tuning yet)

In [None]:
## Final model chosen based on previous scores: RandomForestClassifier
# inspect the feature_importances_ of the fitted random forest pipeline

# pull the fitted steps out of the pipeline by name
preprocessor = pipeline_rfc.named_steps['preprocessor']
model = pipeline_rfc.named_steps['classifier']

# rebuild the feature names in the order the ColumnTransformer emits them
# NOTE(review): assumes transformer 0 handles the numerical columns and
# transformer 1 is the one-hot encoder - confirm against pipeline_classifier
num_entry = preprocessor.transformers_[0]
cat_entry = preprocessor.transformers_[1]
num_features = num_entry[2]
cat_features = cat_entry[1].get_feature_names_out(cat_entry[2])
all_features = np.concatenate([num_features, cat_features])

# importance values reported by the classifier
importances = model.feature_importances_

# one row per feature, most important first
feature_importances = pd.DataFrame({'feature': all_features, 'importance': importances})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# horizontal bar chart of the top N; rows are reversed so the most
# important feature ends up at the top of the chart
top_n = 20
top_features = feature_importances.head(top_n).iloc[::-1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel("Feature Importance")
ax.set_title(f"Top {top_n} features of final model")
plt.tight_layout()

In [None]:
# learning curve of a random forest with default hyperparameters
# (5-fold CV, weighted F1, five training sizes from 10% to 100%); may take a while
train_sizes, train_scores, test_scores = learning_curve(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    X=features_train,
    y=target_train,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
)

# keep the sizes and the per-size mean over the CV folds for plotting
train_sizes_lc = train_sizes
train_mean_lc, test_mean_lc = (
    np.mean(fold_scores, axis=1) for fold_scores in (train_scores, test_scores)
)

In [None]:
# plot mean training and validation score against the training set size
fig_lc, ax = plt.subplots(figsize=(6, 4))

for curve, curve_label, curve_color in ((train_mean_lc, "train", 'red'),
                                        (test_mean_lc, "validation", 'blue')):
    ax.plot(train_sizes_lc, curve, label=curve_label, color=curve_color)

ax.set(title="Learning Curve",
       xlabel="Training Set Size",
       ylabel="F1-Score (weighted)")
ax.legend(loc="best")
fig_lc;

### Final Base Model

In [None]:
# RandomForestClassifier hyperparameters of the best model
# (taken from hyperparameter tuning on f1_weighted)
best_params = dict(
    n_estimators=182,
    max_depth=15,
    max_features='sqrt',
    min_samples_split=8,
    min_samples_leaf=2,
)

In [None]:
# fit the tuned random forest directly on the training features
# (no preprocessing pipeline) and score its predictions on the test split
best_model = RandomForestClassifier(**best_params, class_weight='balanced', random_state=42)
best_model.fit(features_train, target_train)

target_test_pred = best_model.predict(features_test)
test_f1_weighted = f1_score(target_test, target_test_pred, average='weighted')

print('f1_weighted score: \n', test_f1_weighted, '\n')

In [None]:
# re-create and re-fit the tuned random forest directly on the training
# features, then score its predictions on the validation split
best_model = RandomForestClassifier(**best_params, class_weight='balanced', random_state=42)
best_model.fit(features_train, target_train)

target_val_pred = best_model.predict(features_val)
val_f1_weighted = f1_score(target_val, target_val_pred, average='weighted')

print('f1_weighted score: \n', val_f1_weighted, '\n')

In [None]:
# save the validation classification report of the tuned model in the
# classification_reports folder to load it later for direct comparison
rfc_best_model_classification_report = classification_report(target_val, target_val_pred, output_dict=True)

# build the frame from the tuned model's own report dict; the previous code
# transposed rfc_model_classification_report (the first model's frame) instead,
# so the tuned model's metrics were never the ones written to disk
rfc_best_model_classification_report = pd.DataFrame(rfc_best_model_classification_report).transpose()

# after the transpose each row is a class / summary entry and the four columns
# are the metrics; rename them with the model suffix for the combined table
rfc_best_model_classification_report.columns = ['precision_rfc_best', 'recall_rfc_best', 'f1_score_rfc_best', 'support_rfc_best']
rfc_best_model_classification_report.to_csv('classification_reports/rfc_best_model_classification_report.csv')

In [None]:
# compare classification reports of all models, including the tuned random
# forest (skip last 2 rows of macro avg and weighted avg)
simple_baseline_report = pd.read_csv('classification_reports/simple_model_classification_report.csv', index_col=0, skiprows=[6, 7])
rfc_model_report = pd.read_csv('classification_reports/rfc_model_classification_report.csv', index_col=0, skiprows=[6, 7])
log_model_report = pd.read_csv('classification_reports/log_model_classification_report.csv', index_col=0, skiprows=[6, 7])
# file name must match the one the tuned report is saved under
# (single underscore: rfc_best_model_classification_report.csv)
rfc_best_model_report = pd.read_csv('classification_reports/rfc_best_model_classification_report.csv', index_col=0, skiprows=[6, 7])

# the tuned model's report is placed last so it sits next to the un-tuned forest
reports_combined = pd.concat([log_model_report, simple_baseline_report, rfc_model_report, rfc_best_model_report], axis=1)
pd.options.display.float_format = '{:.2f}'.format

reports_combined

In [None]:
# check feature_importances_ of the tuned final model (best_model)

# best_model is fitted directly on features_train (no preprocessing pipeline),
# so its importances line up one-to-one with the training columns; the
# earlier version of this cell inspected pipeline_rfc again and therefore
# only reproduced the plot of the un-tuned model
model = best_model
all_features = features_train.columns

# get feature importances
importances = model.feature_importances_

# combine into a DataFrame, most important feature first
feature_importances = pd.DataFrame({
    'feature': all_features,
    'importance': importances
}).sort_values(by='importance', ascending=False)

# plot top N; rows are reversed so the most important feature is drawn on top
top_n = 20
top_features = feature_importances.head(top_n).iloc[::-1]
fig, ax = plt.subplots(figsize=(10, 6))

ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel("Feature Importance")
ax.set_title(f"Top {top_n} features of final model")
plt.tight_layout()

In [None]:
# Computing learning curve of the tuned final model (could take some time)
# The estimator carries best_params so the curve describes the final model;
# without them this cell repeats the learning curve of the default forest.
train_sizes, train_scores, test_scores = learning_curve(estimator=RandomForestClassifier(class_weight='balanced', random_state=42, **best_params),
                                                        X=features_train,
                                                        y=target_train,
                                                        cv=5,
                                                        scoring='f1_weighted',
                                                        n_jobs=-1,
                                                        train_sizes=np.linspace(0.1, 1.0, 5))

# keep the sizes and the per-size mean over the 5 CV folds for plotting
train_sizes_lc = train_sizes
train_mean_lc = train_scores.mean(axis=1)
test_mean_lc = test_scores.mean(axis=1)

In [None]:
# plot mean training and validation score against the training set size
fig_lc, ax = plt.subplots(figsize=(6, 4))

for curve, curve_label, curve_color in ((train_mean_lc, "train", 'red'),
                                        (test_mean_lc, "validation", 'blue')):
    ax.plot(train_sizes_lc, curve, label=curve_label, color=curve_color)

ax.set(title="Learning Curve",
       xlabel="Training Set Size",
       ylabel="F1-Score (weighted)")
ax.legend(loc="best")
fig_lc;