### All final visualizations for presentation are tested here

In [None]:
# setup

#importing modules
import os
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from src.features.data_prep_for_model import clean_data, feature_engineer, prep_data_for_model, pipeline_classifier
from src.models.final_model import final_pipeline, get_feature_importances

# global constants
# Resolution and figure sizes shared by all plotting functions below.
DPI = 100
FIGSIZE_LARGE = (10, 5)
FIGSIZE_SMALL = (5, 5)

# Column groups used for the EDA plots only.
# track_id is intentionally not part of the categorical group.
CAT_COLS_EDA = ['artists', 'album_name', 'track_name', 'explicit', 'track_genre']

NUM_COLS_EDA = [
    'popularity', 'duration_ms', 'danceability', 'energy', 'key',
    'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'time_signature',
]

# Maps raw column / feature names to human-readable labels for plot axes.
# Names that are missing here are shown unchanged by the plotting functions.
LABEL_MAP = {
    'acousticness': 'Acousticness',
    'instrumentalness': 'Instrumentalness',
    'album_name_length': 'Length of Album Name',
    'duration_ms': 'Track Duration',
    'speechiness': 'Speechiness',
    'danceability': 'Danceability',
    'tracks_per_artist': 'Tracks per artist',
    'energy': 'Energy',
    'valence': 'Positiveness',
    'loudness': 'Track Volume',
    'artists': 'Artists',
    'album_name': 'Album Name',
    # was 'Track_name': a display label must not carry the raw underscore
    'track_name': 'Track Name',
    'track_name_length': 'Length of track name',
    'popularity': 'Popularity',
    'explicit': 'Explicit Lyrics',
    'key': 'Key (Pitch) of Track',
    'liveness': 'Liveness',
    'tempo': 'Tempo (BPM)',
    'time_signature': 'Time Sig. (Beats per bar)',
    'track_genre': 'Genre',
    'mode': 'Tonality/Mode'
}

In [None]:
# plot style

import matplotlib as mpl

# Presentation-wide matplotlib defaults, applied in a single pass.
# Figure size and DPI are not set globally here.
mpl.rcParams.update({
    # font sizes: base text, axes titles, axis labels, tick labels
    'font.size': 14,
    'axes.titlesize': 16,
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,

    # line width and marker size
    'lines.linewidth': 2,
    'lines.markersize': 8,

    # grid is switched off; colour and alpha only matter if a plot enables it
    'axes.grid': False,
    'grid.color': 'gray',
    'grid.alpha': 0.5,

    # grey borders, ticks, titles and axis labels
    'axes.edgecolor': 'gray',
    'xtick.color': 'gray',
    'ytick.color': 'gray',
    'axes.titlecolor': 'gray',
    'axes.labelcolor': 'gray',

    # white backgrounds (on screen and in saved files), bold titles
    'axes.facecolor': 'white',
    'savefig.facecolor': 'white',
    'axes.titleweight': 'bold',

    # legend: small grey text, no frame, automatic placement
    'legend.fontsize': 12,
    'legend.loc': 'best',
    'legend.frameon': False,
    'legend.framealpha': 0.8,
    'legend.labelcolor': 'gray',

    # let matplotlib adjust the layout automatically
    'figure.autolayout': True,

    # default colormap for image-like plots
    'image.cmap': 'viridis',
})

In [None]:
# plotting functions

def plot_popularity_cat_bars(data):
    '''Draws a count bar chart of the ``popularity_cat`` column.

    The bars are shown in the fixed order Unknown, Low, Medium, High.

    Args:
        data: table handed to ``sns.countplot``; it must contain a
            ``popularity_cat`` column.

    Returns:
        The matplotlib figure that holds the chart.
    '''
    category_order = ['Unknown', 'Low', 'Medium', 'High']

    # single-axes figure with the shared large size and resolution
    fig, ax = plt.subplots(figsize=FIGSIZE_LARGE, dpi=DPI)

    # one bar per category, bar height = number of rows in that category
    sns.countplot(data=data, x='popularity_cat', order=category_order, ax=ax)

    # titles and labels (the x axis is left unlabelled on purpose)
    ax.set_title('Popularity Categories', pad=20)
    ax.set_xlabel('', labelpad=15)
    ax.set_ylabel('Datapoints', labelpad=15)

    # hide the top and right border lines
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    return fig

#########################
def _popularity_correlation_table(data):
    '''Builds a table with the correlation of each numeric EDA column with popularity.

    Args:
        data: DataFrame that contains all columns listed in NUM_COLS_EDA.

    Returns:
        DataFrame with the columns ``feature`` (raw column name), ``correlation``
        (result of ``DataFrame.corr`` with its default settings) and ``label``
        (readable name from LABEL_MAP, falling back to the raw name).
        The popularity column itself is excluded.
    '''
    pop_corr = data[NUM_COLS_EDA].corr()['popularity'].drop('popularity')

    corr_df = pop_corr.reset_index()
    corr_df.columns = ['feature', 'correlation']

    # features without an entry in LABEL_MAP keep their raw name
    corr_df['label'] = corr_df['feature'].map(LABEL_MAP).fillna(corr_df['feature'])
    return corr_df


def _plot_correlation_bars(corr_df, title):
    '''Draws a horizontal barplot of ``correlation`` per ``label`` and returns the figure.'''
    fig, ax = plt.subplots(
        ncols=1,
        nrows=1,
        figsize=FIGSIZE_SMALL,
        dpi=DPI,
    )

    sns.barplot(
        data=corr_df,
        x='correlation',
        y='label',
        ax=ax
    )

    # additional configurations
    ax.set_title(title, pad=20)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    return fig


#########################
def plot_popularity_correlation_positive(data):
    '''Creates a correlation barplot of only the positive correlations (above 0)
    focused on the numeric popularity column.

    Args:
        data: DataFrame that contains all columns listed in NUM_COLS_EDA.

    Returns:
        The matplotlib figure; bars are ordered from strongest to weakest
        positive correlation.
    '''
    corr_df = _popularity_correlation_table(data)

    # get only positive correlation, strongest first
    corr_pos = corr_df[corr_df['correlation'] > 0].sort_values(by='correlation', ascending=False)

    return _plot_correlation_bars(corr_pos, 'Positive correlation with Popularity')

#########################
def plot_popularity_correlation_negative(data):
    '''Creates a correlation barplot of only the negative correlations (below 0)
    focused on the numeric popularity column.

    Args:
        data: DataFrame that contains all columns listed in NUM_COLS_EDA.

    Returns:
        The matplotlib figure; bars are ordered from strongest to weakest
        negative correlation.
    '''
    corr_df = _popularity_correlation_table(data)

    # get only negative correlation, most negative first
    corr_neg = corr_df[corr_df['correlation'] < 0].sort_values(by='correlation')

    return _plot_correlation_bars(corr_neg, 'Negative correlation with Popularity')

#########################
def plot_feature_importances_final_model(data, top_n):
    '''Plots the first ``top_n`` rows of a feature-importance table as a
    horizontal barplot.

    Args:
        data: DataFrame with the columns ``feature`` and ``importance``.
            The first ``top_n`` rows are plotted as given, so the frame is
            presumably already sorted by descending importance (verify against
            ``get_feature_importances``). The frame is not modified.
        top_n: number of rows (features) to show, e.g. 20.

    Returns:
        The matplotlib figure that holds the chart.
    '''
    # work on a copy of the rows that are plotted so the caller's frame
    # does not silently gain a 'label' column
    top_features = data.head(top_n).copy()

    # readable labels; features not in the map dict keep their old name
    top_features['label'] = (
        top_features['feature'].map(LABEL_MAP).fillna(top_features['feature'])
    )

    # create subplots area
    fig, ax = plt.subplots(
        ncols=1,
        nrows=1,
        figsize=FIGSIZE_LARGE,
        dpi=DPI,
    )

    # one solid colour for all bars; a hue mapping with a one-entry palette
    # makes seaborn cycle the palette and warn about its length
    sns.barplot(
        data=top_features,
        y='label',
        x='importance',
        color='#1f77b4',  # solid blue
        orient='h',
        ax=ax
    )

    # additional configurations
    # the title reports how many features are actually shown
    ax.set_title(f'Top {len(top_features)} Predictors of Track Popularity', pad=15)
    ax.set_xlabel('Relative Importance', labelpad=10)
    ax.set_ylabel('Feature label')
    sns.despine(ax=ax, left=True, bottom=True)

    return fig

In [None]:
# Load the raw dataset and derive the cleaned frame used by the EDA plots (plots 1-3).
# The path is relative, so the notebook must run from a directory that contains data/.
print("load data")
data = pd.read_csv('data/spotify_dataset.csv')
data_clean = clean_data(data)

In [None]:
### the following could take a while (model training)

# NOTE(review): this repeats the load/clean step of the previous cell; presumably so
# that this cell can be run on its own - confirm, otherwise the reload is redundant.
print("load data")
data = pd.read_csv('data/spotify_dataset.csv')
data_clean = clean_data(data)

print("prepare data for model")
# prepare data for model
# NOTE(review): the raw `data` (not `data_clean`) is passed here - confirm that
# prep_data_for_model performs its own cleaning.
features_train, target_train, features_test, target_test, features_val, target_val = prep_data_for_model(data)

# CAT_COLS and NUM_COLS for feature engineering / one-hot-encoding
# every training feature that is not listed as categorical is treated as numeric
CAT_COLS_FINAL = ['key', 'time_signature']
NUM_COLS_FINAL = [col for col in features_train.columns if col not in CAT_COLS_FINAL]

print("train model (could take a while)")
# use final model pipeline
pipeline_final = final_pipeline(NUM_COLS_FINAL, CAT_COLS_FINAL)
pipeline_final.fit(features_train, target_train)

print("model training completed; get feature importances of model")
# get feature importances of pipeline as dataframe
# (consumed by plot 4, which reads the columns 'feature' and 'importance')
df_feature_importances = get_feature_importances(pipeline_final)

In [None]:
# plot 1: bar chart of the popularity categories, built from the cleaned data
print("create popularity categories bar plot of cleaned data")
fig = plot_popularity_cat_bars(data_clean)
# NOTE(review): assumes the plots/ directory already exists
fig.savefig('plots/plot_distribution_of_popularity_categories.svg', bbox_inches='tight')

In [None]:
# plot 2: numeric features that correlate positively with popularity (cleaned data)
print("create positive correlation barplots focused on popularity")
fig = plot_popularity_correlation_positive(data_clean)
# NOTE(review): assumes the plots/ directory already exists
fig.savefig('plots/plot_positive_correlations_with_popularity.svg', bbox_inches='tight')

In [None]:
# plot 3: numeric features that correlate negatively with popularity (cleaned data)
print("create negative correlation barplots focused on popularity")
fig = plot_popularity_correlation_negative(data_clean)
# NOTE(review): assumes the plots/ directory already exists
fig.savefig('plots/plot_negative_correlations_with_popularity.svg', bbox_inches='tight')

In [None]:
# plot 4: the ten most important features of the final model
# (the progress message now matches the number that is actually plotted)
print("show top 10 feature importances of final model")
fig = plot_feature_importances_final_model(df_feature_importances, 10)
# NOTE(review): assumes the plots/ directory already exists
fig.savefig('plots/plot_feature_importances_final_model.svg', bbox_inches='tight')

### The following plots are only of interest for model-specific diagnostics (classification report, learning curve)

In [None]:
# classification report as heatmap

from sklearn.metrics import classification_report

# predictions of the trained pipeline on the validation split
target_val_pred = pipeline_final.predict(features_val)

def plot_classification_report(target_val, target_val_pred, labels=None):
    '''Plots precision, recall and f1-score per class as an annotated heatmap.

    Args:
        target_val: true class labels.
        target_val_pred: predicted class labels.
        labels: optional display names for the classes, forwarded to
            ``classification_report`` as ``target_names``.

    Returns:
        The matplotlib figure that holds the heatmap.
    '''
    # get report as dict
    report_dict = classification_report(target_val, target_val_pred, output_dict=True, target_names=labels)

    # transform into dataframe (one row per class / summary entry)
    report_df = pd.DataFrame(report_dict).transpose()

    # keep the per-class rows only: drop the summary rows by name instead of
    # by position, so the result does not depend on how many summary rows exist
    summary_rows = ['accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg']
    class_df = report_df.drop(index=summary_rows, errors='ignore')

    # create subplots area
    fig, ax = plt.subplots(
        ncols=1,
        nrows=1,
        figsize=FIGSIZE_SMALL,
        dpi=DPI,
    )

    # plotting
    sns.heatmap(class_df[['precision', 'recall', 'f1-score']],
                annot=True, fmt=".2f", cmap="Blues", center=0, cbar=False, ax=ax)

    ax.set_title("Classification Report", pad=10)
    ax.set_xlabel("Metric")
    ax.set_ylabel("Class")

    # hand the figure back so the caller can save it
    return fig

fig_class_rep = plot_classification_report(target_val, target_val_pred)
# optional: save the heatmap under its own file name so it does not overwrite
# the feature-importance plot
#fig_class_rep.savefig('plots/plot_classification_report.svg', bbox_inches='tight')
    

In [None]:
# classification report rendered as a table figure
report_dict = classification_report(target_val, target_val_pred, output_dict=True)
# remove the two averaged summary columns, then flip so every remaining entry is a row
report_df = pd.DataFrame(report_dict).drop(columns=['macro avg', 'weighted avg']).T

# the table is drawn on an otherwise empty axes
fig_table, ax = plt.subplots(figsize=FIGSIZE_SMALL)
ax.axis('off')

table = ax.table(
    cellText=report_df.round(2).values,  # values shown with two decimals
    rowLabels=report_df.index,
    colLabels=report_df.columns,
    loc='center',
)

# fixed font size and slightly enlarged cells
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.2)

fig_table.savefig('plots/plot_classification_report_table.svg', bbox_inches='tight')

In [None]:
# Computing learning curve (could take some time)
from sklearn.model_selection import learning_curve
import numpy as np

# weighted F1 of the final pipeline, cross-validated (cv=5) at five
# training-set fractions spaced evenly between 10% and 100%
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipeline_final,
    X=features_train,
    y=target_train,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
)

# one averaged score per training-set size (mean across the CV folds)
train_sizes_lc = train_sizes
train_mean_lc = np.mean(train_scores, axis=1)
test_mean_lc = np.mean(test_scores, axis=1)

In [None]:
# learning curve: mean training vs. mean cross-validation score per training-set size
fig_lc, ax = plt.subplots(figsize=FIGSIZE_LARGE)

ax.plot(train_sizes_lc, train_mean_lc, color='red', label="train")
ax.plot(train_sizes_lc, test_mean_lc, color='blue', label="validation")

ax.set_title("Learning Curve", pad=20)
ax.set(xlabel="Training Set Size", ylabel="F1-Score (weighted)")
ax.legend(loc="best")
fig_lc;

In [None]:
# save the learning-curve figure as SVG
# NOTE(review): assumes the plots/ directory already exists
fig_lc.savefig('plots/plot_learning_curve_model.svg', bbox_inches='tight')