# Feature Engineering (Spotify Track Data)  

### Recategorize "popularity" target column for classifier models

In [None]:
# importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import validation_curve

In [None]:
# load the cleaned Spotify track data; the first CSV column is used as the row index
df = pd.read_csv(
    'data/spotify_cleaned.csv',
    index_col=0,
)
df.head()

### Before train-test-split: 
Popularity, the target column, needs to be **recategorized** for classification models:
- "New" = values of -1 or 0; represents new songs or songs that, for some reason, have not been played on Spotify yet
- "Low", "Medium", "High" = the remaining range is cut into thirds (upper bounds 33, 66 and 100), representing low, medium and high popularity

In [None]:
# Bin the popularity score into ordered categories with pd.cut, using custom bin edges:
#   'New'    -> [-1, 0]
#   'Low'    -> (0, 33]
#   'Medium' -> (33, 66]
#   'High'   -> (66, 100]
# include_lowest=True closes the first interval on the left so that -1 is labelled 'New';
# with the default left-open first bin (-1, 0], a popularity of -1 would silently become NaN.

df['popularity_cat'] = pd.cut(df['popularity'],
                              bins=[-1, 0, 33, 66, 100],
                              labels=['New', 'Low', 'Medium', 'High'],
                              include_lowest=True)

print(df['popularity_cat'].value_counts())
print(df.columns)


The categorization is unbalanced: "Low" and "Medium" dominate the popularity values (as already seen in the EDA), but
balanced class weights in the classification model should compensate for that. The categorization shouldn't become too complex, so we will work with it for now.

In [None]:
# categorical columns that are candidates for one-hot encoding later on
# (some may need to be dropped or go through a PCA first)
CAT_COLS_NEW = [
    'artists', 'album_name', 'track_name',
    'explicit', 'track_genre',
    'key', 'mode', 'time_signature',
]

# number of distinct values per categorical column, in the order listed above
unique_counts = {col: df[col].nunique() for col in CAT_COLS_NEW}

# print the counts as a small two-column table
print(f'{"columns":<20}{"# unique values"}')
print('-'*40)
for col, n_unique in unique_counts.items():
    print(f'{col:<20}{n_unique}')

### Notes for categorical columns & further feature engineering:
- feature engineer new columns out of the 3 columns with too many unique values: "artists", "album_name", "track_name"
    - Artist/Album Popularity: The popularity of an artist or album is derived from track popularity, which means tracks by highly popular artists or albums will tend to have higher popularity
    - Convert track_name length to numeric since shorter or longer names can have an impact on popularity (track name like "Burn" vs. "Fantasy on a long road", etc.) and this way track_name data is somewhat usable for the model
- use a PCA for "track_genre", since 113 values are too many for plain one-hot encoding

### Train-Test-Splits

In [None]:
# first split: 70 % of the rows go to df_train, the remaining 30 % are held out in df_test
# (fixed random_state so the split is reproducible)
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

# show the (rows, columns) of both parts
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
# second split for val/aim data: 33 % of the held-out rows become df_aim, the rest stays in df_test
# NOTE: df_test is overwritten with its own subset, so re-running this cell alone shrinks it again
df_test, df_aim = train_test_split(df_test, test_size=0.33, random_state=42)

# show the (rows, columns) of both parts
print(df_test.shape, df_aim.shape, sep='\n')

### New features: Artist & Album Popularity

In [None]:
# mean track popularity per artist, computed on the training rows only
# NOTE(review): the mean includes each row's own popularity, so for an artist with a single
# training track this feature is identical to the target value
mean_popularity_by_artist = df_train.groupby('artists')['popularity'].mean()
artist_popularity = mean_popularity_by_artist.to_dict()

# look up every training row's artist in the mapping
df_train['artist_popularity'] = df_train['artists'].map(artist_popularity)

df_train['artist_popularity'].value_counts()

In [None]:
# use mean track popularity per album (computed on the training rows only)
# the mapping gets its own name so that the per-artist dict `artist_popularity` is not overwritten
album_popularity = df_train.groupby('album_name')['popularity'].mean().to_dict()
df_train['album_popularity'] = df_train['album_name'].map(album_popularity)

df_train['album_popularity'].value_counts()

### New feature: Track name length

In [None]:
# new numeric column: number of characters in each track name
track_names = df_train['track_name']
df_train['track_name_length'] = track_names.str.len()

df_train['track_name_length'].value_counts()

### PCA for track genres to reduce dimensionality

In [None]:
# step 1: one-hot encode 'track_genre' into a dense matrix
# (categories not seen during fit are encoded as all zeros)
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
genre_encoded = ohe.fit_transform(df_train[['track_genre']])

# step 2: PCA on the one-hot matrix
# a float n_components keeps as many components as needed to explain that share of the variance
n_variance = 0.90 # Keep up to 90% of the variance, test also with lower values
pca = PCA(n_components=n_variance)
genre_pca = pca.fit_transform(genre_encoded)

print(f"Number of PCA components: {pca.n_components_}")

# step 3: wrap the PCA scores in a DataFrame aligned with df_train's index
pca_column_names = [f'genre_pca_{i}' for i in range(pca.n_components_)]
genre_pca_df = pd.DataFrame(
    genre_pca,
    index=df_train.index,
    columns=pca_column_names,
)
genre_pca_df.head()

In [None]:
# plot the cumulative explained variance ratio against the number of components kept
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# component counts start at 1: the first cumulative value belongs to one component, not zero,
# so the x values must not be the default 0-based index
component_counts = np.arange(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, cumulative_variance, marker='o')
ax.set_xlabel('Number of PCA Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Scree Plot: PCA on track_genre')
ax.grid(True)
# horizontal reference line at the variance share requested from the PCA
ax.axhline(y=n_variance, color='r', linestyle='--', label=f'{n_variance*100}% threshold')
ax.legend();

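The PCA for track_genre has not been very helpful: the one-hot columns of a single categorical feature are almost uncorrelated, so the PCA needs nearly as many components as there are genres. Let's try something else: use the mean popularity of each genre (as done for album popularity, for example) to create a numerical feature.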

In [None]:
# mean track popularity per genre, computed on the training rows only
popularity_by_genre = df_train.groupby('track_genre')['popularity']
genre_popularity = popularity_by_genre.mean()

# look up every training row's genre in the mapping
df_train['track_genre_pop'] = df_train['track_genre'].map(genre_popularity)

df_train['track_genre_pop'].value_counts()

In [None]:
#

-----------------------------------------------------------------
Creating classes / functions for the engineered features so that they can later be applied to df_test and the validation data as well:

### to do: class creation for later pipeline (maybe in a bigger class even with subclasses?)
from sklearn.base import BaseEstimator, TransformerMixin

class ArtistPopularityTransformer(BaseEstimator, TransformerMixin):
    """Add an ``artist_popularity`` column holding the mean popularity per artist.

    The per-artist means are learned in ``fit`` and only looked up in ``transform``,
    so data passed to ``transform`` alone never contributes to the statistics.

    Attributes set by ``fit``:
        artist_pop_map_: Series indexed by artist with the mean ``popularity``.
        global_popularity_: mean ``popularity`` over all rows passed to ``fit``;
            used as fallback for artists that were not present during ``fit``.
    """

    def fit(self, X, y=None):
        """Learn the mean popularity per artist from ``X``.

        Args:
            X: DataFrame with the columns ``artists`` and ``popularity``.
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        self.artist_pop_map_ = X.groupby('artists')['popularity'].mean()
        # fallback for artists that do not occur in the fitted data
        self.global_popularity_ = X['popularity'].mean()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``artist_popularity`` column added.

        Args:
            X: DataFrame with an ``artists`` column.

        Returns:
            A new DataFrame; the input is not modified.
        """
        X = X.copy()
        artist_popularity = X['artists'].map(self.artist_pop_map_)
        # artists missing from the learned mapping would otherwise end up as NaN
        X['artist_popularity'] = artist_popularity.fillna(self.global_popularity_)
        return X

# learn the artist statistics on the training data only ...
transformer = ArtistPopularityTransformer().fit(df_train)

# ... then apply the fitted transformer to train and test without re-fitting
df_train = transformer.transform(df_train)
df_test = transformer.transform(df_test)

### or just as functions (first fit, then transform):
def fit_artist_popularity(df_train):
    """Compute the mean ``popularity`` of every artist in ``df_train``.

    Args:
        df_train: DataFrame with the columns ``artists`` and ``popularity``.

    Returns:
        Series indexed by artist whose values are the mean popularity.
    """
    popularity_by_artist = df_train.groupby('artists')['popularity']
    return popularity_by_artist.mean()

def add_artist_popularity_feature(df, artist_popularity_map, fill_value=None):
    """Add an ``artist_popularity`` column using an already fitted mapping.

    Args:
        df: DataFrame with an ``artists`` column.
        artist_popularity_map: mapping (Series or dict) from artist to mean popularity.
        fill_value: optional value for artists that are missing from the mapping.
            With the default ``None`` such rows stay NaN.

    Returns:
        A copy of ``df`` with the new column; the input is not modified.
    """
    df = df.copy()
    artist_popularity = df['artists'].map(artist_popularity_map)
    if fill_value is not None:
        # artists missing from the mapping would otherwise remain NaN
        artist_popularity = artist_popularity.fillna(fill_value)
    df['artist_popularity'] = artist_popularity
    return df

# Fit on train only
artist_popularity_map = fit_artist_popularity(df_train)

# Apply to both train and test (without re-fitting)
df_train = add_artist_popularity_feature(df_train, artist_popularity_map)
df_test = add_artist_popularity_feature(df_test, artist_popularity_map)
