# Feature Engineering (Spotify Track Data)  

### Recategorize "popularity" target column for classifier models

In [None]:
# importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import validation_curve

In [None]:
# load the raw Spotify track data from the project's data folder and preview it
df = pd.read_csv(filepath_or_buffer='data/spotify_dataset.csv')
df.head()

### Train-Test-Splits

In [None]:
# First split: keep 70 % of the rows for training, hold out the remaining 30 %
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

print(f'df_train:  {df_train.shape}')
print(f'df_test:  {df_test.shape}')

# Second split: carve the val/aim data (33 % of the held-out rows) out of df_test;
# df_test is deliberately rebound to the smaller remainder
df_test, df_val = train_test_split(df_test, test_size=0.33, random_state=42)

print(f'df_test:  {df_test.shape}')
print(f'df_val:  {df_val.shape}')

### Data cleaning

In [None]:
from src.features.clean_data_func import clean_data

# clean the training split first and preview the result
df_train_cleaned = clean_data(df_train)
display(df_train_cleaned.head())

# run the same project cleaning function on the test and val splits (in that order)
df_test_cleaned, df_val_cleaned = (clean_data(split) for split in (df_test, df_val))

In [None]:
# Categorical columns considered for later one-hot encoding
# (some may need to be dropped/changed or go through a PCA first)
CAT_COLS_NEW = [
    'artists',
    'album_name',
    'track_name',
    'explicit',
    'track_genre',
    'key',
    'mode',
    'time_signature',
]

# Re-check the cardinality of every categorical column on the cleaned training data
header = f'{"columns":<20}{"# unique values"}'
rows = [f'{col:<20}{df_train_cleaned[col].nunique()}' for col in CAT_COLS_NEW]
print(header, '-'*40, *rows, sep='\n')

### Notes for categorical columns & further feature engineering:
- feature engineer new columns out of the 3 columns with too many unique values: "artists", "album_name", "track_name"
    - Artist/Album Popularity: Derive the popularity of an artist or album from the mean popularity of its tracks, on the assumption that tracks by highly popular artists or albums tend to be more popular themselves
    - Track name length: Convert track_name to its length as a numeric value, since shorter or longer names may have an impact on popularity (e.g. a track name like "Burn" vs. "Fantasy on a long road"); this way the track_name data becomes at least somewhat usable for the model
- Try a PCA for "track_genre", since 113 unique values are too many for plain one-hot encoding; if the PCA does not help, encode the genre by its mean popularity as well

### New features: Artist & Album Popularity

In [None]:
# mean popularity of the training tracks, keyed by artist
artist_means = df_train_cleaned.groupby('artists')['popularity'].mean()
artist_popularity = artist_means.to_dict()

# NOTE(review): every training row's own popularity is part of its artist mean,
# so this feature carries target information on the training split - confirm
# that this is acceptable for the downstream models.
df_train_cleaned['artist_popularity'] = df_train_cleaned['artists'].map(artist_popularity)

df_train_cleaned['artist_popularity'].value_counts()

In [None]:
# mean popularity of the training tracks, keyed by album name
album_means = df_train_cleaned.groupby('album_name')['popularity'].mean()
album_popularity = album_means.to_dict()

# NOTE(review): every training row's own popularity is part of its album mean,
# so this feature carries target information on the training split - confirm
# that this is acceptable for the downstream models.
df_train_cleaned['album_popularity'] = df_train_cleaned['album_name'].map(album_popularity)

df_train_cleaned['album_popularity'].value_counts()

### New feature: Track name length

In [None]:
# number of characters in each track name, stored as a new numeric column
name_lengths = df_train_cleaned['track_name'].str.len()
df_train_cleaned['track_name_length'] = name_lengths

df_train_cleaned['track_name_length'].value_counts()

### PCA for track genres to reduce dimensionality

In [None]:
# share of the variance the PCA has to retain (up to 90 %; test lower values as well)
n_variance = 0.90

# turn 'track_genre' into a dense one-hot matrix
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
genre_encoded = ohe.fit_transform(df_train_cleaned[['track_genre']])

# fit the PCA on the one-hot matrix; a float n_components selects as many
# components as are needed to reach that share of explained variance
pca = PCA(n_components=n_variance)
genre_pca = pca.fit_transform(genre_encoded)

print(f"Number of PCA components: {pca.n_components_}")

# wrap the PCA output in a DataFrame aligned with the training index
pca_columns = [f'genre_pca_{i}' for i in range(pca.n_components_)]
genre_pca_df = pd.DataFrame(genre_pca, index=df_train_cleaned.index, columns=pca_columns)
genre_pca_df.head()

In [None]:
# plot the cumulative explained variance against the number of PCA components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# entry i of the cumulative sum covers the first i + 1 components, so the x axis
# has to start at 1 (plotting y alone would start at 0 and shift every point)
component_counts = np.arange(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, cumulative_variance, marker='o')
ax.set_xlabel('Number of PCA Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Scree Plot: PCA on track_genre')
ax.grid(True)
# the percent format avoids float artefacts such as 55.00000000000001 in the label
ax.axhline(y=n_variance, color='r', linestyle='--', label=f'{n_variance:.1%} threshold')
ax.legend();

The PCA for track_genre has not been very helpful. Let's try something else: use the **mean popularity for each genre** (as was done for artist and album popularity) to create a numerical feature.

In [None]:
# mean popularity of the training tracks within each genre (kept as a Series)
genre_popularity = df_train_cleaned.groupby('track_genre')['popularity'].mean()

# look up each row's genre mean and store it as a numeric feature
genre_labels = df_train_cleaned['track_genre']
df_train_cleaned['track_genre_pop'] = genre_labels.map(genre_popularity)

df_train_cleaned['track_genre_pop'].value_counts()

In [None]:
# Candidate columns to leave out of the later features dataframe
features_to_drop = [
    # ID column (presumably a unique key rather than a predictive feature)
    'track_id',
    # raw text/genre columns that the engineered features above stand in for
    'artists', 'album_name', 'track_name', 'track_genre',
    # target columns
    'popularity', 'popularity_cat',
]

## 'popularity' is still needed to fit and transform the engineered features for
## df_train, df_test and df_aim, but it must not be part of the final features
## dataframe that goes into the column transformer