# Feature Engineering (Spotify Track Data)  

### Recategorize "popularity" target column for classifier models

In [None]:
# importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import validation_curve
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw dataset from the project's data folder
df = pd.read_csv(filepath_or_buffer='data/spotify_dataset.csv')

# Preview the first five rows (5 is also the default of head())
df.head(n=5)

### Train-Test-Splits

In [None]:
# First split: 70 % of the rows go to training, 30 % are held out
df_train, df_holdout = train_test_split(df, random_state=42, test_size=0.3)

print('df_train: ', df_train.shape)
print('df_test: ', df_holdout.shape)

# Second split of the held-out rows for val/aim data:
# 67 % stay in the test set, 33 % become the validation set
df_test, df_val = train_test_split(df_holdout, random_state=42, test_size=0.33)

print('df_test: ', df_test.shape)
print('df_val: ', df_val.shape)

### Data cleaning

In [None]:
from src.features.clean_data_func import clean_data

# Clean the training split first and preview the result
df_train_cleaned = clean_data(df_train)
display(df_train_cleaned.head())

# Run the same cleaning function on the test split and then on the val split
df_test_cleaned, df_val_cleaned = [clean_data(split) for split in (df_test, df_val)]

In [None]:
# Categorical columns considered for later one-hot encoding
# (some may need to be dropped/changed or go through a PCA first)
CAT_COLS_NEW = [
    'artists', 'album_name', 'track_name', 'explicit',
    'track_genre', 'key', 'mode', 'time_signature',
]

# Report the number of distinct values per categorical column in the cleaned training data
print(f'{"columns":<20}{"# unique values"}')
print('-' * 40)
print('\n'.join(
    f'{col:<20}{df_train_cleaned[col].nunique()}'
    for col in CAT_COLS_NEW
))

### Notes for categorical columns & further feature engineering:
- feature engineer new columns out of the 3 columns with too many unique values: "artists", "album_name", "track_name"
    - Get tracks per artist by grouping artist and track_id; many tracks where an artist contributed could mean that artist has a higher exposure and could lead to higher popularity
    - Convert track_name length to numeric since shorter or longer names may have an impact on popularity (track name like "Burn" vs. "Fantasy on a long road", etc.) and this way track_name data is somewhat usable for the model
    - Convert album_name length to numeric as well
    - Instead of tracks per genre, which led to overfitting when testing, try grouping the genres into categories of more common and less common groups and then one hot encode?

### New features

In [None]:
# For every row, count the non-null track_ids that share the row's artist
# (i.e. how often that artist appears in the training data)
df_train_cleaned['tracks_per_artist'] = (
    df_train_cleaned
    .groupby(by='artists')['track_id']
    .transform('count')
)
print(df_train_cleaned['tracks_per_artist'], '\n')

# Abandoned experiment - tracks per album (less useful, and it led to overfitting in the model)
#df_train_cleaned['tracks_per_album'] = df_train_cleaned.groupby('album_name')['track_id'].transform('count')
#print(df_train_cleaned['tracks_per_album'], '\n')

# Abandoned experiment - tracks per genre (it led to overfitting in the model)
#df_train_cleaned['tracks_per_genre'] = df_train_cleaned.groupby('track_genre')['track_id'].transform('count')
#print(df_train_cleaned['tracks_per_genre'])

In [None]:
# Reduce the number of genre categories for later one-hot encoding:
# genres with at least `cutoff` tracks in the training data keep their own label,
# every rarer genre is merged into one big 'Other' group.
# Try different cutoffs (the median genre count is around 589); if the result still has
# too many unique genres for one-hot encoding, try a higher cutoff (but lose data variance).
# NOTE(review): the earlier remark "cutoff at 650 leads to 33 nunique genres" was recorded
# for 650, not for the 589 used here - compare with the count printed below.
cutoff = 589
genre_counts = df_train_cleaned['track_genre'].value_counts()
common_genres = genre_counts[genre_counts >= cutoff].index

# Keep the original genre where it is common, otherwise replace it with 'Other'
df_train_cleaned['track_genre_grouped'] = df_train_cleaned['track_genre'].where(
    df_train_cleaned['track_genre'].isin(common_genres), 'Other'
)

# Only the last expression of a cell is displayed, so the category count is printed explicitly
print('unique grouped genres:', df_train_cleaned['track_genre_grouped'].nunique())
df_train_cleaned['track_genre_grouped'].value_counts()


In [None]:
# Numeric features: character length of the track name and of the album name
df_train_cleaned['track_name_length'] = df_train_cleaned['track_name'].str.len()
df_train_cleaned['album_name_length'] = df_train_cleaned['album_name'].str.len()

# Show both new columns (track name length first, separated by a blank line)
print(df_train_cleaned['track_name_length'], '\n')
print(df_train_cleaned['album_name_length'])

### PCA test for track genres to reduce dimensionality (optional, not helpful)

In [None]:
# Expand 'track_genre' into a dense one-hot matrix (one indicator column per genre)
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
genre_encoded = ohe.fit_transform(df_train_cleaned[['track_genre']])

# Share of the variance the PCA should keep (90 % here; also test with lower values)
n_variance = 0.90

# Fit the PCA on the one-hot matrix and project the genres onto the components
pca = PCA(n_components=n_variance)
genre_pca = pca.fit_transform(genre_encoded)

print(f"Number of PCA components: {pca.n_components_}")

# Wrap the projected values in a DataFrame aligned with the training rows
genre_pca_df = pd.DataFrame(
    data=genre_pca,
    index=df_train_cleaned.index,
    columns=[f'genre_pca_{i}' for i in range(genre_pca.shape[1])],
)
genre_pca_df.head()

In [None]:
# Plot the cumulative explained variance ratio of the track_genre PCA
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Pass explicit x values 1..n: with y alone the curve would start at x=0,
# although the first point is the variance explained by ONE component
component_counts = np.arange(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, cumulative_variance, marker='o')
ax.set_xlabel('Number of PCA Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Scree Plot: PCA on track_genre')
ax.grid(True)
# Horizontal reference line at the variance share requested from the PCA
ax.axhline(y=n_variance, color='r', linestyle='--', label=f'{n_variance*100}% threshold')
ax.legend();

The PCA on track_genre was not very helpful, so it will not be used.