In [1]:
import polars as pl
import numpy as np
import pickle
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [3]:
# Load the processed song table; it already carries the `mood_label` column
# that is encoded further down in this notebook.
df = pl.read_parquet(
    "../data/processed/songs_with_mood.parquet",
)

In [4]:
# Continuous audio-feature columns that are min-max scaled in the next step.
# Order matters: column i of the scaled matrix corresponds to entry i here.
dense_features = [
    "danceability",
    "energy",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]

### Normalize Dense Features

In [5]:
# Pull the dense feature columns out as a NumPy matrix (rows = songs,
# columns in `dense_features` order), then fit the scaler and transform the
# same data in one call. MinMaxScaler's default feature_range is (0, 1).
dense_matrix = df.select(dense_features).to_numpy()
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(dense_matrix)


In [6]:
# Build a new frame in which every dense feature column is replaced by its
# scaled values; columns not listed in `dense_features` are carried over as-is.
# Transposing lets us walk the scaled matrix one feature column at a time.
scaled_columns = [
    pl.Series(feature_name, feature_values)
    for feature_name, feature_values in zip(dense_features, scaled_data.T)
]
df_scaled = df.with_columns(scaled_columns)

### Encode Genres

In [7]:
# Turn the genre strings into integer ids. The fitted encoder is kept in `le`
# so ids can be mapped back to genre names through `le.classes_`.
genres = df["genre"].to_numpy()
le = LabelEncoder()
genre_ids = le.fit_transform(genres)

# Attach the ids to the scaled frame as a new `genre_id` column.
genre_id_column = pl.Series("genre_id", genre_ids)
df_final = df_scaled.with_columns(genre_id_column)

# Quick sanity output: number of distinct genres seen by the encoder.
print(f"Total Genres: {len(le.classes_)}")
print("Data scaled and encoded. Ready for PyTorch.")

Total Genres: 82
Data scaled and encoded. Ready for PyTorch.


### Save Processed Data and Fitted Preprocessors

In [10]:
# Persist the scaled + genre-encoded table.
df_final.write_parquet("../data/processed/songs_ready_for_model.parquet")

# Pickle the fitted preprocessors next to the data so the same scaling and
# genre-id mapping can be re-applied later.
for pickle_path, fitted_obj in (
    ("../data/processed/feature_scaler.pkl", scaler),
    ("../data/processed/label_encoder.pkl", le),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(fitted_obj, f)

In [12]:
# Reload the model-ready table that was written out above.
# NOTE(review): this rebinds `df`, which until now held the unscaled input
# frame. Re-running the scaling cell after this one would fit the scaler on
# already-scaled values; run the notebook top to bottom.
df = pl.read_parquet(
    "../data/processed/songs_ready_for_model.parquet",
)

In [13]:
# Integer-encode the mood labels with a dedicated encoder, kept separate from
# the genre encoder so each mapping can be inverted on its own.
mood_labels = df["mood_label"].to_numpy()
mood_encoder = LabelEncoder()
mood_ids = mood_encoder.fit_transform(mood_labels)

In [None]:
# Attach the encoded moods as a new `mood_id` column.
# NOTE(review): this rebinds `df_final`, replacing the genre-encoded frame
# created earlier in the notebook.
mood_id_column = pl.Series("mood_id", mood_ids)
df_final = df.with_columns(mood_id_column)

In [19]:
# Write the final table, which now includes the `mood_id` column.
df_final.write_parquet(
    "../data/processed/songs_final.parquet",
)

In [20]:
# Pickle the fitted mood encoder so `mood_id` values can be decoded later.
mood_encoder_path = "../data/processed/mood_encoder.pkl"
with open(mood_encoder_path, "wb") as encoder_file:
    pickle.dump(mood_encoder, encoder_file)

# Show the label set; a label's position in `classes_` is its `mood_id`.
print("Updated data with 'mood_id'. Classes:", mood_encoder.classes_)

Updated data with 'mood_id'. Classes: ['Angry' 'Calm' 'Happy' 'Sad']
