This file is the first attempt at encoding the music dataset into a form more compatible with ML models.
Some of the transformation steps taken include:

Transformation steps considered include:
- One-hot encoding of all genres present
- 	too big! Try TF/IDF???

In [127]:
import pathlib
import pandas as pd
REPO_ROOT = pathlib.Path.cwd().parent

In [128]:
# importing data
music_dataset = pd.read_csv(REPO_ROOT / "data/song_data/song_dataset_with_jojos_music.csv")
music_dataset = music_dataset.dropna()

In [129]:
from sklearn.preprocessing import OneHotEncoder
# Encoding categorical variables: mode, key. time_signature
key_encoder = OneHotEncoder(sparse=False)
key_encoder.fit(music_dataset[['key']])
print(key_encoder.categories_)
mode_encoder = OneHotEncoder(sparse=False)
mode_encoder.fit(music_dataset[['mode']])
print(mode_encoder.categories_)
time_signature_encoder = OneHotEncoder(sparse=False)
time_signature_encoder.fit(music_dataset[['time_signature']])
print(time_signature_encoder.categories_)

[array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.])]
[array([0., 1.])]
[array([0., 1., 3., 4., 5.])]


In [130]:
music_dataset[key_encoder.get_feature_names_out()] = key_encoder.transform(music_dataset[['key']])
music_dataset[mode_encoder.get_feature_names_out()] = mode_encoder.transform(music_dataset[['mode']])
music_dataset[time_signature_encoder.get_feature_names_out()] = time_signature_encoder.transform(music_dataset[['time_signature']])
# And also drop the previous version
music_dataset = music_dataset.drop(columns=['key', 'mode', 'time_signature'])

In [131]:
from sklearn.preprocessing import MinMaxScaler
# Normalizing all continuous features (duration_ms, danceability, energy, loudness, speechiness, acousticness, instrumentalness, liveness, valence, tempo, artist_popularity
for column in ['duration_ms', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'artist_popularity']:
	scaler = MinMaxScaler()
	scaler.fit(music_dataset[[column]])
	music_dataset[[column]] = scaler.transform(music_dataset[[column]])

In [None]:
music_dataset.head(10)

In [132]:
def fit_scalers_and_encoders(df_in: pd.DataFrame) -> dict:
	scalers_and_encoders = {}
	# Creating onehotencoders for categorical features
	for column in ['key', 'mode', 'time_signature']:
		encoder = OneHotEncoder(sparse=False)
		encoder.fit(df_in[[column]])
		scalers_and_encoders[column] = encoder
	# Creating scalers for continuous features
	for column in ['duration_ms', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'artist_popularity']:
		scaler = MinMaxScaler()
		scaler.fit(df_in[[column]])
		scalers_and_encoders[column] = scaler
	return scalers_and_encoders

# Scalers is a dictionary mapping of column names (as strings) to scalers or encoders.
def encode_dataframe_given_scalers(df_in: pd.DataFrame, scalers: dict) -> None:
	# Encoding categorical variables
	for column in ['key', 'mode', 'time_signature']:
		df_in[scalers[column].get_feature_names_out()] = scalers[column].transform(df_in[[column]])

	# Normalizing continuous features
	for column in ['duration_ms', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'artist_popularity']:
		df_in[[column]] = scalers[column].transform(df_in[[column]])

Unnamed: 0,track_id,artist_name,track_name,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,...,key_9.0,key_10.0,key_11.0,mode_0.0,mode_1.0,time_signature_0.0,time_signature_1.0,time_signature_3.0,time_signature_4.0,time_signature_5.0
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,0.040604,0.678715,0.461,0.824174,0.148187,0.032329,1e-06,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost - Acoustic,0.026148,0.421687,0.166,0.661843,0.079067,0.927711,6e-06,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson,To Begin Again,0.037065,0.439759,0.359,0.777931,0.05772,0.210843,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Can't Help Falling In Love,0.035479,0.267068,0.0596,0.642034,0.037617,0.908635,7.1e-05,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,0.03493,0.620482,0.443,0.778751,0.054508,0.470884,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5,01MVOl9KtVTNfFiBU9I7dc,Tyrone Wells,Days I Will Remember,0.037674,0.690763,0.481,0.792277,0.108808,0.290161,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
6,6Vc5wAMmXdKIAM7WUoEb7N,A Great Big World,Say Something,0.040378,0.408635,0.147,0.792045,0.036788,0.860442,3e-06,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
7,1EzrEOXmMH3G43AXT1y7pA,Jason Mraz,I'm Yours,0.042794,0.705823,0.444,0.784168,0.043212,0.561245,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8,0IktbUcnAGrvD03AWnz3Q8,Jason Mraz,Lucky,0.033282,0.62751,0.414,0.793933,0.038238,0.295181,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
9,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,0.036132,0.443775,0.632,0.823803,0.03057,0.427711,0.00419,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Genre transformation?
music_dataset['genres'] = music_dataset['genres'].apply(lambda x: x.split(" "))
tfidf = TfidfVectorizer()
tfidf_matrix =  tfidf.fit_transform(music_dataset['genres'].apply(lambda x: " ".join(x)))
genre_df = pd.DataFrame(tfidf_matrix.toarray())
genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names()]
genre_df.reset_index(drop = True, inplace=True)
genre_df.iloc[0]



genre|150        0.0
genre|21st       0.0
genre|420        0.0
genre|432hz      0.0
genre|48g        0.0
                ... 
genre|zouglou    0.0
genre|zouk       0.0
genre|zuliana    0.0
genre|zxc        0.0
genre|zydeco     0.0
Name: 0, Length: 2146, dtype: float64

Below this line is EDA :/

In [134]:
# How many genres are there, and would one-hot encoding on these cause a state space explosion?

# A ton of cleaning :/
genres = music_dataset['genres'].dropna().values
unique_genre_pairs = {genre for genre in genres}
unique_genre_pairs = {genre_pair[1:-1] for genre_pair in unique_genre_pairs}
unique_genre_pairs = {genre_pair.replace("'", "") for genre_pair in unique_genre_pairs}
unique_genre_pairs = {tuple(genre_pair.split(",")) for genre_pair in unique_genre_pairs}
unique_genres = {genre for genres in unique_genre_pairs for genre in genres}
unique_genres = {genre.strip() for genre in unique_genres}
len(unique_genres)

TypeError: unhashable type: 'list'

In [None]:
# One-hot encoding genres would be the same as adding another ~4400 boolean columns. This is obviously not ideal as we have finite compute power.
# How else can we encode lists of categorical variables?