In [1]:
import pandas as pd
import json
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

pd.options.display.max_columns = None

In [2]:
## Function to extract artist names
# text: str - Python-literal string holding a list of artist dicts (one dict per artist)
def get_artist(text):
    """Return the artists' names from `text` joined into one comma-separated string.

    `text` is parsed with ast.literal_eval and must evaluate to an iterable of
    dicts that each carry a 'name' key; names keep their original order.
    """
    parsed_artists = ast.literal_eval(text)
    names = [entry['name'] for entry in parsed_artists]
    return ", ".join(names)

# Sanity check on a three-artist record in the raw string format passed to get_artist;
# the expected result is 'David Guetta, Bebe Rexha, J Balvin'.
get_artist("[{'external_urls': {'spotify': 'https://open.spotify.com/artist/1Cs0zKBU1kc0i8ypK3B9ai'}, 'href': 'https://api.spotify.com/v1/artists/1Cs0zKBU1kc0i8ypK3B9ai', 'id': '1Cs0zKBU1kc0i8ypK3B9ai', 'name': 'David Guetta', 'type': 'artist', 'uri': 'spotify:artist:1Cs0zKBU1kc0i8ypK3B9ai'}, {'external_urls': {'spotify': 'https://open.spotify.com/artist/64M6ah0SkkRsnPGtGiRAbb'}, 'href': 'https://api.spotify.com/v1/artists/64M6ah0SkkRsnPGtGiRAbb', 'id': '64M6ah0SkkRsnPGtGiRAbb', 'name': 'Bebe Rexha', 'type': 'artist', 'uri': 'spotify:artist:64M6ah0SkkRsnPGtGiRAbb'}, {'external_urls': {'spotify': 'https://open.spotify.com/artist/1vyhD5VmyZ7KMfW5gqLgo5'}, 'href': 'https://api.spotify.com/v1/artists/1vyhD5VmyZ7KMfW5gqLgo5', 'id': '1vyhD5VmyZ7KMfW5gqLgo5', 'name': 'J Balvin', 'type': 'artist', 'uri': 'spotify:artist:1vyhD5VmyZ7KMfW5gqLgo5'}]")

'David Guetta, Bebe Rexha, J Balvin'

In [3]:
## Load and clean the data
# One pipeline: read the CSV (its first column is the index), order the rows by
# ascending 'position' and renumber them, drop the columns the analysis does not
# use, build a readable 'artist' column from 'track.artists', then shorten names.
# Dropped columns, per the author's notes: 'track.type' and 'track.is_local' each
# hold a single constant value; 'track.track_number' is the track's position on
# its original album (not relevant for this analysis).
df = (
    pd.read_csv('played_out.csv', index_col=0)
    .sort_values(by='position', ascending=True)
    .reset_index(drop=True)
    .drop(columns=['track.type', 'track.is_local', 'track.track_number'])
    # artist names are extracted before the raw 'track.artists' column is removed
    .assign(artist=lambda frame: frame['track.artists'].apply(get_artist))
    .drop(columns=['track.artists'])
    .rename(columns={
        'track.duration_ms': 'duration_ms',
        'track.explicit': 'explicit',
        'track.id': 'track_id',
        'track.name': 'track_name',
        'track.popularity': 'popularity',
        # maps the 'accousticness' spelling to 'acousticness' when that column exists
        'accousticness': 'acousticness',
    })
)

In [4]:
# Persist the cleaned frame without its index, then report the number of distinct track ids.
df.to_csv('played_out_cleaned.csv', index=False)

print(df['track_id'].nunique())

682


In [9]:
# Standardise the continuous audio features, then build the per-genre lookup
# tables used to fill missing values: the median for continuous features (the
# within-genre distributions are skewed, so the median is preferred over the
# mean) and the most frequent value for categorical features.

# StandardScaler's public location is sklearn.preprocessing; importing it from
# sklearn.discriminant_analysis only worked through an internal re-export.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

continuous_features = ['popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness',
                       'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
# StandardScaler disregards NaNs when fitting and keeps them as NaN in the
# output, so missing values survive this step and are filled later.
df[continuous_features] = scaler.fit_transform(df[continuous_features])

# Genre-wise median of each continuous feature; the result is indexed by 'first_genre'.
imputation_values = df.groupby('first_genre')[continuous_features].median()


# Categorical imputation (mode per genre)
categorical_features = ['key', 'mode']


def _most_frequent(values):
    """Return the most frequent non-null value of `values`, or NaN if there is none."""
    modes = values.mode()  # NaNs are excluded; empty when the group has no data
    return modes.iloc[0] if not modes.empty else np.nan


# Genre-wise most frequent value of each categorical feature, indexed by 'first_genre'.
cat_imputation = df.groupby('first_genre')[categorical_features].agg(_most_frequent)

In [18]:
# Fill missing values genre by genre: every feature is patched with the value
# stored for the row's 'first_genre' in the matching lookup table (medians for
# the continuous features, modes for the categorical ones).
for feature_list, genre_table in (
    (continuous_features, imputation_values),
    (categorical_features, cat_imputation),
):
    for feature in feature_list:
        # features without a column in the lookup table are left untouched
        if feature not in genre_table.columns:
            continue
        df[feature] = df[feature].fillna(df['first_genre'].map(genre_table[feature]))

df.head()


Unnamed: 0,position,user,first_genre,track_name,track_id,artist,duration_ms,gender,age,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,key,mode
0,0,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,Subzero - Original Mix,7p6oXzBSPAXXz8Xb8gBPki,Ben Klock,383972,M,40,-1.878208,0.763834,1.102906,0.096791,1.960773,-0.523824,-0.328856,-0.086877,0.078965,-1.36501,6.0,0.0
1,1,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,I'm Good (Blue),4uUG5RXrOk84mYEfFvj3cK,"David Guetta, Bebe Rexha",175238,M,40,0.696076,-0.84525,-0.141773,1.439137,-0.515218,1.598218,0.687247,-0.611993,0.185342,-0.592371,7.0,0.0
2,2,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,I Don't Wanna Wait,331l3xABO0HMr1Kkyh2LZq,"David Guetta, OneRepublic",149667,M,40,0.438647,-0.752507,0.502027,0.495359,-0.515238,0.513393,0.56682,-0.666096,0.25282,0.446122,1.0,0.0
3,3,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,Love Tonight (David Guetta Remix Edit),2prnn41CblB8B4yWACDljP,"Shouse, David Guetta",158095,M,40,0.116862,-0.814482,0.180127,1.529379,0.511401,0.138778,0.616828,-0.467188,0.114308,-1.71062,10.0,0.0
4,4,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,Wide Open - Len Faki DjEdit,477I4wif0etzeupmlQzTxl,"DJ Hyperactive, Len Faki",435571,M,40,-2.199993,-0.855586,1.601851,0.126872,2.02979,-0.526945,0.249424,0.170907,0.18447,-1.45972,7.0,1.0


In [21]:
# Human-readable labels for the numeric genre codes stored in 'first_genre'.
# NOTE: this binds genre_mapping to a dict, so the cell has to run after any
# earlier cell in which genre_mapping was a list.
genre_mapping = dict(enumerate(
    (
        'Pop & Mainstream',
        'Urban & Contemporary',
        'Electronic & Beat-Based',
        'Rock & Heavy',
        'Roots, Jazz & Classical Traditions',
    ),
    start=1,  # genre codes are 1-based: 1 -> first label, ..., 5 -> last label
))

# Codes that are absent from the mapping end up as NaN in 'genre_label'.
df['genre_label'] = df['first_genre'].map(genre_mapping)

print(df[['first_genre', 'genre_label']])

      first_genre              genre_label
0               3  Electronic & Beat-Based
1               3  Electronic & Beat-Based
2               3  Electronic & Beat-Based
3               3  Electronic & Beat-Based
4               3  Electronic & Beat-Based
...           ...                      ...
4567            1         Pop & Mainstream
4568            1         Pop & Mainstream
4569            1         Pop & Mainstream
4570            1         Pop & Mainstream
4571            1         Pop & Mainstream

[4572 rows x 2 columns]


In [22]:
# Final column layout as one flat list: descriptive columns first, then the
# continuous features, then the categorical ones.
base_columns = [
    'position', 'user', 'first_genre', 'genre_label', 'track_name',
    'track_id', 'artist', 'duration_ms', 'gender', 'age',
]
all_columns = [*base_columns, *continuous_features, *categorical_features]

# Keep only those columns, order the rows by user, then position, then genre,
# and renumber the index.
df = df[all_columns]
df = df.sort_values(by=['user', 'position', 'first_genre']).reset_index(drop=True)

# Persist the imputed frame without its index.
df.to_csv('played_out_imputed.csv', index=False)


In [23]:
# Preview the first rows of the frame after column selection and sorting.
df.head()

Unnamed: 0,position,user,first_genre,genre_label,track_name,track_id,artist,duration_ms,gender,age,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,key,mode
0,0,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,Electronic & Beat-Based,Subzero - Original Mix,7p6oXzBSPAXXz8Xb8gBPki,Ben Klock,383972,M,40,-1.878208,0.763834,1.102906,0.096791,1.960773,-0.523824,-0.328856,-0.086877,0.078965,-1.36501,6.0,0.0
1,1,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,Electronic & Beat-Based,I'm Good (Blue),4uUG5RXrOk84mYEfFvj3cK,"David Guetta, Bebe Rexha",175238,M,40,0.696076,-0.84525,-0.141773,1.439137,-0.515218,1.598218,0.687247,-0.611993,0.185342,-0.592371,7.0,0.0
2,2,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,Electronic & Beat-Based,I Don't Wanna Wait,331l3xABO0HMr1Kkyh2LZq,"David Guetta, OneRepublic",149667,M,40,0.438647,-0.752507,0.502027,0.495359,-0.515238,0.513393,0.56682,-0.666096,0.25282,0.446122,1.0,0.0
3,3,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,Electronic & Beat-Based,Love Tonight (David Guetta Remix Edit),2prnn41CblB8B4yWACDljP,"Shouse, David Guetta",158095,M,40,0.116862,-0.814482,0.180127,1.529379,0.511401,0.138778,0.616828,-0.467188,0.114308,-1.71062,10.0,0.0
4,4,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,Electronic & Beat-Based,Wide Open - Len Faki DjEdit,477I4wif0etzeupmlQzTxl,"DJ Hyperactive, Len Faki",435571,M,40,-2.199993,-0.855586,1.601851,0.126872,2.02979,-0.526945,0.249424,0.170907,0.18447,-1.45972,7.0,1.0
