In [1]:
import pandas as pd
import json
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Show every column when a DataFrame is rendered (None = no column limit).
pd.set_option('display.max_columns', None)

In [2]:
## Function to extract artist names
def get_artist(text):
    """Return the artist names found in ``text`` as one comma-separated string.

    Parameters
    ----------
    text : str | list | None | float
        Normally the string representation of a list of dicts, one dict per
        artist, each carrying a ``'name'`` key. An already-parsed list of such
        dicts is accepted as-is. ``None``, NaN (what pandas yields for a
        missing cell) and empty/blank strings are treated as "no artists".

    Returns
    -------
    str
        The names joined by ``", "`` in their original order; ``''`` when
        there are no artists or the value is missing.

    Raises
    ------
    ValueError, SyntaxError
        If ``text`` is a non-empty string that is not a valid Python literal.
    KeyError
        If an artist dict has no ``'name'`` key.
    """
    if isinstance(text, str):
        if not text.strip():
            # literal_eval('') raises SyntaxError; an empty cell simply has no artists
            return ""
        # literal_eval only evaluates Python literals, never arbitrary code
        artists = ast.literal_eval(text)
    elif text is None or (isinstance(text, float) and text != text):
        # NaN is the only float that is not equal to itself -> missing value
        return ""
    else:
        # already a parsed sequence of artist dicts
        artists = text

    return ", ".join(artist['name'] for artist in artists)

# Example to test get_artist: the argument is a string holding a list of three
# artist dicts; as the last expression of the cell, the returned string of
# comma-separated names is shown as the cell output.
get_artist("[{'external_urls': {'spotify': 'https://open.spotify.com/artist/1Cs0zKBU1kc0i8ypK3B9ai'}, 'href': 'https://api.spotify.com/v1/artists/1Cs0zKBU1kc0i8ypK3B9ai', 'id': '1Cs0zKBU1kc0i8ypK3B9ai', 'name': 'David Guetta', 'type': 'artist', 'uri': 'spotify:artist:1Cs0zKBU1kc0i8ypK3B9ai'}, {'external_urls': {'spotify': 'https://open.spotify.com/artist/64M6ah0SkkRsnPGtGiRAbb'}, 'href': 'https://api.spotify.com/v1/artists/64M6ah0SkkRsnPGtGiRAbb', 'id': '64M6ah0SkkRsnPGtGiRAbb', 'name': 'Bebe Rexha', 'type': 'artist', 'uri': 'spotify:artist:64M6ah0SkkRsnPGtGiRAbb'}, {'external_urls': {'spotify': 'https://open.spotify.com/artist/1vyhD5VmyZ7KMfW5gqLgo5'}, 'href': 'https://api.spotify.com/v1/artists/1vyhD5VmyZ7KMfW5gqLgo5', 'id': '1vyhD5VmyZ7KMfW5gqLgo5', 'name': 'J Balvin', 'type': 'artist', 'uri': 'spotify:artist:1vyhD5VmyZ7KMfW5gqLgo5'}]")

'David Guetta, Bebe Rexha, J Balvin'

In [3]:
## Load and clean the data
# The whole cleaning pipeline is one chain, so re-running this cell always
# rebuilds `df` from the file on disk.
df = (
    # index_col=0 : the first CSV column becomes the index
    pd.read_csv('played_out.csv', index_col=0)
    # order rows by ascending position, then replace the index with 0..n-1
    .sort_values(by='position', ascending=True)
    .reset_index(drop=True)
    # track.type         : always the same value 'track'
    # track.is_local     : always the same value 'local'
    # track.track_number : position on the original album (not relevant for our analysis)
    .drop(columns=['track.type', 'track.is_local', 'track.track_number'])
    # derive a readable 'artist' column from the raw 'track.artists' strings
    .assign(artist=lambda frame: frame['track.artists'].apply(get_artist))
    # the raw column is no longer needed once the names are extracted
    .drop(columns=['track.artists'])
    # strip the 'track.' prefix and fix the misspelled 'accousticness' header
    .rename(columns={
        'track.duration_ms': 'duration_ms',
        'track.explicit': 'explicit',
        'track.id': 'track_id',
        'track.name': 'track_name',
        'track.popularity': 'popularity',
        'accousticness': 'acousticness',
    })
)

In [4]:
## Save the cleaned data
# index=False : do not write the row index as an extra column
df.to_csv(path_or_buf='played_out_cleaned.csv', index=False)

# (rows, columns) of the cleaned frame
print(df.shape)

(4572, 23)


Song positions 50 and 51 have only 71 and 51 entries, respectively,
which indicates that some users have recorded more than 50 songs.

In [8]:
## Dataframe restricted to the analysis columns, sorted by user and position
# (to display every row, first run: pd.set_option('display.max_rows', None))
column_order = [
    'position', 'user', 'first_genre', 'track_name', 'track_id',
    'popularity', 'explicit',
    'acousticness', 'danceability', 'energy', 'instrumentalness', 'key',
    'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence',
    'artist', 'duration_ms', 'gender', 'age',
]

# keep only the listed columns (in this order), sort rows by user, then
# position, then first_genre, and renumber the index from 0
df = (
    df.loc[:, column_order]
    .sort_values(by=['user', 'position', 'first_genre'])
    .reset_index(drop=True)
)
df



Unnamed: 0,position,user,first_genre,track_name,track_id,popularity,explicit,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,artist,duration_ms,gender,age
0,0,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,Subzero - Original Mix,7p6oXzBSPAXXz8Xb8gBPki,45,False,0.588000,0.793,0.608,0.861000,6.0,0.0991,-11.638,0.0,0.0673,124.988,0.1180,Ben Klock,383972,M,40
1,1,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,I'm Good (Blue),4uUG5RXrOk84mYEfFvj3cK,85,True,0.003830,0.561,0.965,0.000007,7.0,0.3710,-3.673,0.0,0.0343,128.040,0.3040,"David Guetta, Bebe Rexha",175238,M,40
2,2,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,I Don't Wanna Wait,331l3xABO0HMr1Kkyh2LZq,81,False,0.037500,0.681,0.714,0.000000,1.0,0.2320,-4.617,0.0,0.0309,129.976,0.5540,"David Guetta, OneRepublic",149667,M,40
3,3,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,Love Tonight (David Guetta Remix Edit),2prnn41CblB8B4yWACDljP,76,False,0.015000,0.621,0.989,0.357000,10.0,0.1840,-4.225,0.0,0.0434,126.002,0.0348,"Shouse, David Guetta",158095,M,40
4,4,0eeb2810a28f4aec9fb457dd2049c015_Benedict Bond,3,Wide Open - Len Faki DjEdit,477I4wif0etzeupmlQzTxl,40,False,0.000077,0.886,0.616,0.885000,7.0,0.0987,-7.105,1.0,0.0835,128.015,0.0952,"DJ Hyperactive, Len Faki",435571,M,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4567,47,f9a2b3c8d4e1f7a0b5c6d9e2f1a3b8c,1,WILDFLOWER,3QaPy1KgI7nu9FJEQUgn6h,93,False,0.612000,0.467,0.247,0.000271,6.0,0.1700,-12.002,0.0,0.0431,148.101,0.1260,Billie Eilish,261466,M,20
4568,48,f9a2b3c8d4e1f7a0b5c6d9e2f1a3b8c,1,Kiss Me More (feat. SZA),3DarAbFujv6eYNliUTyqtz,86,True,0.259000,0.764,0.705,0.000089,8.0,0.1200,-3.463,1.0,0.0284,110.970,0.7810,"Doja Cat, SZA",208666,M,20
4569,49,f9a2b3c8d4e1f7a0b5c6d9e2f1a3b8c,1,I Wanna Be Yours,5XeFesFbtLpXzIVDNQP22n,92,False,0.136000,0.464,0.417,0.022000,0.0,0.0974,-9.345,0.0,0.0256,67.528,0.4790,Arctic Monkeys,183956,M,20
4570,50,f9a2b3c8d4e1f7a0b5c6d9e2f1a3b8c,1,Photograph,3FSPaBp49Clqq2p4zc9dbA,61,False,,,,,,,,,,,,Ed Sheeran,258986,M,20
