In [1]:
import pandas as pd
import json
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

pd.options.display.max_columns = None

In [2]:
## Function to extract artist names
def get_artist(text):
    """Return the artist names of a track as one comma-separated string.

    Parameters
    ----------
    text : str or list of dict
        Either the string representation of a list of dicts (one dict per
        artist, each with a 'name' key), or that list already parsed.
        A missing value (None or float NaN, which is what pandas yields for
        an empty CSV cell) is accepted as well.

    Returns
    -------
    str
        The artists' names joined with ", " in their original order;
        an empty string when the value is missing or the list is empty.

    Raises
    ------
    ValueError, SyntaxError
        If `text` is a string that is not a valid Python literal.
    KeyError
        If one of the artist dicts has no 'name' key.
    """
    # NaN is the only float that is not equal to itself; checked by hand so
    # the function does not depend on pandas/numpy being available.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # literal_eval only evaluates Python literals, so it is safe on raw CSV text;
    # an already-parsed list is used as is.
    artists = ast.literal_eval(text) if isinstance(text, str) else text

    return ", ".join(artist['name'] for artist in artists)

In [3]:
# Example to test get_artist: the input is the string form of a list with
# three artist dicts, so the cell should display their three 'name' values
# joined by ", ".
get_artist("[{'external_urls': {'spotify': 'https://open.spotify.com/artist/1Cs0zKBU1kc0i8ypK3B9ai'}, 'href': 'https://api.spotify.com/v1/artists/1Cs0zKBU1kc0i8ypK3B9ai', 'id': '1Cs0zKBU1kc0i8ypK3B9ai', 'name': 'David Guetta', 'type': 'artist', 'uri': 'spotify:artist:1Cs0zKBU1kc0i8ypK3B9ai'}, {'external_urls': {'spotify': 'https://open.spotify.com/artist/64M6ah0SkkRsnPGtGiRAbb'}, 'href': 'https://api.spotify.com/v1/artists/64M6ah0SkkRsnPGtGiRAbb', 'id': '64M6ah0SkkRsnPGtGiRAbb', 'name': 'Bebe Rexha', 'type': 'artist', 'uri': 'spotify:artist:64M6ah0SkkRsnPGtGiRAbb'}, {'external_urls': {'spotify': 'https://open.spotify.com/artist/1vyhD5VmyZ7KMfW5gqLgo5'}, 'href': 'https://api.spotify.com/v1/artists/1vyhD5VmyZ7KMfW5gqLgo5', 'id': '1vyhD5VmyZ7KMfW5gqLgo5', 'name': 'J Balvin', 'type': 'artist', 'uri': 'spotify:artist:1vyhD5VmyZ7KMfW5gqLgo5'}]")

'David Guetta, Bebe Rexha, J Balvin'

In [4]:
## Load and clean the data
# The whole cleaning pipeline is a single chain, so `df` is built from the
# raw file in one pass and the cell can be re-run safely.
df = (
    # the first CSV column holds the old index
    pd.read_csv('played_out.csv', index_col=0)
    # order rows by position (ascending) and renumber them from 0
    .sort_values(by='position', ascending=True)
    .reset_index(drop=True)
    # columns not needed for the analysis:
    #   track.type         - always the same value 'track'
    #   track.is_local     - noted by the author as constant across the file
    #   track.track_number - position on the original album (not relevant here)
    .drop(columns=['track.type', 'track.is_local', 'track.track_number'])
    # readable artist names extracted from the raw 'track.artists' strings
    .assign(artist=lambda frame: frame['track.artists'].apply(get_artist))
    # the raw column is no longer needed once 'artist' exists
    .drop(columns=['track.artists'])
    # strip the 'track.' prefix from the remaining track fields
    .rename(columns={
        'track.duration_ms': 'duration_ms',
        'track.explicit': 'explicit',
        'track.id': 'track_id',
        'track.name': 'track_name',
        'track.popularity': 'popularity',
    })
)

# persist the cleaned table, then preview its first rows
df.to_csv('played_out_cleaned.csv')
df.head()

Unnamed: 0,position,played_at,duration_ms,explicit,track_id,track_name,popularity,first_genre,user,gender,age,accousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,artist
0,0,2025-10-1T01:00:00:000Z,383972,False,7p6oXzBSPAXXz8Xb8gBPki,Subzero - Original Mix,45,3,19befa2d20a544b3819a1236cf7536c4_Harsh1,F,20,0.588,0.793,0.608,0.861,6.0,0.0991,-11.638,0.0,0.0673,124.988,0.118,Ben Klock
1,0,2025-10-1T01:00:00:000Z,254200,False,3ENHpbTuY72FukZbwGP6bc,Tear Away,67,4,24bebb12512f476a8e08b14a1aaa08ea_Hannah,F,40,0.00055,0.611,0.648,0.0028,0.0,0.0846,-4.579,1.0,0.0441,132.983,0.457,Drowning Pool
2,0,2025-10-1T01:00:00:000Z,254466,False,6nz35DNIzbtj5ztpDEcW1j,"Kick, Push",64,2,6016a0c171f34d44bc5840e0f521a034_D,F,60,,,,,,,,,,,,Lupe Fiasco
3,0,2025-10-1T01:00:00:000Z,383972,False,7p6oXzBSPAXXz8Xb8gBPki,Subzero - Original Mix,45,3,6f0f1e2e101f4f208ede37b241d6fc4c_31owtsxu6xlbj...,M,20,0.588,0.793,0.608,0.861,6.0,0.0991,-11.638,0.0,0.0673,124.988,0.118,Ben Klock
4,0,2025-10-1T01:00:00:000Z,223760,False,1lK5iIMKifrxERzS3iimJH,Breathe on Me,56,1,f9a2b3c8d4e1f7a0b5c6d9e2f1a3b8c,M,20,0.00209,0.709,0.565,0.0125,5.0,0.0901,-6.395,0.0,0.0471,112.203,0.571,Britney Spears


In [5]:
# Number of rows each position is expected to have.
# NOTE(review): presumably one row per user per position - confirm where 89 comes from.
EXPECTED_COUNT = 89

# Rows per position, ordered by position label. Computed once and reused
# below instead of being recomputed on every loop iteration.
position_counts = df['position'].value_counts().sort_index()
print(position_counts)

# Positions whose count differs from EXPECTED_COUNT. Iterating over
# (label, count) pairs reports the real position label even if the labels
# are not exactly 0..n-1 (a positional .iloc lookup would mislabel them).
for pos, count in position_counts[position_counts != EXPECTED_COUNT].items():
    print(f"count not as {EXPECTED_COUNT} :-", 'position :', pos, ',', 'count :', count)


position
0     89
1     89
2     89
3     89
4     89
5     89
6     89
7     89
8     89
9     89
10    89
11    89
12    89
13    89
14    89
15    89
16    89
17    89
18    89
19    89
20    89
21    89
22    89
23    89
24    89
25    89
26    89
27    89
28    89
29    89
30    89
31    89
32    89
33    89
34    89
35    89
36    89
37    89
38    89
39    89
40    89
41    89
42    89
43    89
44    89
45    89
46    89
47    89
48    89
49    89
50    71
51    51
Name: count, dtype: int64
count not as 89 :- position : 50 , count : 71
count not as 89 :- position : 51 , count : 51


In [None]:
# TODO: positions 50 and 51 are incomplete (71 and 51 rows instead of 89).
# Should we drop all rows at these two positions,
# or instead drop the first two rows (songs) of the corresponding users?