In [31]:
import pandas as pd
import numpy as np
import glob
import json

from tqdm.notebook import tqdm

## DF Raw

Archivos descargados desde `https://www.kaggle.com/adityak80/spotify-millions-playlist?select=spotify_million_playlist_dataset_challenge`

In [37]:
archivos = glob.glob(
    '/media/giani/Gianicosas/spotify/data/*.json'
)

sample_size = 2000

In [38]:
def leer_json(ruta):
    json_data = {}
    
    with open(ruta) as f:
        json_data = json.load(f)
        
    return json_data.get('playlists')

De cada archivo `json`, se toman muestras al azar (según valor de `sample_size`)

In [39]:
df_spotify = pd.concat([
    pd.DataFrame(leer_json(f)).sample(n=int(sample_size / 1000), random_state=42)
    for f in tqdm(archivos, total=len(archivos))
]).reset_index(drop=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [40]:
print(df_spotify.shape)
df_spotify.head()

(2000, 12)


Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,music,False,521,1506643200,127,108,1,"[{'pos': 0, 'artist_name': 'Charli XCX', 'trac...",45,27364349,85,
1,fuego,False,737,1509062400,35,29,1,"[{'pos': 0, 'artist_name': 'Dropout', 'track_u...",17,7650968,26,
2,yoga,False,1521,1506470400,32,31,1,"[{'pos': 0, 'artist_name': 'Gorillaz', 'track_...",3,7031323,29,
3,yaaas,False,1737,1490313600,44,32,2,"[{'pos': 0, 'artist_name': 'Gallant', 'track_u...",11,11124040,27,
4,Gym Time,False,10521,1406764800,82,60,1,"[{'pos': 0, 'artist_name': 'OutKast', 'track_u...",27,21092031,47,


## Expansión de filas

Se expande la columna `tracks` de forma tal que se genera una fila por cada canción dentro de cada lista


In [41]:
df_explode = df_spotify.explode('tracks').reset_index(drop=True)

df_explode.shape

(131680, 12)

In [42]:
df_explode.head()

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,music,False,521,1506643200,127,108,1,"{'pos': 0, 'artist_name': 'Charli XCX', 'track...",45,27364349,85,
1,music,False,521,1506643200,127,108,1,"{'pos': 1, 'artist_name': 'The Chainsmokers', ...",45,27364349,85,
2,music,False,521,1506643200,127,108,1,"{'pos': 2, 'artist_name': 'MAX', 'track_uri': ...",45,27364349,85,
3,music,False,521,1506643200,127,108,1,"{'pos': 3, 'artist_name': 'Post Malone', 'trac...",45,27364349,85,
4,music,False,521,1506643200,127,108,1,"{'pos': 4, 'artist_name': 'AJR', 'track_uri': ...",45,27364349,85,


In [43]:
df_explode['tracks'][0]

{'pos': 0,
 'artist_name': 'Charli XCX',
 'track_uri': 'spotify:track:0y8Wf6ltWpGCpgqVa21QNX',
 'artist_uri': 'spotify:artist:25uiPmTg16RbhZWAqwLBy5',
 'track_name': 'After The Afterparty (feat. Lil Yachty)',
 'album_uri': 'spotify:album:4Txd0wc4WAgnoxXx0SLAlw',
 'duration_ms': 219480,
 'album_name': 'After The Afterparty (feat. Lil Yachty)'}

## Expansión de columnas

Se crean columnas a partir de los atributos dentro de cada track

In [44]:
def get_track_detail(t):
    return (t.get('pos'), t.get('artist_name'), t.get('track_uri'), t.get('artist_uri'), t.get('track_name'),
             t.get('album_uri'), t.get('duration_ms'), t.get('album_name'))

In [45]:
rangos = list(range(0, df_explode.shape[0], 100000))
new_cols = [
    'pos', 'artist_name', 'track_uri', 'artist_uri', 'track_name', 'album_uri', 'duration_ms', 'album_name'
]

for i, r in tqdm(enumerate(rangos), total=len(rangos)):

    if i < len(rangos) - 1:
        df_explode.loc[r:rangos[i+1], new_cols] = [
            get_track_detail(t)
            for t in df_explode.loc[r:rangos[i+1], 'tracks']
        ]
        
    else:
        df_explode.loc[r:, new_cols] = [
            get_track_detail(t)
            for t in df_explode.loc[r:, 'tracks']
        ]

  0%|          | 0/2 [00:00<?, ?it/s]

## Set Final

In [46]:
df_explode.head()

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description,pos,artist_name,track_uri,artist_uri,track_name,album_uri,album_name
0,music,False,521,1506643200,127,108,1,"{'pos': 0, 'artist_name': 'Charli XCX', 'track...",45,219480,85,,0.0,Charli XCX,spotify:track:0y8Wf6ltWpGCpgqVa21QNX,spotify:artist:25uiPmTg16RbhZWAqwLBy5,After The Afterparty (feat. Lil Yachty),spotify:album:4Txd0wc4WAgnoxXx0SLAlw,After The Afterparty (feat. Lil Yachty)
1,music,False,521,1506643200,127,108,1,"{'pos': 1, 'artist_name': 'The Chainsmokers', ...",45,247160,85,,1.0,The Chainsmokers,spotify:track:6RUKPb4LETWmmr3iAEQktW,spotify:artist:69GGBxA162lTqCwzJG5jLp,Something Just Like This,spotify:album:4JPguzRps3kuWDD5GS6oXr,Memories...Do Not Open
2,music,False,521,1506643200,127,108,1,"{'pos': 2, 'artist_name': 'MAX', 'track_uri': ...",45,223846,85,,2.0,MAX,spotify:track:1sYSP7gKa5kdKIfhANfori,spotify:artist:1bqxdqvUtPWZri43cKHac8,Lights Down Low,spotify:album:2qmysBdftCYXRjYHTVK3qo,Hell's Kitchen Angel
3,music,False,521,1506643200,127,108,1,"{'pos': 3, 'artist_name': 'Post Malone', 'trac...",45,220293,85,,3.0,Post Malone,spotify:track:3a1lNhkSLSkpJE4MSHpDu9,spotify:artist:246dkjvS1zLTtiykXe5h60,Congratulations,spotify:album:5s0rmjP8XOPhP6HhqOhuyC,Stoney
4,music,False,521,1506643200,127,108,1,"{'pos': 4, 'artist_name': 'AJR', 'track_uri': ...",45,201160,85,,4.0,AJR,spotify:track:3E2Zh20GDCR9B1EYjfXWyv,spotify:artist:6s22t5Y3prQHyaHWUN1R1C,Weak,spotify:album:7LACXphpLTluKLFqHIZ1Qq,The Click


In [47]:
df_explode.shape

(131680, 19)

In [48]:
df_explode.isna().sum()

name                  0
collaborative         0
pid                   0
modified_at           0
num_tracks            0
num_albums            0
num_followers         0
tracks                0
num_edits             0
duration_ms           0
num_artists           0
description      129607
pos                   0
artist_name           0
track_uri             0
artist_uri            0
track_name            0
album_uri             0
album_name            0
dtype: int64

## Serialización

In [49]:
df_explode.drop(columns=['tracks']).to_csv(
    f'../data/{sample_size}_spotify_sample.csv.gz',
    compression='gzip',
    index=False
)