In [1]:
import pandas as pd
import numpy as np
import pickle

import networkx as nx

In [18]:
# Processing
# 1. keep only first idx for non-unique artist names
# 2. fill NAs with secondary col vals
# 3. drop unused cols
# 4. convert to dict
# 5. parse tags

In [2]:
# Load the raw artist table. The literal strings 'TRUE' / 'FALSE' are parsed
# as booleans, and low_memory=False makes pandas process the file in one pass
# instead of in internal chunks.
df_original = pd.read_csv(
    'artists.csv',
    true_values=['TRUE'],
    false_values=['FALSE'],
    low_memory=False,
)

In [3]:
# Work on an independent copy so df_original keeps the data exactly as loaded.
df = df_original.copy(deep=True)
df.head(5)

Unnamed: 0,mbid,artist_mb,artist_lastfm,country_mb,country_lastfm,tags_mb,tags_lastfm,listeners_lastfm,scrobbles_lastfm,ambiguous_artist
0,cc197bad-dc9c-440d-a5b5-d52ba2e14234,Coldplay,Coldplay,United Kingdom,United Kingdom,rock; pop; alternative rock; british; uk; brit...,rock; alternative; britpop; alternative rock; ...,5381567.0,360111850.0,False
1,a74b1b7f-71a5-4011-9441-d0b5e4122711,Radiohead,Radiohead,United Kingdom,United Kingdom,rock; electronic; alternative rock; british; g...,alternative; alternative rock; rock; indie; el...,4732528.0,499548797.0,False
2,8bfac288-ccc5-448d-9573-c33ea2aa5c30,Red Hot Chili Peppers,Red Hot Chili Peppers,United States,United States,rock; alternative rock; 80s; 90s; rap; metal; ...,rock; alternative rock; alternative; Funk Rock...,4620835.0,293784041.0,False
3,73e5e69d-3554-40d8-8516-00cb38737a1c,Rihanna,Rihanna,United States,Barbados; United States,pop; dance; hip hop; reggae; contemporary r b;...,pop; rnb; female vocalists; dance; Hip-Hop; Ri...,4558193.0,199248986.0,False
4,b95ce3ff-3d05-4e87-9e01-c97b66af13d4,Eminem,Eminem,United States,United States,turkish; rap; american; hip-hop; hip hop; hiph...,rap; Hip-Hop; Eminem; hip hop; pop; american; ...,4517997.0,199507511.0,False


In [22]:
# Number of rows in the working frame before any cleaning.
df.shape[0]

1466083

In [23]:
# Show the dtype pandas inferred for each column.
df.dtypes

mbid                 object
artist_mb            object
artist_lastfm        object
country_mb           object
country_lastfm       object
tags_mb              object
tags_lastfm          object
listeners_lastfm    float64
scrobbles_lastfm    float64
ambiguous_artist       bool
dtype: object

In [24]:
# Count missing values per column before de-duplication.
df.isna().sum(axis=0)

mbid                      0
artist_mb                 8
artist_lastfm        479327
country_mb           803715
country_lastfm      1254585
tags_mb             1346137
tags_lastfm         1085008
listeners_lastfm     479323
scrobbles_lastfm     479323
ambiguous_artist          0
dtype: int64

In [4]:
# Keep only the first row for each Last.fm artist name.
# NOTE(review): drop_duplicates treats NaN as one repeated value, so every row
# without an artist_lastfm entry collapses into a single row here — confirm
# that discarding those rows is intended.
df.drop_duplicates(subset='artist_lastfm', keep='first', inplace=True)

In [5]:
# Count missing values per column after de-duplication.
df.isna().sum(axis=0)

mbid                     0
artist_mb                4
artist_lastfm            1
country_mb          537840
country_lastfm      776962
tags_mb             859283
tags_lastfm         605205
listeners_lastfm         0
scrobbles_lastfm         0
ambiguous_artist         0
dtype: int64

In [6]:
# Fill gaps in the MusicBrainz columns with the matching Last.fm values.
#
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df[col]: the in-place form operates on a Series
# selected from the frame (chained assignment), which emits a FutureWarning
# on recent pandas and leaves df unchanged once Copy-on-Write is active.
df['artist_mb'] = df['artist_mb'].fillna(df['artist_lastfm'])
df['country_mb'] = df['country_mb'].fillna(df['country_lastfm'])
df['tags_mb'] = df['tags_mb'].fillna(df['tags_lastfm'])

# Remaining missing values per column after the fill.
df.isna().sum()

mbid                     0
artist_mb                1
artist_lastfm            1
country_mb          469567
country_lastfm      776962
tags_mb             568592
tags_lastfm         605205
listeners_lastfm         0
scrobbles_lastfm         0
ambiguous_artist         0
dtype: int64

In [7]:
# Index the frame by artist name; set_index with a column label also removes
# that column from the frame.
df.set_index('artist_mb', inplace=True)

# Drop the identifier and Last.fm columns that are no longer needed.
# This cell mutates df in place, so it cannot be run twice without re-running
# the cells above (the dropped columns would be missing).
df.drop(
    columns=[
        'mbid',
        'artist_lastfm',
        'country_lastfm',
        'tags_lastfm',
        'scrobbles_lastfm',
        'ambiguous_artist',
    ],
    inplace=True,
)

# Scale listener counts by the largest absolute value in the column.
peak_listeners = df['listeners_lastfm'].abs().max()
df['listeners_lastfm'] = df['listeners_lastfm'].div(peak_listeners)

In [8]:
# Preview the frame after indexing, column pruning and listener scaling.
df.head()

Unnamed: 0_level_0,country_mb,tags_mb,listeners_lastfm
artist_mb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Coldplay,United Kingdom,rock; pop; alternative rock; british; uk; brit...,1.0
Radiohead,United Kingdom,rock; electronic; alternative rock; british; g...,0.879396
Red Hot Chili Peppers,United States,rock; alternative rock; 80s; 90s; rap; metal; ...,0.858641
Rihanna,United States,pop; dance; hip hop; reggae; contemporary r b;...,0.847001
Eminem,United States,turkish; rap; american; hip-hop; hip hop; hiph...,0.839532


In [14]:
#df_scaled_listens = df.copy()
#df_scaled_listens['listeners_lastfm'] = df_scaled_listens['listeners_lastfm'] /df_scaled_listens['listeners_lastfm'].abs().max()
#df_scaled_listens.head()

In [9]:
# Number of leading tags kept per artist.
MAX_TAGS = 5


def _split_semicolon_list(raw):
    """Split a ';'-separated cell into a list of whitespace-stripped strings.

    The value is passed through str() first, so non-string cells are handled
    too.
    NOTE(review): a missing value (NaN) therefore becomes the literal string
    'nan' — confirm that consumers of the dictionary expect this.
    """
    return [part.strip() for part in str(raw).split(';')]


# Build {column name: {artist: value}} straight from the frame. This is the
# same nested mapping that transposing and exporting with orient='index'
# yields, without materialising a 3 x N object-dtype copy of the whole table.
# Artist names that occur more than once in the index collapse to one key.
artist_dict = df.to_dict()

# Keep only the last country listed for each artist.
artist_dict['country_mb'] = {
    artist: _split_semicolon_list(raw_country)[-1]
    for artist, raw_country in artist_dict['country_mb'].items()
}

# Keep the first MAX_TAGS tags for each artist, in their original order.
artist_dict['tags_mb'] = {
    artist: _split_semicolon_list(raw_tags)[:MAX_TAGS]
    for artist, raw_tags in artist_dict['tags_mb'].items()
}

  artist_dict = df.T.to_dict(orient='index')


In [334]:
#country subgenres: 'alternative country', 'country rock', 'country pop', 'contemporary country', 'bro-country',
#                   'neo-traditionalist country', 'traditional country', 'alt-country', 'outlaw country'

4222


In [10]:
# Number of distinct artist keys in the parsed tag mapping.
len(artist_dict['tags_mb'])

956369

In [11]:
# Persist the parsed artist dictionary with pickle.
# The context manager closes the file even if pickle.dump raises, so no
# file handle is left open on failure.
filename = 'full_artists_dict'
with open(filename, 'wb') as outfile:
    pickle.dump(artist_dict, outfile)