In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
from matplotlib import pyplot as plt
import seaborn
%matplotlib inline

### Data reading

In [3]:
# Load the Last.fm user-scrobble table from CSV; later cells read its
# "user_id", "artist_id" and "scrobbles" columns.
scrobs = pd.read_csv("./data/lastfm_user_scrobbles.csv")

In [4]:
# Largest single scrobble count in the table, computed with the Series reduction.
scrobs["scrobbles"].max()

352698

### Build a SciPy CSR sparse matrix (via LIL) with users as rows, artists as columns, and scrobble counts as values

In [5]:
# Ids are used directly as 0-based row/column positions, so each dimension
# needs (largest id + 1) slots.
n_users = scrobs["user_id"].max() + 1
n_artists = scrobs["artist_id"].max() + 1

# Fill a LIL matrix (cheap to assign into), then convert to CSR for arithmetic.
scrobsLil = sparse.lil_matrix((n_users, n_artists))
scrobsLil[scrobs["user_id"], scrobs["artist_id"]] = scrobs["scrobbles"]

# NOTE: `scrobs` is rebound from the DataFrame to the CSR matrix here, so this
# cell can only be re-run after the CSV has been read again.
scrobs = scrobsLil.tocsr()

### Change #listens to ratings

Collaborative filtering works better with rating-like values than with raw play counts. Rather than values like 100, 90, 80 for a user's top 3 artists, we want scores representing how that user might rate those artists. Two candidate normalizations:

1. Divide by max
    * result: 1.0, 0.9, 0.8
    * reflects the ranking of a user's artists
2. Divide by sum
    * result: 0.37, 0.33, 0.30
    * proportion of the user's total listens that each artist occupies
    * users with many artists have lower ratings
    * ranking doesn't affect rating

In [6]:
# "Divide by max": scale every stored value by the largest value in its row.
row_max = scrobs.max(axis=1).toarray().ravel()  # one maximum per row

# CSR stores each row's values contiguously in .data, so repeating a row's
# maximum once per stored entry lines the divisors up with scrobs.data.
denom_max = np.repeat(row_max, scrobs.getnnz(axis=1))
scrobs.data /= denom_max


### Toy example: checking the divide-by-max normalization

In [7]:
# Toy check of the row-wise "divide by max" normalisation on a 3x3 matrix.
# `toy` holds 1-based (row, column) coordinates; `toydata` holds the matching values.
toy = np.array([[1,1,2,3,3], [2,3,1,1,2]])
toydata = np.array([100,10,200,300,30])

lil = sparse.lil_matrix((3,3))
lil[toy[0,:]-1,toy[1,:]-1] = toydata  # shift coordinates to 0-based positions
csr = lil.tocsr()

# One divisor per stored entry: each row's maximum repeated by that row's entry count.
denom = np.repeat(csr.max(axis=1).toarray().ravel(), csr.getnnz(axis=1))
csr.data = csr.data/denom

# Dense view of the result; every row's largest entry should now be 1.
csr.toarray()

array([[0. , 1. , 0.1],
       [1. , 0. , 0. ],
       [1. , 0.1, 0. ]])

### Clustering by tags

In [10]:
# Load the artists table. low_memory=False makes pandas read the whole file
# before inferring column dtypes instead of inferring them chunk by chunk.
big_set = pd.read_csv('./data/artists.csv', low_memory=False)

In [11]:
big_set

Unnamed: 0,mbid,artist_mb,artist_lastfm,country_mb,country_lastfm,tags_mb,tags_lastfm,listeners_lastfm,scrobbles_lastfm,ambiguous_artist
0,cc197bad-dc9c-440d-a5b5-d52ba2e14234,Coldplay,Coldplay,United Kingdom,United Kingdom,rock; pop; alternative rock; british; uk; brit...,rock; alternative; britpop; alternative rock; ...,5381567.0,360111850.0,False
1,a74b1b7f-71a5-4011-9441-d0b5e4122711,Radiohead,Radiohead,United Kingdom,United Kingdom,rock; electronic; alternative rock; british; g...,alternative; alternative rock; rock; indie; el...,4732528.0,499548797.0,False
2,8bfac288-ccc5-448d-9573-c33ea2aa5c30,Red Hot Chili Peppers,Red Hot Chili Peppers,United States,United States,rock; alternative rock; 80s; 90s; rap; metal; ...,rock; alternative rock; alternative; Funk Rock...,4620835.0,293784041.0,False
3,73e5e69d-3554-40d8-8516-00cb38737a1c,Rihanna,Rihanna,United States,Barbados; United States,pop; dance; hip hop; reggae; contemporary r b;...,pop; rnb; female vocalists; dance; Hip-Hop; Ri...,4558193.0,199248986.0,False
4,b95ce3ff-3d05-4e87-9e01-c97b66af13d4,Eminem,Eminem,United States,United States,turkish; rap; american; hip-hop; hip hop; hiph...,rap; Hip-Hop; Eminem; hip hop; pop; american; ...,4517997.0,199507511.0,False
...,...,...,...,...,...,...,...,...,...,...
1466078,1eab523e-98ff-4083-aa34-8922740bc696,정은지,,South Korea,South Korea,,,,,False
1466079,a18f0527-907e-42b0-8268-504966274581,남태현,,South Korea,,,,,,False
1466080,20a57e37-24b5-4301-855b-35076580fb88,헤일로,,South Korea,,,,,,False
1466081,83891a4d-1bf4-4abe-a483-5b3d9d614efa,서현진,,South Korea,South Korea,,,,,False


In [48]:
# Narrow the artists table to the two columns used for tag clustering.
artist_tags = big_set[['artist_mb', 'tags_mb']]

In [70]:
# NOTE(review): this cell was executed out of order (execution count 70). Its
# recorded output shows Python lists, which only exist after the tag-splitting
# cell further down has run; at this position in a top-to-bottom run the column
# still holds the raw ';'-separated strings.
artist_tags["tags_mb"].to_numpy()

array([list(['rock', 'pop', 'alternative rock', 'british', 'uk', 'britannique', 'britpop', 'pop rock', 'piano pop', 'piano rock', 'english', 'parlophone', 'rock and indie', 'ambient pop', 'pop/rock', 'chapel', 'post-britpop']),
       list(['rock', 'electronic', 'alternative rock', 'british', 'grunge', 'uk', 'britannique', 'britpop', 'art rock', 'experimental rock', 'english', 'chamber pop', 'parlophone', 'england', 'melancholic', 'oxford', 'bootleg', 'rock and indie', 'c’était mieux avant', 'art pop', 'nude', 'sacred cows']),
       list(['rock', 'alternative rock', '80s', '90s', 'rap', 'metal', 'american', 'crossover', 'usa', 'funk', 'funk rock', 'alternative', 'pop rock', 'funk metal', 'rap rock', '00s', 'dvd', 'pop and chart', '10s', 'funk rock tributo']),
       ..., list(['rock']), list(['chinese']), list(['chinese'])],
      dtype=object)

In [49]:
# Remove every row in which `artist_mb` or `tags_mb` is missing, then preview.
artist_tags = artist_tags.dropna(axis=0, how='any')
artist_tags

Unnamed: 0,artist_mb,tags_mb
0,Coldplay,rock; pop; alternative rock; british; uk; brit...
1,Radiohead,rock; electronic; alternative rock; british; g...
2,Red Hot Chili Peppers,rock; alternative rock; 80s; 90s; rap; metal; ...
3,Rihanna,pop; dance; hip hop; reggae; contemporary r b;...
4,Eminem,turkish; rap; american; hip-hop; hip hop; hiph...
...,...,...
1466059,Kazushige Kinoshita,japanese; violinist; japan; chamber music; fre...
1466061,水越恵子,likedis auto
1466063,大槻ケンヂ,rock
1466069,孫耀威,chinese


In [50]:
# Peek at the first remaining row's raw tag string. Positional .iloc[0] is used
# because dropna() leaves gaps in the index labels, so a label lookup of 0 only
# works for as long as the row labelled 0 survives the filtering.
artist_tags['tags_mb'].iloc[0]

'rock; pop; alternative rock; british; uk; britannique; britpop; pop rock; piano pop; piano rock; english; parlophone; rock and indie; ambient pop; pop/rock; chapel; post-britpop'

In [51]:
# Prototype the parsing on a single row: split the ';'-separated tag string and
# trim the whitespace around each tag. The row is picked by position because the
# index labels have gaps after dropna().
first_tags = artist_tags['tags_mb'].iloc[0]
tagray = [tag.strip() for tag in first_tags.split(';')]

In [52]:
tagray

['rock',
 'pop',
 'alternative rock',
 'british',
 'uk',
 'britannique',
 'britpop',
 'pop rock',
 'piano pop',
 'piano rock',
 'english',
 'parlophone',
 'rock and indie',
 'ambient pop',
 'pop/rock',
 'chapel',
 'post-britpop']

In [54]:
def _split_tags(raw):
    """Turn a ';'-separated tag string into a list of trimmed tag names.

    Values that are not strings (for example the lists produced by an earlier
    run of this cell) are returned unchanged, so re-executing the cell does not
    raise AttributeError on `list.split`.
    """
    if isinstance(raw, str):
        return [tag.strip() for tag in raw.split(';')]
    return raw


# Replace each raw tag string with its parsed list of tags, then preview.
artist_tags['tags_mb'] = artist_tags['tags_mb'].apply(_split_tags)
artist_tags

Unnamed: 0,artist_mb,tags_mb
0,Coldplay,"[rock, pop, alternative rock, british, uk, bri..."
1,Radiohead,"[rock, electronic, alternative rock, british, ..."
2,Red Hot Chili Peppers,"[rock, alternative rock, 80s, 90s, rap, metal,..."
3,Rihanna,"[pop, dance, hip hop, reggae, contemporary r b..."
4,Eminem,"[turkish, rap, american, hip-hop, hip hop, hip..."
...,...,...
1466059,Kazushige Kinoshita,"[japanese, violinist, japan, chamber music, fr..."
1466061,水越恵子,[likedis auto]
1466063,大槻ケンヂ,[rock]
1466069,孫耀威,[chinese]


In [61]:
# `artist_tags_len` is a second name for the same DataFrame object (no copy is
# made), so the new `tags_len` column also appears on `artist_tags`.
artist_tags_len = artist_tags
# Number of tags attached to each artist.
artist_tags_len['tags_len'] = artist_tags_len['tags_mb'].apply(len)

In [65]:
# Collect every distinct tag in first-seen order. dict.fromkeys de-duplicates
# with hash lookups, which avoids a linear `in` scan of the growing list for
# every tag occurrence.
tag_list = list(dict.fromkeys(
    tag
    for artist in artist_tags_len['tags_mb']
    for tag in artist
))

In [67]:
len(tag_list)

36091