In [5]:
import numpy as np
import pandas as pd
from scipy import sparse
from matplotlib import pyplot as plt
import seaborn
%matplotlib inline

### Data reading

In [86]:
# Load the raw scrobble table; later cells read its "user_id", "artist_id"
# and "scrobbles" columns.
scrobbles_csv = "./data/lastfm_user_scrobbles.csv"
scrobs = pd.read_csv(scrobbles_csv)

In [88]:
# Largest single play count in the table. Series.max() is vectorised and
# skips NaN, whereas the builtin max() walks the column element by element.
scrobs["scrobbles"].max()

352698

### Build a SciPy CSR sparse matrix with users as rows, artists as columns, and scrobble counts as values

In [89]:
# Build the user x artist play-count matrix directly in CSR form from
# (value, (row, col)) triplets instead of filling a LIL matrix by fancy
# assignment, which is much slower for a table of this size.
#
# NOTE: this cell rebinds `scrobs` from the raw DataFrame to the sparse matrix
# (later cells expect that), so it cannot be re-run without first re-running
# the CSV-loading cell.

# The CSR constructor sums repeated (row, col) pairs; keep only the last
# occurrence so a repeated pair ends up with its final value, as it does with
# item assignment.
pairs = scrobs.drop_duplicates(subset=["user_id", "artist_id"], keep="last")
user_idx = pairs["user_id"].to_numpy()
artist_idx = pairs["artist_id"].to_numpy()
# float64 so the in-place division in the normalisation step is valid.
plays = pairs["scrobbles"].to_numpy(dtype=np.float64)

# Ids are used directly as 0-based positions, hence the +1 on each maximum.
shape = (int(user_idx.max()) + 1, int(artist_idx.max()) + 1)
scrobs = sparse.csr_matrix((plays, (user_idx, artist_idx)), shape=shape)
# Drop explicitly stored zeros so only real listens count as entries.
scrobs.eliminate_zeros()

### Convert listen counts to ratings

This is needed to make collaborative filtering work: rather than raw values like 100, 90, 80 for a user's top 3 artists, we want scores that represent how the user might rate those artists.

1. Divide by max (the approach used in the code below)
    * result: 1, .9, .8
    * ranking of a user's artists
2. Divide by sum
    * result: 0.37, 0.33, 0.29
    * proportion of total listens that artist occupies
    * users with many artists have lower ratings
    * ranking doesn't affect rating (TODO: complete this point)

In [90]:
# Largest stored value in each row of `scrobs`, flattened to one entry per row.
row_max = scrobs.max(axis=1).toarray().ravel()
# Number of stored entries in each row.
row_nnz = scrobs.getnnz(axis=1)
# CSR keeps `data` grouped row by row, so repeating every row's maximum once
# per stored entry gives a divisor aligned element-for-element with
# `scrobs.data`.
denom_max = np.repeat(row_max, row_nnz)
# In-place division: each row is rescaled so that its largest value becomes 1.
scrobs.data /= denom_max

In [95]:
# The 100 smallest normalised values. np.sort works on the array buffer
# directly (the builtin sorted() boxes every element into a Python object
# first); .tolist() shows them as a plain list of floats.
np.sort(scrobs.data)[:100].tolist()

[7.591112325689083e-06,
 1.2635196603659153e-05,
 1.5182224651378166e-05,
 1.8728345350688267e-05,
 2.358546192127173e-05,
 2.5270393207318306e-05,
 2.948182740158966e-05,
 2.948182740158966e-05,
 3.5378192881907595e-05,
 3.7456690701376534e-05,
 3.7456690701376534e-05,
 3.7456690701376534e-05,
 4.127455836222553e-05,
 4.601932811780948e-05,
 5.054078641463661e-05,
 5.896365480317932e-05,
 6.486002028349725e-05,
 7.075638576381519e-05,
 7.32949756294206e-05,
 7.32949756294206e-05,
 7.32949756294206e-05,
 7.32949756294206e-05,
 7.32949756294206e-05,
 7.32949756294206e-05,
 7.32949756294206e-05,
 7.491338140275307e-05,
 9.203865623561896e-05,
 9.203865623561896e-05,
 0.00010613457864572278,
 0.0001099424634441309,
 0.0001099424634441309,
 0.0001120309441260407,
 0.00012382367508667657,
 0.0001297200405669945,
 0.00013080786940142318,
 0.00013080786940142318,
 0.0001383891502906172,
 0.0001383891502906172,
 0.0001383891502906172,
 0.0001383891502906172,
 0.0001383891502906172,
 0.00013838

### ----PLAYGROUND----

In [80]:
# Toy check of the row-wise max normalisation on a 3x3 matrix.
# Each column of `toy` is a 1-based (row, col) coordinate and `toydata` holds
# the matching values.
toy = np.array([[1, 1, 2, 3, 3], [2, 3, 1, 1, 2]])
toydata = np.array([100, 10, 200, 300, 30])
lil = sparse.lil_matrix((3, 3))
lil[toy[0, :] - 1, toy[1, :] - 1] = toydata  # shift to 0-based indices
csr = lil.tocsr()
# CSR stores `data` row by row, so repeating each row's maximum once per
# stored entry yields a divisor aligned element-for-element with `csr.data`.
denom = np.repeat(csr.max(axis=1).toarray().ravel(), csr.getnnz(axis=1))
csr.data = csr.data / denom
# After normalisation every row of this toy matrix must peak at exactly 1.
assert np.allclose(csr.max(axis=1).toarray(), 1.0)
csr.toarray()

array([[0. , 1. , 0.1],
       [1. , 0. , 0. ],
       [1. , 0.1, 0. ]])