# LAB 4 : Collaborative Filtering on Last.fm Dataset

In this lab, we use the Last.fm Dataset (https://www.last.fm/)  - 360K Users (http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html) \
The dataset contains <user, artist, plays> tuples of 360,000 users.\
The data format of our database is: <em> user-mboxshal \t musicbrainz-artist-id \t artist-name \t plays. </em> 




In [4]:
import pandas as pd
import sys, os
import numpy as np
import gdown
from scipy.sparse import coo_matrix

parent_dir = os.getcwd()
path = os.path.dirname(parent_dir)
sys.path.append(path)

parent_dir = os.getcwd()
path = os.path.dirname(parent_dir)
sys.path.append(path)
# print(path)
file = path + r"\xlab-recommendation\usersha1-artmbid-artname-plays.tsv"
# print(file)

# Select an interesting data column (Select only "user", "artist", "artist-name", not select "artist-id").
user_artist_play_df = pd.read_table(file, usecols=[0, 2, 3], names=["user", "artist", "plays"])
user_artist_play_df

Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706
...,...,...,...
17535650,"sep 20, 2008",turbostaat,12
17535651,"sep 20, 2008",cuba missouri,11
17535652,"sep 20, 2008",little man tate,11
17535653,"sep 20, 2008",sigur rós,10


Checking and dropping missing values

In [5]:
print(user_artist_play_df.isnull().sum())
user_artist_play_df.dropna(inplace=True)

user        0
artist    204
plays       0
dtype: int64


Create sparse matrix

In [8]:
# dtype column "user" and "artist" to categoricals
user_artist_play_df["user"] = user_artist_play_df["user"].astype("category")
user_artist_play_df["artist"] = user_artist_play_df["artist"].astype("category")

# Using scipy to create sparse matrix in coordinate format
#bm-25 expect item-user (artist-user)
artist_user = coo_matrix(
    (
        user_artist_play_df["plays"].astype(float),
        (
            user_artist_play_df["artist"].cat.codes,
            user_artist_play_df["user"].cat.codes,
        ),
    )
)
print(artist_user)

  (45561, 0)	2137.0
  (90933, 0)	1099.0
  (185367, 0)	897.0
  (106704, 0)	717.0
  (155241, 0)	706.0
  (220128, 0)	691.0
  (177597, 0)	545.0
  (252797, 0)	507.0
  (259856, 0)	424.0
  (175802, 0)	403.0
  (278311, 0)	393.0
  (169906, 0)	387.0
  (126690, 0)	361.0
  (137062, 0)	358.0
  (253488, 0)	329.0
  (144293, 0)	316.0
  (37515, 0)	310.0
  (100639, 0)	302.0
  (19592, 0)	288.0
  (167984, 0)	281.0
  (230100, 0)	244.0
  (103711, 0)	232.0
  (184079, 0)	231.0
  (19356, 0)	229.0
  (144238, 0)	227.0
  :	:
  (102595, 358867)	23.0
  (218091, 358867)	23.0
  (152622, 358867)	21.0
  (118823, 358867)	20.0
  (73818, 358867)	19.0
  (186549, 358867)	18.0
  (251158, 358867)	16.0
  (41969, 358867)	15.0
  (68119, 358867)	14.0
  (160643, 358867)	14.0
  (161724, 358867)	14.0
  (258344, 358867)	14.0
  (263682, 358867)	14.0
  (73136, 358867)	12.0
  (99429, 358867)	12.0
  (178275, 358867)	12.0
  (217883, 358867)	12.0
  (254532, 358867)	12.0
  (265438, 358867)	12.0
  (263239, 358867)	12.0
  (271740, 358867)	12.

Weight matrix before training a model 
- Reducing the impact of users who have played the same artist thousands of times.
- Reducing the weight given to popular items


In [10]:
from implicit.nearest_neighbours import bm25_weight

artist_user = bm25_weight(artist_user, K1=100, B=0.8)
print(artist_user)

  (45561, 0)	608.3438475999682
  (90933, 0)	23.813448240322575
  (185367, 0)	247.46357050219146
  (106704, 0)	120.43845808174673
  (155241, 0)	298.7269873866248
  (220128, 0)	4.027344011217231
  (177597, 0)	433.30041900290246
  (252797, 0)	41.29197087071654
  (259856, 0)	397.83733173875993
  (175802, 0)	343.1625523273984
  (278311, 0)	51.04480251042432
  (169906, 0)	84.86181127768565
  (126690, 0)	11.217272456547382
  (137062, 0)	58.777876988163214
  (253488, 0)	510.99420983681904
  (144293, 0)	72.06845837090692
  (37515, 0)	226.41439479505402
  (100639, 0)	8.027965534731749
  (19592, 0)	401.44013272259446
  (167984, 0)	32.0690849851414
  (230100, 0)	22.133016014418757
  (103711, 0)	13.69359548930252
  (184079, 0)	29.522364611454623
  (19356, 0)	293.44140920719514
  (144238, 0)	2.6842524284743003
  :	:
  (102595, 358867)	2.244412075389815
  (218091, 358867)	0.4273118269779581
  (152622, 358867)	5.715727736081545
  (118823, 358867)	0.3403505503015668
  (73818, 358867)	2.4384682461686915

Train an ALS model using implicit

In [12]:
from implicit.als import AlternatingLeastSquares

model = AlternatingLeastSquares(factors=64, regularization=0.05, alpha=2.0)
# Implicit expect user-item (user-artist)
user_artist = artist_user.T.tocsr()
model.fit(user_artist)

100%|██████████| 15/15 [07:55<00:00, 31.71s/it]


The result

In [15]:
userid = 12345
artists = user_artist_play_df["artist"]
ids, scores = model.recommend(userid, user_artist[userid], N=10, filter_already_liked_items=False)
# print(ids)
pd.DataFrame({"artist": artists[ids], "score": scores, "already_liked": np.in1d(ids, user_artist[userid].indices),})

Unnamed: 0,artist,score,already_liked
277670,cassie,0.993481,True
216200,massive attack,0.991298,True
192476,tata young,0.988862,True
204693,tabula rasa,0.975979,False
254160,sumi jo,0.970516,True
224982,bettie serveert,0.957951,True
163642,planet funk,0.957093,True
270553,stereotyp meets alhaca,0.957075,True
79669,led zeppelin,0.954306,True
166138,elvis presley,0.949569,False


In [22]:
itemid = 190297
print(f"Artist ID {itemid} : {artists[itemid]}")
ids, scores = model.similar_items(itemid)

# display the results using pandas for nicer formatting
pd.DataFrame({"artist": artists[ids], "score": scores})

Artist ID 190297 : the beatles


Unnamed: 0,artist,score
190297,the beatles,1.0
175951,creeper lagoon,0.938361
214133,alabína,0.938243
98664,ugk,0.936385
270849,van morrison,0.932441
167606,enigma,0.931289
54427,the pretenders,0.929949
188725,perfect dark,0.928534
265847,the kinks,0.928146
23476,neil young,0.928046
