# LAB 4 : Collaborative Filtering on Last.fm Dataset

In this lab, we use the Last.fm Dataset (https://www.last.fm/)  - 360K Users (http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html) \
The dataset contains <user, artist, plays> tuples of 360,000 users.\
The data format of our database is: <em> user-mboxshal \t musicbrainz-artist-id \t artist-name \t plays. </em> 




In [112]:
import pandas as pd
import sys, os
import numpy as np
import gdown
from scipy.sparse import coo_matrix

parent_dir = os.getcwd()
path = os.path.dirname(parent_dir)
sys.path.append(path)

parent_dir = os.getcwd()
path = os.path.dirname(parent_dir)
sys.path.append(path)
# print(path)
file = path + r"\xlab-recommendation\usersha1-artmbid-artname-plays.tsv"
# print(file)

# Select an interesting data column (Select only "artist-name", not select "artist-id").
user_artist_play_df = pd.read_table(file, usecols=[0, 2, 3], names=["user", "artist", "plays"])
user_artist_play_df

Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706
...,...,...,...
17535650,"sep 20, 2008",turbostaat,12
17535651,"sep 20, 2008",cuba missouri,11
17535652,"sep 20, 2008",little man tate,11
17535653,"sep 20, 2008",sigur rós,10


Checking and dropping missing values

In [113]:
print(user_artist_play_df.isnull().sum())
user_artist_play_df.dropna(inplace=True)

user        0
artist    204
plays       0
dtype: int64


Create sparse matrix

In [116]:
# dtype column "user" and "artist" to categoricals
user_artist_play_df["user"] = user_artist_play_df["user"].astype("category")
user_artist_play_df["artist"] = user_artist_play_df["artist"].astype("category")

# Using scipy to create sparse matrix in coordinate format
plays = coo_matrix(
    (
        user_artist_play_df["plays"].astype(float),
        (
            user_artist_play_df["user"].cat.codes.copy(),
            user_artist_play_df["artist"].cat.codes.copy(),
        ),
    )
)
print(plays)

  (0, 45561)	2137.0
  (0, 90933)	1099.0
  (0, 185367)	897.0
  (0, 106704)	717.0
  (0, 155241)	706.0
  (0, 220128)	691.0
  (0, 177597)	545.0
  (0, 252797)	507.0
  (0, 259856)	424.0
  (0, 175802)	403.0
  (0, 278311)	393.0
  (0, 169906)	387.0
  (0, 126690)	361.0
  (0, 137062)	358.0
  (0, 253488)	329.0
  (0, 144293)	316.0
  (0, 37515)	310.0
  (0, 100639)	302.0
  (0, 19592)	288.0
  (0, 167984)	281.0
  (0, 230100)	244.0
  (0, 103711)	232.0
  (0, 184079)	231.0
  (0, 19356)	229.0
  (0, 144238)	227.0
  :	:
  (358867, 102595)	23.0
  (358867, 218091)	23.0
  (358867, 152622)	21.0
  (358867, 118823)	20.0
  (358867, 73818)	19.0
  (358867, 186549)	18.0
  (358867, 251158)	16.0
  (358867, 41969)	15.0
  (358867, 68119)	14.0
  (358867, 160643)	14.0
  (358867, 161724)	14.0
  (358867, 258344)	14.0
  (358867, 263682)	14.0
  (358867, 73136)	12.0
  (358867, 99429)	12.0
  (358867, 178275)	12.0
  (358867, 217883)	12.0
  (358867, 254532)	12.0
  (358867, 265438)	12.0
  (358867, 263239)	12.0
  (358867, 271740)	12.

Weight matrix before training a model 
- Reducing the impact of users who have played the same artist thousands of times.
- Reducing the weight given to popular items


In [117]:
from implicit.nearest_neighbours import bm25_weight

plays = bm25_weight(plays, K1=100, B=0.8)
print(plays)

  (0, 45561)	776.3481353071093
  (0, 90933)	332.6857830957528
  (0, 185367)	517.1930792453228
  (0, 106704)	460.35869520636925
  (0, 155241)	551.4702766798206
  (0, 220128)	165.91979931560545
  (0, 177597)	593.8647735405481
  (0, 252797)	372.1647631439464
  (0, 259856)	549.0157144825586
  (0, 175802)	528.0973097732709
  (0, 278311)	371.0208919895359
  (0, 169906)	416.763906328725
  (0, 126690)	231.46517098971526
  (0, 137062)	378.0924070447568
  (0, 253488)	579.3425478829889
  (0, 144293)	389.7787644244667
  (0, 37515)	445.8621705318491
  (0, 100639)	252.80574613838223
  (0, 19592)	517.4123486228738
  (0, 167984)	298.6027434278712
  (0, 230100)	314.4799916500819
  (0, 103711)	272.46958942263416
  (0, 184079)	307.9087450145287
  (0, 19356)	441.75940983258073
  (0, 144238)	163.45991300802598
  :	:
  (358867, 102595)	187.17894437591065
  (358867, 218091)	123.46887222105828
  (358867, 152622)	216.28011890865318
  (358867, 118823)	105.4638376811993
  (358867, 73818)	169.4282359857642
  (358

Train an ALS model using implicit

In [123]:
from implicit.als import AlternatingLeastSquares

model = AlternatingLeastSquares(factors=64, regularization=0.05, alpha=2.0)
# plays = plays.tocsr()
model.fit(plays)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:18<?, ?it/s]


KeyboardInterrupt: 

The result

In [122]:
userid = 8500
artists = user_artist_play_df["artist"]
ids, scores = model.recommend(userid, plays[userid], N=10, filter_already_liked_items=False)
# print(ids)
pd.DataFrame({"artist": artists[ids], "score": scores, "already_liked": np.in1d(ids, plays[userid].indices),})

TypeError: 'coo_matrix' object is not subscriptable

In [62]:
itemid = 125284
print(f"Artist ID {itemid} : {artists[itemid]}")
ids, scores = model.similar_items(itemid)

# display the results using pandas for nicer formatting
pd.DataFrame({"artist": artists[ids], "score": scores})

Artist ID 125284 : elvis presley


Unnamed: 0,artist,score
125284,elvis presley,1.0
60985,led zeppelin,1.0
8988,boondox,0.999999
184648,air,0.999997
69596,markus krunegård,0.734395
33904,humane,0.650002
195002,billie ray martin,0.643661
7683,nickelback,0.642985
52880,oasis,0.638421
18023,crass,0.632795
