# LAB 4 : Collaborative Filtering on Last.fm Dataset

In this lab, we use the Last.fm dataset (https://www.last.fm/), 360K-users version (http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html). \
The dataset contains <user, artist, plays> tuples for about 360,000 users.\
The data format of the dataset is: <em> user-mboxsha1 \t musicbrainz-artist-id \t artist-name \t plays. </em>

We use the `implicit.datasets` module to download the Last.fm dataset locally.






In [5]:
import pandas as pd
import numpy as np
from implicit.datasets.lastfm import get_lastfm

# get_lastfm() hands back three objects:
#   * artists            - string array labeling each row of the matrix
#   * users              - string array labeling each column of the matrix
#   * artist_user_plays  - scipy sparse matrix holding the number of times
#                          each artist was played by each user
#                          (rows = artists, columns = users)
artists, users, artist_user_plays = get_lastfm()

# Uncomment to dump the raw sparse play-count matrix.
# print(artist_user_plays)

# Peek at the first ten artist labels.
print(artists[:10])



[' 2 ' ' 58725ab=>' ' 80lİ yillarin tÜrkÇe sÖzlÜ aŞk Şarkilari'
 ' amy winehouse' ' cours de la somme' ' fatboy slim' ' kanye west'
 ' mala rodriguez' ' mohamed lamine' ' oliver shanti & friends']


Weight the play-count matrix before training the model (BM25 weighting):
- Reduces the impact of users who have played the same artist thousands of times.
- Reduces the weight given to popular items.


In [3]:
from implicit.nearest_neighbours import bm25_weight

# Re-weight the raw play counts with BM25 before training, so that heavy
# repeat listeners and very popular artists do not dominate the model.
artist_user = bm25_weight(
    artist_user_plays,
    K1=100,
    B=0.8,
)
print(artist_user)

  (0, 73470)	464.12640081352487
  (0, 97856)	395.4254916528028
  (0, 235382)	917.7576795317125
  (0, 266072)	801.254668853217
  (1, 171865)	479.0537259553822
  (2, 180892)	701.6462524574976
  (3, 285031)	469.2366708878609
  (4, 15103)	274.6530366072618
  (5, 81700)	392.0203057167537
  (6, 284057)	624.2906299439671
  (7, 335320)	482.02241184218633
  (8, 182831)	763.8439378564416
  (9, 12461)	115.27320299060298
  (10, 78717)	79.93913331578632
  (10, 149431)	82.3790196718816
  (10, 220512)	80.7530809045117
  (10, 261830)	81.8842571288024
  (10, 280610)	80.1359339749864
  (10, 297146)	80.33706026790095
  (11, 296825)	764.3099590503471
  (12, 332435)	833.4437619557374
  (13, 41075)	510.69738868947763
  (14, 298571)	759.3034882234267
  (15, 295693)	791.1280662266632
  (16, 185703)	550.3715435013044
  :	:
  (292364, 4775)	178.0367505676167
  (292365, 147943)	970.515999738615
  (292366, 95230)	778.3288671797941
  (292367, 56086)	174.82443977973026
  (292367, 137277)	693.1595624862529
  (292367

Train an ALS model using implicit

In [4]:
from implicit.als import AlternatingLeastSquares

# implicit expects a user-item matrix, so transpose the artist-user matrix
# into user-artist form and store it as CSR.
user_artist = artist_user.T.tocsr()

# ALS model with 64 factors, regularization 0.05 and alpha 2.0.
model = AlternatingLeastSquares(
    factors=64,
    regularization=0.05,
    alpha=2.0,
)
model.fit(user_artist)

100%|██████████| 15/15 [10:02<00:00, 40.15s/it]


The result

In [4]:
# userid = 12345

# ids, scores = model.recommend(userid, user_artist[userid], N=10, filter_already_liked_items=False)
# # print(ids)
# df = pd.DataFrame({"artist": artists[ids], "score": scores, "already_liked": np.in1d(ids, user_artist[userid].indices),})

In [15]:
import gradio as gr

def music_recommend(userid):
    """Return the top-10 artist recommendations for one user.

    Args:
        userid: Row index of the user in ``user_artist``. Anything ``int()``
            accepts is allowed (the Gradio text box supplies a string).

    Returns:
        A pandas DataFrame with an ``artist`` column (artist names looked up
        in ``artists``) and a ``score`` column, both taken from
        ``model.recommend``. The call requests that artists the user already
        played be filtered out (``filter_already_liked_items=True``).

    Raises:
        gr.Error: If ``userid`` cannot be converted to an integer or is
            outside ``[0, number of users)``. Gradio displays the message
            in the UI instead of a generic error.
    """
    try:
        userid = int(userid)
    except (TypeError, ValueError):
        raise gr.Error(f"User id must be an integer, got {userid!r}.") from None

    n_users = user_artist.shape[0]
    # Reject out-of-range ids up front: a negative index could silently wrap
    # around to a different user, and a too-large one would only fail deep
    # inside the matrix/model code with an unhelpful message.
    if not 0 <= userid < n_users:
        raise gr.Error(
            f"User id must be between 0 and {n_users - 1}, got {userid}."
        )

    ids, scores = model.recommend(
        userid,
        user_artist[userid],
        N=10,
        filter_already_liked_items=True,
    )
    return pd.DataFrame({"artist": artists[ids], "score": scores})


# Wrap music_recommend in a minimal Gradio UI: a text box for the user id
# as input and a dataframe of recommendations as output.
demo = gr.Interface(fn=music_recommend, inputs="text", outputs="dataframe")
demo.launch()


Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.




Artist ID 108605 : epik high
Artist ID 246357 : super junior
Artist ID 181675 : maroon 5


In [9]:
# itemid = list(artists).index("maroon 5")
# print(f"Artist ID {itemid} : {artists[itemid]}")
# ids, scores = model.similar_items(itemid)

# # display the results using pandas for nicer formatting
# pd.DataFrame({"artist": artists[ids], "score": scores})

Artist ID 181675 : maroon 5


Unnamed: 0,artist,score
0,maroon 5,1.0
1,jason mraz,0.989504
2,the fray,0.9882
3,james blunt,0.988145
4,onerepublic,0.987358
5,black eyed peas,0.986483
6,justin timberlake,0.985625
7,keane,0.985596
8,coldplay,0.985125
9,mika,0.985


In [14]:
import gradio as gr

def music_similarity(artist_name):
    """Return the artists most similar to ``artist_name``.

    The name is first matched exactly against ``artists``; if that fails,
    a stripped, lower-cased version of the input is tried so that stray
    whitespace or capitalisation in the text box does not cause a miss.

    Args:
        artist_name: Artist name as typed by the user.

    Returns:
        A pandas DataFrame with an ``artist`` column (artist names looked up
        in ``artists``) and a ``score`` column, both taken from
        ``model.similar_items``.

    Raises:
        gr.Error: If no artist matches ``artist_name``. Gradio displays the
            message in the UI instead of a generic error.
    """
    # Vectorised lookup; avoids building a Python list of every artist
    # on each request.
    matches = np.flatnonzero(artists == artist_name)
    if matches.size == 0:
        matches = np.flatnonzero(artists == artist_name.strip().lower())
    if matches.size == 0:
        raise gr.Error(f"Artist not found: {artist_name!r}")

    # First match, same as list.index() would have returned.
    itemid = int(matches[0])
    print(f"Artist ID {itemid} : {artists[itemid]}")

    ids, scores = model.similar_items(itemid)
    return pd.DataFrame({"artist": artists[ids], "score": scores})


# Wrap music_similarity in a minimal Gradio UI: a text box for the artist
# name as input and a dataframe of similar artists as output.
demo = gr.Interface(fn=music_similarity, inputs="text", outputs="dataframe")
demo.launch()

Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "c:\Users\K\.pyenv\pyenv-win\versions\3.9.6\lib\site-packages\gradio\routes.py", line 516, in predict
    output = await route_utils.call_process_api(
  File "c:\Users\K\.pyenv\pyenv-win\versions\3.9.6\lib\site-packages\gradio\route_utils.py", line 219, in call_process_api
    output = await app.get_blocks().process_api(
  File "c:\Users\K\.pyenv\pyenv-win\versions\3.9.6\lib\site-packages\gradio\blocks.py", line 1437, in process_api
    result = await self.call_function(
  File "c:\Users\K\.pyenv\pyenv-win\versions\3.9.6\lib\site-packages\gradio\blocks.py", line 1109, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "c:\Users\K\.pyenv\pyenv-win\versions\3.9.6\lib\site-packages\anyio\to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "c:\Users\K\.pyenv\pyenv-win\versions\3.9.6\lib\site-packages\anyio\_backends\_asyncio.py", line 877, in run_sync_in_worker_thread
    retur

Artist ID 181675 : maroon 5
Artist ID 108605 : epik high
