In [64]:
import pandas as pd

artists = pd.read_csv('dataset/remapped/artists.csv')
tags = pd.read_csv('dataset/remapped/tags.csv')
user_artists = pd.read_csv('dataset/remapped/user_artists.csv')
user_tags = pd.read_csv('dataset/remapped/user_tags.csv')

print(f"Original length: {len(user_artists)}")
user_tags.head()

Original length: 92834


Unnamed: 0,userID,artistID,tagID,day,month,year
0,0,46,12,1,4,2009
1,0,46,14,1,4,2009
2,0,46,17,1,4,2009
3,0,46,20,1,4,2009
4,0,46,40,1,4,2009


In [65]:
user_artist_ids = set(user_artists['artistID'].unique())
user_tag_artist_ids = set(user_tags['artistID'].unique())

# Step 2: Find discrepancies
missing_in_user_tags = user_artist_ids - user_tag_artist_ids
missing_in_user_artists = user_tag_artist_ids - user_artist_ids

# Print debug information
print(f"Artists in user_artists but not in user_tags: {len(missing_in_user_tags)}")
print(f"Artists in user_tags but not in user_artists: {len(missing_in_user_artists)}")

user_artists = user_artists[~user_artists['artistID'].isin(missing_in_user_tags)]
user_tags = user_tags[~user_tags['artistID'].isin(missing_in_user_artists)]


Artists in user_artists but not in user_tags: 5499
Artists in user_tags but not in user_artists: 0


In [66]:
import numpy as np
from scipy.sparse import coo_matrix

def build_user_item_matrix(user_artists):
    users = user_artists['userID'].values
    items = user_artists['artistID'].values
    weights = user_artists['weight'].values

    num_users = users.max() + 1
    num_items = items.max() + 1

    user_item_matrix = coo_matrix((weights, (users, items)), shape=(num_users, num_items))
    return user_item_matrix

user_item_matrix = build_user_item_matrix(user_artists)
user_item_matrix.shape

(1892, 17631)

In [67]:
def build_item_tag_matrix(user_tags, tags):
    items = user_tags['artistID'].values
    tag_ids = user_tags['tagID'].values

    # Use binary 1 for tag occurrence
    data = np.ones_like(items)

    num_items = items.max() + 1
    num_tags = tags['tagID'].max() + 1

    item_tag_matrix = coo_matrix((data, (items, tag_ids)), shape=(num_items, num_tags))
    return item_tag_matrix

item_tag_matrix = build_item_tag_matrix(user_tags, tags)
item_tag_matrix.shape

(17631, 11946)

In [68]:
assert user_item_matrix.shape[1] == item_tag_matrix.shape[0], (
    "The number of items (artists) must be the same in both matrices!"
)

In [69]:
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k

train, test = random_train_test_split(user_item_matrix, test_percentage=0.2)

item_features = item_tag_matrix.tocsr()


In [70]:
model = LightFM(loss='warp')

In [71]:
EPOCHS = 10
THREADS = 16

In [72]:
model.fit(train, epochs=EPOCHS, num_threads=THREADS)
print('Finished fit')

Finished fit


In [74]:
precision = precision_at_k(model, train, k=5).mean()
print(f"Precision: {precision:.2f}")

Precision: 0.42


In [None]:
# Using lightfm dataset format
max_item_id = user_item_matrix.shape[1]
max_user_id = user_item_matrix.shape[0]