In [31]:
import pandas as pd

artists = pd.read_csv('dataset/remapped/artists.csv')
tags = pd.read_csv('dataset/remapped/tags.csv')
user_artists = pd.read_csv('dataset/remapped/user_artists.csv')
user_tags = pd.read_csv('dataset/remapped/user_tags.csv')

print(f"Original length: {len(user_artists)}")
artists.head()

Original length: 92834


Unnamed: 0,id,name,url,pictureURL
0,0,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
1,1,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
2,2,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...
3,3,Moi dix Mois,http://www.last.fm/music/Moi+dix+Mois,http://userserve-ak.last.fm/serve/252/54697835...
4,4,Bella Morte,http://www.last.fm/music/Bella+Morte,http://userserve-ak.last.fm/serve/252/14789013...


In [3]:
user_artist_ids = set(user_artists['artistID'].unique())
user_tag_artist_ids = set(user_tags['artistID'].unique())

# Step 2: Find discrepancies
missing_in_user_tags = user_artist_ids - user_tag_artist_ids
missing_in_user_artists = user_tag_artist_ids - user_artist_ids

# Print debug information
print(f"Artists in user_artists but not in user_tags: {len(missing_in_user_tags)}")
print(f"Artists in user_tags but not in user_artists: {len(missing_in_user_artists)}")

user_artists = user_artists[~user_artists['artistID'].isin(missing_in_user_tags)]
user_tags = user_tags[~user_tags['artistID'].isin(missing_in_user_artists)]


Artists in user_artists but not in user_tags: 5499
Artists in user_tags but not in user_artists: 0


In [4]:
import numpy as np
from scipy.sparse import coo_matrix

def build_user_item_matrix(user_artists):
    users = user_artists['userID'].values
    items = user_artists['artistID'].values
    weights = user_artists['weight'].values

    num_users = users.max() + 1
    num_items = items.max() + 1

    user_item_matrix = coo_matrix((weights, (users, items)), shape=(num_users, num_items))
    return user_item_matrix

user_item_matrix = build_user_item_matrix(user_artists)
user_item_matrix.shape

(1892, 17631)

In [5]:
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k

train, test = random_train_test_split(user_item_matrix, test_percentage=0.2)


In [6]:
model = LightFM(loss='warp')

In [7]:
EPOCHS = 10
THREADS = 16

In [75]:
model.fit(train, epochs=EPOCHS, num_threads=THREADS)
print('Finished fit')

Finished fit


In [17]:
precision = precision_at_k(model, train, k=5).mean()
print(f"Precision: {precision:.2f}")

Precision: 0.43


In [8]:
# Using lightfm dataset format
max_item_id = user_item_matrix.shape[1]
max_user_id = user_item_matrix.shape[0]

In [9]:
# make a prediction
import numpy as np

def recommend_artists_for_user(model, user_id, user_artists, artists, max_item_id, k=5, threads=1):
    """
    Recommend artists for a specific user.

    Parameters:
        model (LightFM): Trained LightFM model.
        user_id (int): User ID for which to make recommendations.
        user_artists (DataFrame): DataFrame with user-artist interactions. Should contain 'userID' and 'artistID' columns.
        artists (DataFrame): DataFrame with artist information. Should contain 'artistID' and 'name' columns.
        max_item_id (int): Maximum artist ID in the dataset.
        k (int): Number of recommendations to return.
        threads (int): Number of threads to use for prediction.

    Returns:
        list: List of top artist names recommended for the user.
    """
    # Get the user's interacted artist IDs
    user_artist_ids = user_artists[user_artists['userID'] == user_id]['artistID'].values

    # Identify artist IDs that the user hasn't interacted with
    all_artist_ids = np.arange(max_item_id)
    artist_ids_to_predict = np.setdiff1d(all_artist_ids, user_artist_ids)

    # Create a single-user array for prediction
    user_ids = np.full(len(artist_ids_to_predict), user_id, dtype=np.int32)

    # Generate predictions for the user
    predictions = model.predict(
        user_ids,
        artist_ids_to_predict,
        num_threads=threads
    )

    # Get the top-k artist IDs sorted by prediction scores
    top_artist_ids = artist_ids_to_predict[np.argsort(-predictions)][:k]
    
    # Retrieve artist names for the top-k predictions
    top_artists = artists.set_index('id').loc[top_artist_ids]['name'].values
    return top_artists.tolist()


user_to_recommend = 2
# Example usage
top_artists = recommend_artists_for_user(
    model=model,
    user_id=user_to_recommend,
    user_artists=user_artists,
    artists=artists,
    max_item_id=max_item_id,
    k=10,
    threads=THREADS
)

print("Top recommended artists:", top_artists)


ValueError: You must fit the model before trying to obtain predictions.

In [10]:
# Now, we show top 10 artists for the previous user (not predictions)
user_artists[user_artists['userID'] == user_to_recommend].merge(artists, left_on='artistID', right_on='id')[['name', 'weight']].sort_values(by='weight', ascending=False)



Unnamed: 0,name,weight
5,Depeche Mode,4983
7,Deep Forest,1807
8,Porcupine Tree,1208
9,De/Vision,903
10,Radiohead,826
11,Robbie Williams,777
12,VAST,743
13,Michael Jackson,732
14,עברי לידר,729
15,The Cure,724


In [11]:
from lightfm.data import Dataset

dataset = Dataset(user_identity_features=False, item_identity_features=True)

dataset.fit(users=user_artists['userID'].unique(),
            items=user_artists['artistID'].unique())

(interactions, weights) = dataset.build_interactions(
    zip(user_artists['userID'], user_artists['artistID'], user_artists['weight'])
)

print("Interactions Matrix Shape:", interactions.shape)
print("Weights Matrix Shape:", weights.shape)


Interactions Matrix Shape: (1892, 12133)
Weights Matrix Shape: (1892, 12133)


In [12]:
train, test = random_train_test_split(interactions, test_percentage=0.2)

In [86]:
model = LightFM(loss='warp')
model.fit(train, epochs=EPOCHS, num_threads=THREADS)


<lightfm.lightfm.LightFM at 0x7f2f7d3a3dd0>

In [90]:
precision = precision_at_k(model, train, k=5).mean()
print(f"Precision: {precision:.2f}")

Precision: 0.44


In [13]:
missing_tags = set(user_tags['tagID']) - set(tags['tagID'])
print("Missing tagIDs:", missing_tags)

# Filter out rows with missing tagIDs
valid_tagIDs = set(tags['tagID'])
user_tags = user_tags[user_tags['tagID'].isin(valid_tagIDs)]



Missing tagIDs: set()


In [14]:
import pandas as pd
from collections import Counter
from lightfm.data import Dataset

# Step 1: Map tagID to tagValue
tag_map = dict(zip(tags['tagID'], tags['tagValue']))

# Step 2: Aggregate tags for artists
artist_tags = (
    user_tags.groupby('artistID')['tagID']
    .apply(lambda x: Counter(x).most_common(10))  # Get top 10 tags per artist
    .reset_index(name='top_tags')
)

artist_tags['top_tags'] = artist_tags['top_tags'].apply(
    lambda x: [tag_map[tag[0]] for tag in x]  # Replace tagID with tagValue
)

# Step 3: Aggregate tags for users
user_tags_agg = (
    user_tags.groupby('userID')['tagID']
    .apply(lambda x: Counter(x).most_common(10))  # Get top 10 tags per user
    .reset_index(name='top_tags')
)

user_tags_agg['top_tags'] = user_tags_agg['top_tags'].apply(
    lambda x: [tag_map[tag[0]] for tag in x]  # Replace tagID with tagValue
)


In [15]:
print(artist_tags[artist_tags['top_tags'].apply(len) == 0])

print(artist_tags)

Empty DataFrame
Columns: [artistID, top_tags]
Index: []
       artistID                                           top_tags
0             0  [j-rock, visual kei, gothic, japanese, weeabo,...
1             1  [darkwave, german, gothic, seen live, industri...
2             2  [black metal, true norwegian black metal, norw...
3             3  [j-rock, japanese, visual kei, gothic, metal, ...
4             4  [darkwave, gothic, gothic rock, deathrock, cov...
...         ...                                                ...
12128     17623  [alternative, electronica, trip beat, 80s, noise]
12129     17625  [favorite, electronic, trip-hop, rock, alterna...
12130     17626                                  [industrial, ebm]
12131     17627                         [experimental, dead music]
12132     17630  [chillout, ambient, downtempo, avant-garde, tr...

[12133 rows x 2 columns]


In [16]:
# Drop all artists that do not have interactions
user_artists = user_artists[user_artists['artistID'].isin(artist_tags['artistID'])]

# Step 4: Initialize Dataset and fit user and item mappings
dataset = Dataset()
dataset.fit(users=user_artists['userID'].unique(),
            items=user_artists['artistID'].unique(),
            user_features=tags['tagValue'].unique(),
            item_features=tags['tagValue'].unique())


# Step 5: Build item features (artist embeddings)
artist_features = dataset.build_item_features(
    [(row['artistID'], row['top_tags']) for _, row in artist_tags.iterrows()]
)

# Step 6: Build user features (user embeddings)
user_features = dataset.build_user_features(
    [(row['userID'], row['top_tags']) for _, row in user_tags_agg.iterrows()]
)


(interactions, weights) = dataset.build_interactions(
    zip(user_artists['userID'], user_artists['artistID'], user_artists['weight'])
)

# Print feature matrix shapes
print("Artist Features Shape:", artist_features.shape)
print("User Features Shape:", user_features.shape)
print("Interactions Matrix Shape:", train.shape)


Artist Features Shape: (12133, 24079)
User Features Shape: (1892, 13838)
Interactions Matrix Shape: (1892, 12133)


In [17]:
train, test = random_train_test_split(interactions, test_percentage=0.2)

In [149]:
model = LightFM(loss='warp')
model.fit(train, item_features=artist_features, user_features=user_features, epochs=10, num_threads=4)

<lightfm.lightfm.LightFM at 0x7f2f7d3a2240>

In [150]:
precision = precision_at_k(model, train,item_features=artist_features, user_features=user_features, k=5).mean()
print(f"Precision: {precision:.2f}")

Precision: 0.40


In [151]:
# ok, remove user features
model = LightFM(loss='warp')
model.fit(train, item_features=artist_features, epochs=10, num_threads=4)

<lightfm.lightfm.LightFM at 0x7f2f7d3a14f0>

In [152]:
precision = precision_at_k(model, train, item_features=artist_features, k=5).mean()
print(f"Precision: {precision:.2f}")


Precision: 0.43


In [22]:
longer = LightFM(loss='warp')
longer.fit(train,sample_weight= item_features=artist_features, epochs=30, num_threads=4, verbose=True)

<lightfm.lightfm.LightFM at 0x7f757dbc7b00>

In [45]:
precision = precision_at_k(longer, train, item_features=artist_features, k=5, num_threads=4).mean()
precision_test = precision_at_k(longer, test, item_features=artist_features, k=5,num_threads=4).mean()
recall = recall_at_k(longer, train, item_features=artist_features, k=5, num_threads=4).mean()
print(f"Precision: {precision:.2f}")
print(f"Precision Test: {precision_test:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 0.47
Precision Test: 0.12
Recall: 0.07


In [46]:
# let's get item similarity
tagsValues = tags['tagValue']

# Get top 10 tags for red hot chili peppers and rage against the machine
rhcp_id = artists[artists['name'] == 'Red Hot Chili Peppers']['id'].values[0]
ratm_id = artists[artists['name'] == 'Rage Against the Machine']['id'].values[0]

rhcp_id, ratm_id

(214, 1959)

In [47]:
rhcp_tags = artist_tags[artist_tags['artistID'] == rhcp_id]['top_tags'].values[0]
ratm_tags = artist_tags[artist_tags['artistID'] == ratm_id]['top_tags'].values[0]

rhcp_tags, ratm_tags

(['rock',
  'alternative rock',
  'alternative',
  'funk',
  '90s',
  'funk rock',
  'classic rock',
  'american',
  'punk',
  'cover'],
 ['alternative',
  'rock',
  'alternative rock',
  'metal',
  '90s',
  'nu metal',
  'hard rock',
  'rapcore',
  'alternative metal',
  'political'])

In [48]:
new_item_tags = ['funk', 'political', 'alternative rock', 'nu metal', 'funk rock']

In [49]:
from sklearn.metrics.pairwise import cosine_similarity

# extract item similarities
item_biases, item_embeddings = longer.get_item_representations()

item_to_item = pd.DataFrame(cosine_similarity(item_embeddings))

similar = item_to_item[rhcp_id].sort_values(ascending=False).head(20)



In [50]:
# Get the names of the top 10 similar artists
similar_artists = artists.set_index('id').loc[similar.index]['name'].values

similar_artists


array(['Red Hot Chili Peppers', 'Bob Dylan', 'Kelly Clarkson',
       "The Swingin' Neckbreakers", 'Kleitman, Michael', 'Maxwell',
       '松浦亜弥', 'Acoustic brasil', 'Gladiator Theme', 'Linkin Park',
       'Local Natives', 'Gareth Emery', 'John Lee Hooker',
       'Good Charlotte', 'Lazy Bitches', 'Buddy Guy & Junior Wells',
       'My Chemical Romance', 'Mooncake', 'Tamia', 'Wintersun'],
      dtype=object)

In [44]:
# Do it again but remove features
model = LightFM(loss='warp')
model.fit(train, epochs=20, num_threads=4, verbose=True)

item_biases, item_embeddings = model.get_item_representations()
item_to_item = pd.DataFrame(cosine_similarity(item_embeddings))
similar = item_to_item[rhcp_id].sort_values(ascending=False).head(10)
print(artists.set_index('id').loc[similar.index]['name'].values)


Epoch: 100%|██████████| 20/20 [01:03<00:00,  3.18s/it]


['Red Hot Chili Peppers' 'Delta Goodrem' 'Tamia' 'E-40'
 'Death Cab for Cutie' 'Animal Collective' 'Richard Marx' 'Lea Michele'
 'Unlord' 'Local Natives']
