In [1]:
import pandas as pd

artists = pd.read_csv('dataset/remapped/artists.csv')
tags = pd.read_csv('dataset/remapped/tags.csv')
user_artists = pd.read_csv('dataset/remapped/user_artists.csv')
user_tags = pd.read_csv('dataset/remapped/user_tags.csv')

print(f"Original length: {len(user_artists)}")
artists.head()

Original length: 92834


Unnamed: 0,id,name,url,pictureURL
0,0,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
1,1,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
2,2,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...
3,3,Moi dix Mois,http://www.last.fm/music/Moi+dix+Mois,http://userserve-ak.last.fm/serve/252/54697835...
4,4,Bella Morte,http://www.last.fm/music/Bella+Morte,http://userserve-ak.last.fm/serve/252/14789013...


In [2]:
from sklearn.preprocessing import MinMaxScaler
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from scipy.sparse import coo_matrix

# Create interactions with normalized weights
dataset = Dataset()
dataset.fit(users=user_artists['userID'].unique(),
            items=user_artists['artistID'].unique())

(interactions, weights) = dataset.build_interactions(
    zip(user_artists['userID'], user_artists['artistID'], user_artists['weight'])
)

# Recreate sparse weight matrix directly from the weights
weights_sparse = coo_matrix(
    (weights.data, (weights.row, weights.col)),
    shape=interactions.shape
)

assert interactions.shape == weights_sparse.shape, "Shapes of interactions and weights must match!"
assert interactions.nnz == weights_sparse.nnz, "Number of non-zero elements must match!"

print(interactions.nnz, weights_sparse.nnz)


print(weights_sparse)


92834 92834
<COOrdinate sparse matrix of dtype 'float32'
	with 92834 stored elements and shape (1892, 17632)>
  Coords	Values
  (0, 0)	13883.0
  (0, 1)	11690.0
  (0, 2)	11351.0
  (0, 3)	10300.0
  (0, 4)	8983.0
  (0, 5)	6152.0
  (0, 6)	5955.0
  (0, 7)	4616.0
  (0, 8)	4337.0
  (0, 9)	4147.0
  (0, 10)	3923.0
  (0, 11)	3782.0
  (0, 12)	3735.0
  (0, 13)	3644.0
  (0, 14)	3579.0
  (0, 15)	3312.0
  (0, 16)	3301.0
  (0, 17)	2927.0
  (0, 18)	2720.0
  (0, 19)	2686.0
  (0, 20)	2654.0
  (0, 21)	2619.0
  (0, 22)	2584.0
  (0, 23)	2547.0
  (0, 24)	2397.0
  :	:
  (1891, 7993)	284.0
  (1891, 7995)	650.0
  (1891, 7996)	456.0
  (1891, 7997)	1068.0
  (1891, 7999)	626.0
  (1891, 8000)	613.0
  (1891, 8005)	655.0
  (1891, 8017)	640.0
  (1891, 8174)	232.0
  (1891, 8178)	429.0
  (1891, 8180)	607.0
  (1891, 8182)	724.0
  (1891, 9413)	793.0
  (1891, 9601)	228.0
  (1891, 17624)	705.0
  (1891, 12989)	278.0
  (1891, 12991)	346.0
  (1891, 17625)	535.0
  (1891, 15542)	443.0
  (1891, 17626)	758.0
  (1891, 17627)	337.0


In [3]:

# Split into train and test
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)
train_weights, test_weights = random_train_test_split(weights_sparse, test_percentage=0.2, random_state=42)



In [8]:
from sklearn.preprocessing import MinMaxScaler
# Initialize the model
model = LightFM(loss='warp', item_alpha=1e-6, user_alpha=1e-6)

# Normalizzeweights


# Train with weights
EPOCHS = 10
THREADS = 4
model.fit(train, sample_weight=train_weights, epochs=EPOCHS, num_threads=THREADS)


<lightfm.lightfm.LightFM at 0x7f8ff6f95fd0>

In [5]:
from lightfm.evaluation import precision_at_k, recall_at_k
precision = precision_at_k(model,train, k=10, num_threads=THREADS).mean()
recall = recall_at_k(model,train, k=10, num_threads=THREADS).mean()

precision_test = precision_at_k(model,test, k=10, num_threads=THREADS).mean()
recall_test = recall_at_k(model,test, k=10, num_threads=THREADS).mean()

NameError: name 'model' is not defined

In [6]:
print(f"Train precision@10: {precision}")
print(f"Train recall@10: {recall}")
print(f"Test precision@10: {precision_test}")
print(f"Test recall@10: {recall_test}")
precision_at_k

NameError: name 'precision' is not defined

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

# extract item similarities
item_biases, item_embeddings = model.get_item_representations()

item_to_item = pd.DataFrame(cosine_similarity(item_embeddings))

rhcp_id = artists[artists['name'] == 'Red Hot Chili Peppers']['id'].values[0]
similar = item_to_item[rhcp_id].sort_values(ascending=False).head(20)

In [47]:
# Get the names of the top 10 similar artists
similar_artists = artists.set_index('id').loc[similar.index]['name'].values

similar_artists

array(['Red Hot Chili Peppers', "2 Many DJ's", 'Dealema', 'Plies',
       'George Jones', 'The Psychedelic Furs', 'Kate Voegele', 'Common',
       'Dulce María', 'Coconut Records', 'Astor Piazzolla',
       'The Spill Canvas', 'Tequila Baby', 'Velcra', 'Skeletal Family',
       'Laith Al-Deen', 'Masacre', 'Fjordne', 'Animal Collective',
       'Tynisha Keli'], dtype=object)

In [4]:

# Merge user_tags with tags to get artistID, tagID, and count tag occurrences
tag_data = pd.merge(user_tags, tags, on='tagID')
tag_data = tag_data.groupby(['artistID', 'tagID']).size().reset_index(name='count')

# Map artistID and tagID to consecutive indices based on the dataset
artist_mapping = {artist: idx for idx, artist in enumerate(user_artists['artistID'].unique())}
tag_mapping = {tag: idx for idx, tag in enumerate(tags['tagID'].unique())}

# Map artistID and tagID to indices
tag_data['artist_idx'] = tag_data['artistID'].map(artist_mapping)
tag_data['tag_idx'] = tag_data['tagID'].map(tag_mapping)

# Build the sparse matrix (artist x tag)
item_features = coo_matrix(
    (tag_data['count'], (tag_data['artist_idx'], tag_data['tag_idx'])),
    shape=(len(artist_mapping), len(tag_mapping))
)

In [5]:
print(tag_data.shape, interactions.shape, tags.shape, artists.shape, item_features.shape)

(108437, 5) (1892, 17632) (11946, 2) (17632, 4) (17632, 11946)


In [25]:
model = LightFM(loss='warp', item_alpha=1e-6, user_alpha=1e-6)

# Train with item features
model.fit(train,
          item_features=item_features,
          sample_weight=train_weights,
          epochs=1,
          num_threads=4)

<lightfm.lightfm.LightFM at 0x7ff0adbabb00>

In [None]:
# Evaluate the model
from lightfm.evaluation import precision_at_k, recall_at_k
precision = precision_at_k(model, train, item_features=item_features, k=10, num_threads=4).mean()
recall = recall_at_k(model, train, item_features=item_features, k=10, num_threads=8).mean()

print(f"Precision: {precision}")
print(f"Recall: {recall}")

In [6]:
print(train.shape, test.shape)

(1892, 17632) (1892, 17632)


In [8]:
# Build a bigger model with 30 components
# Fit partially to display progress

model_base = LightFM(loss='warp',item_alpha=1e-6, user_alpha=1e-6, no_components=10)
model_features = LightFM(loss='warp', item_alpha=1e-6, user_alpha=1e-6, no_components=30)
model_noweights = LightFM(loss='warp', item_alpha=1e-6, user_alpha=1e-6, no_components=30)
model_noweights_features = LightFM(loss='warp', item_alpha=1e-6, user_alpha=1e-6, no_components=30)

EPOCHS = 20
THREADS = 4

model_base.fit(train, sample_weight=train_weights, epochs=EPOCHS, num_threads=THREADS,verbose=True)
model_features.fit(train, item_features=item_features, sample_weight=train_weights, epochs=EPOCHS, num_threads=THREADS,verbose=True)
model_noweights.fit(train, epochs=EPOCHS, num_threads=THREADS,verbose=True)
model_noweights_features.fit(train, item_features=item_features, epochs=EPOCHS, num_threads=THREADS,verbose=True)

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch: 100%|██████████| 20/20 [00:34<00:00,  1.71s/it]
Epoch: 100%|██████████| 20/20 [01:52<00:00,  5.61s/it]
Epoch: 100%|██████████| 20/20 [00:42<00:00,  2.13s/it]
Epoch: 100%|██████████| 20/20 [02:06<00:00,  6.34s/it]


<lightfm.lightfm.LightFM at 0x7f79e73f4440>

In [9]:
# Calculate precision and AUC score
from lightfm.evaluation import precision_at_k, auc_score

precision_base = precision_at_k(model_base, test, train_interactions=train, k=10, num_threads=THREADS).mean()
auc_base = auc_score(model_base, test, train_interactions=train, num_threads=THREADS).mean()

precision_features = precision_at_k(model_features, test, item_features=item_features, train_interactions=train, k=10, num_threads=THREADS).mean()
auc_features = auc_score(model_features, test, item_features=item_features, train_interactions=train, num_threads=THREADS).mean()

precision_noweights = precision_at_k(model_noweights, test, train_interactions=train, k=10, num_threads=THREADS).mean()
auc_noweights = auc_score(model_noweights, test, train_interactions=train, num_threads=THREADS).mean()

precision_noweights_features = precision_at_k(model_noweights_features, test, item_features=item_features, train_interactions=train, k=10, num_threads=THREADS).mean()
auc_noweights_features = auc_score(model_noweights_features, test, item_features=item_features, train_interactions=train, num_threads=THREADS).mean()

In [10]:
# Print results
print(f"Base model - Precision@10: {precision_base}, AUC: {auc_base}")
print(f"Features model - Precision@10: {precision_features}, AUC: {auc_features}")
print(f"No weights model - Precision@10: {precision_noweights}, AUC: {auc_noweights}")
print(f"No weights, features model - Precision@10: {precision_noweights_features}, AUC: {auc_noweights_features}")

Base model - Precision@10: 0.1335991621017456, AUC: 0.8645904064178467
Features model - Precision@10: 0.014088250696659088, AUC: 0.5344148874282837
No weights model - Precision@10: 0.1514088213443756, AUC: 0.8739895820617676
No weights, features model - Precision@10: 0.01105794869363308, AUC: 0.5017257928848267


In [110]:
import numpy as np
def get_similar_tag_ids(model,tag_id,k=3, user_tags=None,min_count=10):

    tag_embeddings = (model.item_embeddings.T / np.linalg.norm(model.item_embeddings, axis=1)).T

    query_embedding = tag_embeddings[tag_id]
    similarity = np.dot(tag_embeddings, query_embedding)

    # Filter out tags with low counts
    if user_tags is not None:
        tag_counts = user_tags.groupby('tagID').size()
        tag_counts = tag_counts[tag_counts >= min_count]
        print(f"Filtered out {len(tags) - len(tag_counts)} tags with less than {min_count} occurrences")
        similarity = similarity[tag_counts.index]

    most_similar = np.argsort(-similarity)[1:k+1]

    return most_similar

tag_id = 72
# Print tag name
tag_name = tags[tags['tagID'] == tag_id]['tagValue'].values[0]
print(f"Tag: {tag_name}")
similar_tag_ids = get_similar_tag_ids(model_features, tag_id, user_tags=user_tags,k=10, min_count=100)
similar_tag_ids


Tag: rock
Filtered out 11720 tags with less than 100 occurrences


array([209,  57,  65,  39,  17,  74, 186, 106, 190,  63])

In [111]:
# Print the similar tags
similar_tags = tags.set_index('tagID').loc[similar_tag_ids]['tagValue'].values
similar_tags

array(['00s', 'halbischt', 'abstract', 'gregorian chant', 'electronic',
       'psychedelic rock', 'electronica', 'kizomba', 'instrumental',
       'mille plateaux'], dtype=object)

In [146]:
item_features.nnz

# Get features for the rock column
rock_idx = tag_mapping[72]
print(tags[tags['tagID'] == rock_idx])

item_biases, item_embeddings = model_features.get_item_representations(features=item_features)

item_embeddings.shape

    tagID tagValue
72     72     rock


(17632, 30)

In [130]:
# Now, get the artists most similar to a specific tag
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_artists_to_tag(model, tag_id, item_features, k=3, user_artists=None, min_count=100):
    tag_embeddings = model.item_embeddings
    item_biases, item_embeddings = model.get_item_representations(features=item_features)
    query_embedding = item_embeddings.getcol(tag_mapping[tag_id]).toarray().flatten()

    # Calculate cosine similarity
    similarity = cosine_similarity(item_embeddings.T, [query_embedding])

    



    # Filter out artists with low counts
    if user_artists is not None:
        artist_counts = user_artists.groupby('artistID').size()
        artist_counts = artist_counts[artist_counts >= min_count]
        print(f"Filtered out {len(artists) - len(artist_counts)} artists with less than {min_count} occurrences")
        item_embeddings = item_embeddings[artist_counts.index]

        print(item_embeddings.shape, query_embedding)

    
    return most_similar

similar_artists = get_similar_artists_to_tag(model_features, tag_id, item_features, k=10, user_artists=user_artists, min_count=10)
similar_artists

AttributeError: 'numpy.ndarray' object has no attribute 'getcol'

In [113]:
# Get the names for similar atists
artist_names = artists.set_index('id').loc[similar_artists.index]['name'].values

# Get tag name
tag_name = tags[tags['tagID'] == tag_id]['tagValue'].values[0]
print(f"Artists similar to tag '{tag_name}':")
artist_names


Artists similar to tag 'rock':


array(['Marilyn Manson', 'U2', 'Demi Lovato', 'The Veronicas',
       'Glee Cast', 'Pearl Jam', 'David Bowie', 'Adam Lambert',
       'Nicole Scherzinger', 'Fall Out Boy'], dtype=object)

In [211]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def suggest_similar_tags(idx, model, item_features, user_tags, tags, top_k=10, min_count=1000):
    """
    Suggest similar tags for a chosen model index.

    Args:
    - idx: Index of the chosen model in the dataset.
    - model: The trained LightFM model.
    - item_features: Sparse matrix of item features.
    - user_tags: DataFrame with user tags.
    - tags: DataFrame with tagID and tagValue.
    - top_k: Number of similar tags to suggest.
    - min_count: Minimum count of tag occurrences to consider.

    Returns:
    - List of suggested tags.
    """
    # Precompute tag frequency and filter
    tag_counts = user_tags.groupby('tagID').size()
    tag_counts = tag_counts[tag_counts >= min_count]
    valid_tags = set(tag_counts.index)

    # Map tagID to tagValue for fast lookup
    tag_mapping = dict(zip(tags['tagID'], tags['tagValue']))

    # Get tags already associated with the chosen model
    filter_tags = set(item_features.getrow(idx).indices)

    # Get the item's representation
    item_representation = item_features.getrow(idx).dot(model.item_embeddings)

    # Normalize item embeddings for cosine similarity
    normalized_embeddings = model.item_embeddings / np.linalg.norm(model.item_embeddings, axis=1, keepdims=True)
    sims = cosine_similarity(item_representation, normalized_embeddings).flatten()

    # Sort indices by similarity (highest to lowest)
    recs = np.argsort(-sims)

    # Generate tag suggestions
    suggested_tags = []
    for offset_idx in recs:
        if len(suggested_tags) >= top_k:
            break
        # Skip tags already associated with the model
        if offset_idx in filter_tags:
            continue
        # Skip tags that don't meet the frequency threshold
        if offset_idx not in valid_tags:
            continue
        # Append valid tag suggestions
        if offset_idx in tag_mapping:
            suggested_tags.append(tag_mapping[offset_idx])

    return suggested_tags

# Example Usage
idx = 221  # Example model index
suggested_tags = suggest_similar_tags(idx, model_features, item_features, user_tags, tags)

# Display the results
print(f'Suggested Tags for artist {idx}:')
print(suggested_tags)


Suggested Tags for artist 221:
['experimental', 'new wave', 'pop', 'dance', 'hip-hop', 'singer-songwriter', 'electronic', 'progressive rock', 'british', 'alternative rock']
