In [1]:
import pandas as pd

artists = pd.read_csv('dataset/remapped/artists.csv')
tags = pd.read_csv('dataset/remapped/tags.csv')
user_artists = pd.read_csv('dataset/remapped/user_artists.csv')
user_tags = pd.read_csv('dataset/remapped/user_tags.csv')

print(f"Original length: {len(user_artists)}")
user_artists.head()

Original length: 92834


Unnamed: 0,userID,artistID,weight
0,0,45,13883
1,0,46,11690
2,0,47,11351
3,0,48,10300
4,0,49,8983


In [2]:
import numpy as np
from scipy.sparse import csr_matrix

# Assuming user_tags, tags, and artists DataFrames are already loaded

# 1. Get the distribution of tags for each artist
artist_tag_distribution = (
    user_tags.groupby(['artistID', 'tagID'])
    .size()
    .unstack(fill_value=0)  # Converts to wide format with tagIDs as columns
)
artist_tag_distribution.columns.name = None  # Remove column name for clarity
artist_tag_distribution.reset_index(inplace=True)  # Make artistID a regular column

artist_tag_distribution.shape

(12133, 9719)

In [3]:

# 2. Build bag-of-words embeddings
# Merge `user_tags` with `tags` to get the tag values
user_tags_with_values = user_tags.merge(tags, how='left', left_on='tagID', right_on='tagID')

# Combine all tag values for each artist
artist_bow = user_tags_with_values.groupby('artistID')['tagValue'].apply(
    lambda x: ' '.join(map(str, x))
).reset_index()


artist_bow.head()

Unnamed: 0,artistID,tagValue
0,0,weeabo jrock j-rock visual kei better than lad...
1,1,german seen live darkwave industrial german ge...
2,2,black metal black metal norwegian black metal ...
3,3,j-rock visual kei metal gothic japanese bazaro...
4,4,gothic gothic rock darkwave darkwave deathrock...


In [4]:

# Vectorize using CountVectorizer to build bag-of-words embeddings
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(artist_bow['tagValue'])

embedding_dim = len(vectorizer.get_feature_names_out())
print(f"Embedding dimension: {embedding_dim}")

Embedding dimension: 7592


In [68]:
# Create a DataFrame with bag-of-words embeddings
artist_embeddings = pd.DataFrame.sparse.from_spmatrix(
    bow_matrix, columns=vectorizer.get_feature_names_out(), index=artist_bow['artistID']
)

artist_embeddings.shape


(12133, 7592)

In [71]:
# Normalize rows in the embedding matrix
artist_embeddings = artist_embeddings.div(artist_embeddings.sum(axis=0), axis=1)

In [72]:
artist_embeddings.shape

(12133, 7592)

In [73]:
# 3. Build a sparse matrix of shape (artist_count, embedding_dim)
# Ensure all artists have rows in the sparse matrix
all_artists = artists[['id']].rename(columns={'id': 'artistID'})
artist_embeddings_full = all_artists.merge(artist_embeddings, how='left', on='artistID').fillna(0)

# Convert to sparse matrix
embedding_sparse_matrix = csr_matrix(artist_embeddings_full.drop(columns=['artistID']).values)

# Output results
print("Tag distribution by artist:")
print(artist_tag_distribution)

print("\nBag-of-Words Embeddings for artists:")
print(artist_embeddings)

print("\nSparse matrix shape (artist_count x embedding_dim):", embedding_sparse_matrix.shape)


Tag distribution by artist:
       artistID  0  1  2  3  4  5  6  7  8  ...  11935  11936  11937  11938  \
0             0  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
1             1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
2             2  0  0  0  3  0  0  0  0  0  ...      0      0      0      0   
3             3  2  0  0  0  0  0  1  0  0  ...      0      0      0      0   
4             4  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
...         ... .. .. .. .. .. .. .. .. ..  ...    ...    ...    ...    ...   
12128     17623  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
12129     17625  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
12130     17626  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
12131     17627  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
12132     17630  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   

       11939  11940  11

In [74]:
def get_artist_name(artist_id):
    return artists.loc[artists['id'] == artist_id, 'name'].values[0]

In [75]:
from sklearn.metrics.pairwise import cosine_similarity

def get_top_k_similar_artists(artist_id, embeddings, artist_index_map, k=5):
    """
    Get the top K most similar artists to the given artist_id.

    Parameters:
    - artist_id (int): The ID of the artist to find similar artists for.
    - embeddings (csr_matrix): The sparse matrix containing artist embeddings.
    - artist_index_map (dict): A mapping of artist IDs to row indices in the embeddings matrix.
    - k (int): The number of similar artists to return.

    Returns:
    - List of tuples: [(artist_id, similarity), ...] for the top K similar artists.
    """
    # Ensure the artist ID is in the index map
    if artist_id not in artist_index_map:
        raise ValueError(f"Artist ID {artist_id} not found in the embeddings.")

    # Get the index of the given artist
    artist_idx = artist_index_map[artist_id]
    
    # Compute cosine similarity between the given artist and all others
    artist_vector = embeddings[artist_idx]  # Sparse row for the given artist
    similarities = cosine_similarity(artist_vector, embeddings).flatten()
    
    # Get the top K similar indices (excluding itself)
    similar_indices = similarities.argsort()[::-1]  # Sort in descending order
    top_k_indices = [idx for idx in similar_indices if idx != artist_idx][:k]
    
    # Map indices back to artist IDs and return their similarities
    index_artist_map = {v: k for k, v in artist_index_map.items()}
    top_k_artists = [(index_artist_map[idx], similarities[idx]) for idx in top_k_indices]

    return top_k_artists

# Create a mapping of artist IDs to their row indices in the embeddings matrix
artist_index_map = {artist_id: idx for idx, artist_id in enumerate(artist_embeddings_full['artistID'])}

# Example: Get the top 5 similar artists for a specific artist
example_artist_id = 221
top_k_similar_artists = get_top_k_similar_artists(example_artist_id, embedding_sparse_matrix, artist_index_map, k=5)

# Output results
print(f"Top similar artists to artist {example_artist_id}:")
for artist_id, similarity in top_k_similar_artists:
    print(f"Artist: {get_artist_name(artist_id)}, Similarity: {similarity:.4f}")


Top similar artists to artist 221:
Artist: The Kills, Similarity: 0.0268
Artist: Madonna, Similarity: 0.0259
Artist: The Cult, Similarity: 0.0250
Artist: Kasabian, Similarity: 0.0248
Artist: Michelle Williams, Similarity: 0.0201


In [76]:
def get_tag_name(tag_id):
    return tags.loc[tags['tagID'] == tag_id, 'tagValue'].values[0]

In [77]:
def suggest_tags_for_artist(artist_id, embeddings, artist_index_map, artist_tag_distribution, tag_names, k=5, top_tags=5):
    """
    Suggest tags for a given artist based on similar artists' tags.

    Parameters:
    - artist_id (int): The ID of the artist to suggest tags for.
    - embeddings (csr_matrix): The sparse matrix containing artist embeddings.
    - artist_index_map (dict): A mapping of artist IDs to row indices in the embeddings matrix.
    - artist_tag_distribution (pd.DataFrame): Tag distribution (artist x tags).
    - tag_names (pd.DataFrame): Mapping of tag IDs to tag names.
    - k (int): Number of similar artists to consider for tag suggestions.
    - top_tags (int): Number of tags to suggest.

    Returns:
    - List of tuples: [(tag_id, tag_name, relevance_score), ...] for the suggested tags.
    """
    # Ensure the artist ID is in the index map
    if artist_id not in artist_index_map:
        raise ValueError(f"Artist ID {artist_id} not found in the embeddings.")
    
    # Get the top K similar artists
    similar_artists = get_top_k_similar_artists(artist_id, embeddings, artist_index_map, k=k)
    
    # Aggregate tag frequencies from similar artists
    tag_scores = pd.Series(dtype=float)
    for similar_artist_id, similarity in similar_artists:
        if similar_artist_id in artist_tag_distribution['artistID'].values:
            similar_tags = artist_tag_distribution.loc[
                artist_tag_distribution['artistID'] == similar_artist_id
            ].drop(columns=['artistID']).iloc[0]
            # Weighted score for tags based on similarity
            tag_scores = tag_scores.add(similar_tags * similarity, fill_value=0)
    
    # Sort tags by their weighted scores
    suggested_tags = tag_scores.sort_values(ascending=False).head(top_tags)
    
    return suggested_tags.items()

suggested_tags = suggest_tags_for_artist(
    example_artist_id, embedding_sparse_matrix, artist_index_map, artist_tag_distribution, tags, k=10, top_tags=10
)

# Output results
print(f"Suggested tags for artist {example_artist_id}:")
for tag_id, score in suggested_tags:
    tag_name = get_tag_name(tag_id)
    print(f"Tag: {tag_name}, Score: {score:.4f}")

Suggested tags for artist 221:
Tag: pop, Score: 2.2756
Tag: dance, Score: 1.7218
Tag: female vocalists, Score: 1.4053
Tag: indie, Score: 1.2302
Tag: 80s, Score: 1.0840
Tag: electronic, Score: 1.0715
Tag: rock, Score: 0.9333
Tag: alternative, Score: 0.8244
Tag: indie rock, Score: 0.7238
Tag: 90s, Score: 0.6729


In [78]:
from sklearn.decomposition import TruncatedSVD

def reduce_embedding_dimension(embeddings, target_dim=200):
    """
    Reduce the dimension of embeddings to a specified size using Truncated SVD.

    Parameters:
    - embeddings (csr_matrix): Sparse matrix of artist embeddings.
    - target_dim (int): The desired number of dimensions for the reduced embeddings.

    Returns:
    - reduced_embeddings (np.ndarray): Dense array of reduced embeddings.
    """
    # Initialize Truncated SVD
    svd = TruncatedSVD(n_components=target_dim, random_state=42)
    
    # Fit and transform the sparse matrix
    reduced_embeddings = svd.fit_transform(embeddings)
    
    # Explained variance ratio to ensure the quality of reduction
    explained_variance = np.sum(svd.explained_variance_ratio_)
    print(f"Explained variance after reduction: {explained_variance:.4f}")
    
    return reduced_embeddings


# Reduce the embedding space to 200 dimensions
reduced_embeddings = reduce_embedding_dimension(embedding_sparse_matrix, target_dim=200)

# Example: Get reduced embedding for artist ID 2
artist_idx = artist_index_map[example_artist_id]
artist_reduced_embedding = reduced_embeddings[artist_idx]

# Output results
print(f"Reduced embedding for artist ID {example_artist_id}:")
print(artist_reduced_embedding)
print(f"Shape of reduced embedding matrix: {reduced_embeddings.shape}")


Explained variance after reduction: 0.3434
Reduced embedding for artist ID 221:
[ 3.27663044e-04  1.71340024e-04  8.01349483e-04  2.24470574e-04
 -2.39077543e-05  7.84174424e-04  7.50172173e-06  3.10852493e-05
 -1.06969254e-04  1.47692868e-02  2.01094398e-05  6.54847324e-02
 -5.67610833e-04 -1.07471263e-04 -2.26894343e-05 -4.06669174e-05
 -5.86726854e-06 -2.66673411e-03  7.07894524e-05 -9.30910553e-05
 -2.82091042e-03 -2.23281883e-04 -3.64230936e-05  1.08697561e-04
  1.37487257e-04  2.04007434e-05  6.30574062e-04 -1.77361495e-04
 -3.89065732e-04  1.53142479e-03 -1.02622045e-03  3.40753774e-04
 -1.00099655e-04 -9.02488724e-06  5.52115843e-04  1.41962368e-04
  3.41582591e-04 -1.04737328e-03  1.53686872e-03  1.05392071e-03
  5.81825134e-04  7.36540313e-04 -1.07418517e-03 -1.71384826e-04
  7.44782736e-04  7.63983428e-03  1.83397449e-03  9.95477691e-04
 -1.95255781e-03 -2.76314600e-04  1.52857681e-03  1.57823883e-03
  1.64991050e-03 -1.55239587e-03 -1.18222188e-03 -1.49817291e-03
  3.109647

In [79]:
from scipy.sparse import coo_matrix

# Create a sparse matrix for user-artist interactions
user_artist_matrix = coo_matrix(
    (user_artists['weight'], (user_artists['userID'], user_artists['artistID']))
)

# Output the shape of the matrix
print(f"Sparse matrix shape: {user_artist_matrix.shape}")


Sparse matrix shape: (1892, 17632)


In [80]:
from lightfm.data import Dataset

# Initialize the Dataset object
dataset = Dataset(user_identity_features=False, item_identity_features=False)

# Fit the dataset with users and items
# Specify the number of users and items based on the user_artist_matrix
num_users, num_artists = user_artist_matrix.shape
dataset.fit(
    range(num_users),  # User IDs
    range(num_artists)  # Artist IDs
)

# Build interactions and weights matrices
(interactions, weights) = dataset.build_interactions(
    [(row['userID'], row['artistID'], row['weight']) for _, row in user_artists.iterrows()]
)

# Output the shape of the interactions matrix
print(f"Interactions matrix shape: {interactions.shape}")

Interactions matrix shape: (1892, 17632)


In [81]:
from lightfm.cross_validation import random_train_test_split

# Split the interactions into training and testing datasets
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)
trainweighted, testweighted = random_train_test_split(weights, test_percentage=0.2, random_state=42)

# Output the number of interactions in train and test
print(f"Training interactions: {train.getnnz()}")
print(f"Testing interactions: {test.getnnz()}")

Training interactions: 74267
Testing interactions: 18567


In [82]:
from scipy.sparse import csr_matrix

# Ensure the embeddings have the same number of items as in the dataset
item_features = csr_matrix(embedding_sparse_matrix)

# Check the shape of the item_features
print(f"Item features shape: {item_features.shape}")

Item features shape: (17632, 7592)


In [83]:
from lightfm import LightFM

# Initialize the LightFM model
embedding_dim = embedding_sparse_matrix.shape[1]
model = LightFM(no_components=20, loss='warp')

# Train the model for one epoch
model.fit(train, item_features=item_features, epochs=10, num_threads=4)

print("Model training complete.")

Model training complete.


In [87]:
# Evaluate performance
from lightfm.evaluation import auc_score

# Compute and print the AUC score
train_auc = auc_score(model, train, item_features=item_features, num_threads=4).mean()
test_auc = auc_score(model, test, item_features=item_features, num_threads=4).mean()

print(f"Train AUC: {train_auc:.4f}")
print(f"Test AUC: {test_auc:.4f}")

Train AUC: 0.8843
Test AUC: 0.8550


: 

In [84]:
# Get embeddings from the trained model
item_biases, item_embeddings = model.get_item_representations(features=item_features)

In [43]:
# Get similar artists as above using resulting embeddings
def get_top_k_similar_artists_lightfm(model, artist_id, k=5):
    """
    Get the top K most similar artists to the given artist_id using the LightFM model.

    Parameters:
    - model (LightFM): The trained LightFM model.
    - artist_id (int): The ID of the artist to find similar artists for.
    - k (int): The number of similar artists to return.

    Returns:
    - List of tuples: [(artist_id, similarity), ...] for the top K similar artists.
    """
    # Get the index of the given artist
    artist_idx = artist_index_map[artist_id]
    
    # Compute artist embeddings
    artist_embedding = model.get_item_representations(features=item_features)[1]
    print(artist_embedding.shape)
    
    # Compute cosine similarity between the given artist and all others
    similarities = cosine_similarity([artist_embedding[artist_idx]], artist_embedding).flatten()
    # Get the top K similar indices (excluding itself)
    similar_indices = similarities.argsort()[::-1]  # Sort in descending order
    top_k_indices = [idx for idx in similar_indices if idx != artist_idx][:k]

    # Map indices back to artist IDs and return their similarities
    index_artist_map = {v: k for k, v in artist_index_map.items()}
    top_k_artists = [(index_artist_map[idx], similarities[idx]) for idx in top_k_indices]


    return top_k_artists

In [45]:
similar = get_top_k_similar_artists_lightfm(model, example_artist_id, k=5)
print(similar)

(17632, 20)
[ 0.31058118 -0.07600567 -0.45401764 ...  0.         -0.11866216
  0.        ] [1405, 727, 1795, 505, 593]
0.9812064
[(1405, 0.9812064), (727, 0.9771702), (1795, 0.9683263), (505, 0.96734214), (593, 0.95420647)]


In [46]:
similarNames = [get_artist_name(artist_id) for artist_id, _ in similar]
similarScores = [score for _, score in similar]

# Output results
print(f"Top similar artists to artist {example_artist_id}:")
print(similarNames)

Top similar artists to artist 221:
['Paul McCartney', 'John Lennon', 'T. Rex', 'U2', 'David Bowie']


In [86]:
def get_tag_id_by_name(tag_name):
    return tags.loc[tags['tagValue'] == tag_name, 'tagID'].values[0]

def get_cold_start_similar_artists(model,item_tags, importance_schema='equal'):
    importance_weights = []
    try:
        item_tag_ids = [get_tag_id_by_name(tag) for tag in item_tags]
    except Exception as e:
        raise ValueError(f"Tag not found: {e}")
    
    if importance_schema == 'equal':
        importance_weights = np.ones(shape=len(item_tags))
    elif importance_schema == 'scaled':
        importance_weights = [len(item_tags) - i for i in range(len(item_tags))]
    else:
        raise ValueError(f'schema not found: {importance_schema}')

    normalized_weights = [weight / sum(importance_weights) for weight in importance_weights]

    new_embedding = np.zeros(shape=embedding_dim)
    for tag_id, weight in zip(item_tag_ids, normalized_weights):
        new_embedding[tag_id] = weight

    new_item_sparse = csr_matrix(new_embedding)

    cold_bias, cold_embedding = model.get_item_representations(new_item_sparse)
    sim = pd.DataFrame(cosine_similarity(cold_embedding, item_embeddings).T, columns=(["cosine"]))
    
    sim = sim.sort_values(by="cosine", ascending=False).head(5)

    # Add the artist names to the DataFrame
    sim['artistID'] = sim.index
    sim['artistName'] = sim['artistID'].apply(get_artist_name)

    return sim

new_item_tags = ['rock', 'punk', 'alternative']
similar_cold_start = get_cold_start_similar_artists(model, new_item_tags, importance_schema='scaled')
print(similar_cold_start)



         cosine  artistID              artistName
2319   0.909862      2319           Levent Yüksel
16550  0.894124     16550            Shawn Colvin
9645   0.817164      9645  Die Fantastischen Vier
2307   0.804606      2307             Zerrin Özer
2286   0.796336      2286             Ajda Pekkan


In [93]:
new_item_tags = ['rock', '00s', 'psychedelic', 'classic rock', 'hip hop']
importance_weights = [len(new_item_tags) - i for i in range(len(new_item_tags))]

new_item_tag_ids = [get_tag_id_by_name(tag) for tag in new_item_tags]
new_item_tag_ids

[72, 209, 87, 191, 303]

In [94]:
normalized_weights = [weight / sum(importance_weights) for weight in importance_weights]

# Create the embedding for the new item with weighted tags
new_item = np.zeros((1, embedding_dim))
for tag_id, weight in zip(new_item_tag_ids, normalized_weights):
    new_item[0, tag_id] = weight

assert new_item.shape[1] == embedding_dim
assert np.isclose(new_item.sum(), 1.0)



In [95]:
# convert t to a sparse matrix
new_item_sparse = csr_matrix(new_item)

cold_bias, cold_embedding = model.get_item_representations(new_item_sparse)

In [102]:
sim = pd.DataFrame(cosine_similarity(cold_embedding, item_embeddings).T, columns=(["cosine"]))

In [103]:
sim = sim.sort_values(by="cosine", ascending=False).head(5)

# Add the artist names to the DataFrame
sim['artistID'] = sim.index
sim['artistName'] = sim['artistID'].apply(get_artist_name)

sim

Unnamed: 0,cosine,artistID,artistName
7136,0.953377,7136,Erin McCarley
3608,0.95042,3608,Angus & Julia Stone
12108,0.946691,12108,Ximena Sariñana
10412,0.945458,10412,The Sound of Arrows
5853,0.940304,5853,Neon Trees
