In [1]:
import pandas as pd

artists = pd.read_csv('dataset/remapped/artists.csv')
tags = pd.read_csv('dataset/remapped/tags.csv')
user_artists = pd.read_csv('dataset/remapped/user_artists.csv')
user_tags = pd.read_csv('dataset/remapped/user_tags.csv')

print(f"Original length: {len(user_artists)}")
user_artists.head()

Original length: 92834


Unnamed: 0,userID,artistID,weight
0,0,45,13883
1,0,46,11690
2,0,47,11351
3,0,48,10300
4,0,49,8983


In [2]:
import numpy as np
from scipy.sparse import csr_matrix

# Assuming user_tags, tags, and artists DataFrames are already loaded

# Get the distribution of tags for each artist
artist_tag_distribution = (
    user_tags.groupby(['artistID', 'tagID'])
    .size()
    .unstack(fill_value=0)  # Converts to wide format with tagIDs as columns
)
artist_tag_distribution.columns.name = None  # Remove column name for clarity
artist_tag_distribution.reset_index(inplace=True)  # Make artistID a regular column

artist_tag_distribution.shape

(12133, 9719)

In [3]:

# 2. Build bag-of-words embeddings
# Merge `user_tags` with `tags` to get the tag values
user_tags_with_values = user_tags.merge(tags, how='left', left_on='tagID', right_on='tagID')

# Combine all tag values for each artist
artist_bow = user_tags_with_values.groupby('artistID')['tagValue'].apply(
    lambda x: ' '.join(map(str, x))
).reset_index()


artist_bow.head()

Unnamed: 0,artistID,tagValue
0,0,weeabo jrock j-rock visual kei better than lad...
1,1,german seen live darkwave industrial german ge...
2,2,black metal black metal norwegian black metal ...
3,3,j-rock visual kei metal gothic japanese bazaro...
4,4,gothic gothic rock darkwave darkwave deathrock...


In [4]:

# Vectorize using CountVectorizer to build bag-of-words embeddings
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(artist_bow['tagValue'])

embedding_dim = len(vectorizer.get_feature_names_out())
print(f"Embedding dimension: {embedding_dim}")

Embedding dimension: 7592


In [5]:
# Create a DataFrame with bag-of-words embeddings
artist_embeddings = pd.DataFrame.sparse.from_spmatrix(
    bow_matrix, columns=vectorizer.get_feature_names_out(), index=artist_bow['artistID']
)



In [6]:
# 3. Build a sparse matrix of shape (artist_count, embedding_dim)
# Ensure all artists have rows in the sparse matrix
all_artists = artists[['id']].rename(columns={'id': 'artistID'})
artist_embeddings_full = all_artists.merge(artist_embeddings, how='left', on='artistID').fillna(0)

# Convert to sparse matrix
embedding_sparse_matrix = csr_matrix(artist_embeddings_full.drop(columns=['artistID']).values)

# Output results
print("Tag distribution by artist:")
print(artist_tag_distribution)

print("\nBag-of-Words Embeddings for artists:")
print(artist_embeddings)

print("\nSparse matrix shape (artist_count x embedding_dim):", embedding_sparse_matrix.shape)


Tag distribution by artist:
       artistID  0  1  2  3  4  5  6  7  8  ...  11935  11936  11937  11938  \
0             0  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
1             1  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
2             2  0  0  0  3  0  0  0  0  0  ...      0      0      0      0   
3             3  2  0  0  0  0  0  1  0  0  ...      0      0      0      0   
4             4  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
...         ... .. .. .. .. .. .. .. .. ..  ...    ...    ...    ...    ...   
12128     17623  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
12129     17625  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
12130     17626  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
12131     17627  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   
12132     17630  0  0  0  0  0  0  0  0  0  ...      0      0      0      0   

       11939  11940  11

In [7]:
def get_artist_name(artist_id):
    return artists.loc[artists['id'] == artist_id, 'name'].values[0]

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

def get_top_k_similar_artists(artist_id, embeddings, artist_index_map, k=5):
    """
    Get the top K most similar artists to the given artist_id.

    Parameters:
    - artist_id (int): The ID of the artist to find similar artists for.
    - embeddings (csr_matrix): The sparse matrix containing artist embeddings.
    - artist_index_map (dict): A mapping of artist IDs to row indices in the embeddings matrix.
    - k (int): The number of similar artists to return.

    Returns:
    - List of tuples: [(artist_id, similarity), ...] for the top K similar artists.
    """
    # Ensure the artist ID is in the index map
    if artist_id not in artist_index_map:
        raise ValueError(f"Artist ID {artist_id} not found in the embeddings.")

    # Get the index of the given artist
    artist_idx = artist_index_map[artist_id]
    
    # Compute cosine similarity between the given artist and all others
    artist_vector = embeddings[artist_idx]  # Sparse row for the given artist
    similarities = cosine_similarity(artist_vector, embeddings).flatten()
    
    # Get the top K similar indices (excluding itself)
    similar_indices = similarities.argsort()[::-1]  # Sort in descending order
    top_k_indices = [idx for idx in similar_indices if idx != artist_idx][:k]
    
    # Map indices back to artist IDs and return their similarities
    index_artist_map = {v: k for k, v in artist_index_map.items()}
    top_k_artists = [(index_artist_map[idx], similarities[idx]) for idx in top_k_indices]

    return top_k_artists

# Create a mapping of artist IDs to their row indices in the embeddings matrix
artist_index_map = {artist_id: idx for idx, artist_id in enumerate(artist_embeddings_full['artistID'])}

# Example: Get the top 5 similar artists for a specific artist
example_artist_id = 221
top_k_similar_artists = get_top_k_similar_artists(example_artist_id, embedding_sparse_matrix, artist_index_map, k=5)

# Output results
print(f"Top similar artists to artist {example_artist_id}:")
for artist_id, similarity in top_k_similar_artists:
    print(f"Artist: {get_artist_name(artist_id)}, Similarity: {similarity:.4f}")


Top similar artists to artist 221:
Artist: The Kinks, Similarity: 0.9475
Artist: The Rolling Stones, Similarity: 0.9469
Artist: The Who, Similarity: 0.9379
Artist: The Moody Blues, Similarity: 0.9302
Artist: The Animals, Similarity: 0.9216


In [9]:
def get_tag_name(tag_id):
    return tags.loc[tags['tagID'] == tag_id, 'tagValue'].values[0]

In [10]:
def suggest_tags_for_artist(artist_id, embeddings, artist_index_map, artist_tag_distribution, tag_names, k=5, top_tags=5):
    """
    Suggest tags for a given artist based on similar artists' tags.

    Parameters:
    - artist_id (int): The ID of the artist to suggest tags for.
    - embeddings (csr_matrix): The sparse matrix containing artist embeddings.
    - artist_index_map (dict): A mapping of artist IDs to row indices in the embeddings matrix.
    - artist_tag_distribution (pd.DataFrame): Tag distribution (artist x tags).
    - tag_names (pd.DataFrame): Mapping of tag IDs to tag names.
    - k (int): Number of similar artists to consider for tag suggestions.
    - top_tags (int): Number of tags to suggest.

    Returns:
    - List of tuples: [(tag_id, tag_name, relevance_score), ...] for the suggested tags.
    """
    # Ensure the artist ID is in the index map
    if artist_id not in artist_index_map:
        raise ValueError(f"Artist ID {artist_id} not found in the embeddings.")
    
    # Get the top K similar artists
    similar_artists = get_top_k_similar_artists(artist_id, embeddings, artist_index_map, k=k)
    
    # Aggregate tag frequencies from similar artists
    tag_scores = pd.Series(dtype=float)
    for similar_artist_id, similarity in similar_artists:
        if similar_artist_id in artist_tag_distribution['artistID'].values:
            similar_tags = artist_tag_distribution.loc[
                artist_tag_distribution['artistID'] == similar_artist_id
            ].drop(columns=['artistID']).iloc[0]
            # Weighted score for tags based on similarity
            tag_scores = tag_scores.add(similar_tags * similarity, fill_value=0)
    
    # Sort tags by their weighted scores
    suggested_tags = tag_scores.sort_values(ascending=False).head(top_tags)
    
    return suggested_tags.items()

suggested_tags = suggest_tags_for_artist(
    example_artist_id, embedding_sparse_matrix, artist_index_map, artist_tag_distribution, tags, k=10, top_tags=10
)

# Output results
print(f"Suggested tags for artist {example_artist_id}:")
for tag_id, score in suggested_tags:
    tag_name = get_tag_name(tag_id)
    print(f"Tag: {tag_name}, Score: {score:.4f}")

Suggested tags for artist 221:
Tag: classic rock, Score: 187.8929
Tag: rock, Score: 120.9716
Tag: 60s, Score: 76.5420
Tag: british, Score: 75.8443
Tag: 70s, Score: 46.9514
Tag: singer-songwriter, Score: 27.9086
Tag: hard rock, Score: 24.3722
Tag: pop, Score: 23.7341
Tag: blues, Score: 16.8002
Tag: rock and roll, Score: 13.9296


In [11]:
from sklearn.decomposition import TruncatedSVD

def reduce_embedding_dimension(embeddings, target_dim=200):
    """
    Reduce the dimension of embeddings to a specified size using Truncated SVD.

    Parameters:
    - embeddings (csr_matrix): Sparse matrix of artist embeddings.
    - target_dim (int): The desired number of dimensions for the reduced embeddings.

    Returns:
    - reduced_embeddings (np.ndarray): Dense array of reduced embeddings.
    """
    # Initialize Truncated SVD
    svd = TruncatedSVD(n_components=target_dim, random_state=42)
    
    # Fit and transform the sparse matrix
    reduced_embeddings = svd.fit_transform(embeddings)
    
    # Explained variance ratio to ensure the quality of reduction
    explained_variance = np.sum(svd.explained_variance_ratio_)
    print(f"Explained variance after reduction: {explained_variance:.4f}")
    
    return reduced_embeddings


# Reduce the embedding space to 200 dimensions
reduced_embeddings = reduce_embedding_dimension(embedding_sparse_matrix, target_dim=200)

# Example: Get reduced embedding for artist ID 2
artist_idx = artist_index_map[example_artist_id]
artist_reduced_embedding = reduced_embeddings[artist_idx]

# Output results
print(f"Reduced embedding for artist ID {example_artist_id}:")
print(artist_reduced_embedding)
print(f"Shape of reduced embedding matrix: {reduced_embeddings.shape}")


Explained variance after reduction: 0.9778
Reduced embedding for artist ID 221:
[ 1.99156984e+02 -8.09605989e+00 -2.07340201e+01 -7.98043906e+01
  2.24498500e+00 -3.77623769e+01 -1.66087292e+01  1.06037363e+01
  3.80467758e+00 -8.15879906e+00  3.06396838e+01 -4.91644535e+01
  2.33870321e+01  1.20293662e+01 -2.07027584e+01 -3.21609714e+01
 -8.52783568e+00 -6.54970041e+00 -3.60944579e+00 -6.10564733e+00
 -9.79286442e+00  5.88707991e+00 -6.97626061e+00 -4.35303494e+00
  8.36896858e+00 -5.73532328e+00 -4.19163054e-01  7.72675663e+00
  1.28197036e+00 -1.53928446e+00 -1.45813747e+00  5.10941555e+00
  2.70427719e+00 -5.33667038e+00 -3.02804464e-01  4.94388017e+00
 -3.98026503e+00  4.29310320e+00 -1.01953643e+01  8.30274162e+00
  6.14296966e+00  3.74565172e+00  5.91632560e-01 -4.52687227e+00
  6.86095562e+00 -9.57254355e+00 -1.98814995e+00 -1.80982428e+00
  1.82833230e-01  5.09742498e-01 -1.27014056e-01 -3.91763293e+00
  7.99440808e-01  1.53655097e+00  2.78649642e+00 -1.23896311e+00
  1.175655

In [12]:
from scipy.sparse import coo_matrix

# Create a sparse matrix for user-artist interactions
user_artist_matrix = coo_matrix(
    (user_artists['weight'], (user_artists['userID'], user_artists['artistID']))
)

# Output the shape of the matrix
print(f"Sparse matrix shape: {user_artist_matrix.shape}")


Sparse matrix shape: (1892, 17632)


In [13]:
from lightfm.data import Dataset

# Initialize the Dataset object
dataset = Dataset(user_identity_features=False, item_identity_features=False)

# Fit the dataset with users and items
# Specify the number of users and items based on the user_artist_matrix
num_users, num_artists = user_artist_matrix.shape
dataset.fit(
    range(num_users),  # User IDs
    range(num_artists)  # Artist IDs
)

# Build interactions and weights matrices
(interactions, weights) = dataset.build_interactions(
    [(row['userID'], row['artistID'], row['weight']) for _, row in user_artists.iterrows()]
)

# Output the shape of the interactions matrix
print(f"Interactions matrix shape: {interactions.shape}")

Interactions matrix shape: (1892, 17632)


In [14]:
from lightfm.cross_validation import random_train_test_split

# Split the interactions into training and testing datasets
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)
trainweighted, testweighted = random_train_test_split(weights, test_percentage=0.2, random_state=42)

# Output the number of interactions in train and test
print(f"Training interactions: {train.getnnz()}")
print(f"Testing interactions: {test.getnnz()}")

Training interactions: 74267
Testing interactions: 18567


In [15]:
from scipy.sparse import csr_matrix

# Ensure the embeddings have the same number of items as in the dataset
item_features = csr_matrix(embedding_sparse_matrix)

# Check the shape of the item_features
print(f"Item features shape: {item_features.shape}")

Item features shape: (17632, 7592)


In [16]:
from lightfm import LightFM

# Initialize the LightFM model
embedding_dim = embedding_sparse_matrix.shape[1]
model = LightFM(no_components=20, loss='warp')

# Train the model for one epoch
model.fit(train, item_features=item_features, epochs=1, num_threads=4)

print("Model training complete.")

Model training complete.


In [30]:
def fitModels(epochs, no_components, weights,item_features, train):
    models = {}

    number_of_models = len(epochs) * len(no_components) * 4
    i = 0
    for epoch in epochs:
        for component in no_components:
            # Both
            print(f"Training model with {component} components and {epoch} epochs ({i}/{number_of_models})")
            model = LightFM(no_components=component, loss='warp')
            model.fit(weights,train, item_features=item_features, epochs=epoch, num_threads=4,verbose=True)
            models[(epoch, component,i%4)] = model
            i += 1

            # Only item features
            print(f"Training model with {component} components and {epoch} epochs ({i}/{number_of_models})")
            model = LightFM(no_components=component, loss='warp')
            model.fit(train, item_features=item_features, epochs=epoch, num_threads=4,verbose=True)
            models[(epoch, component, i%4)] = model
            i += 1

            # Only sample weights
            print(f"Training model with {component} components and {epoch} epochs ({i}/{number_of_models})")
            model = LightFM(no_components=component, loss='warp')
            model.fit(weights,train, epochs=epoch, num_threads=4,verbose=True)
            models[(epoch, component, i%4)] = model
            i += 1

            # No item features or sample weights
            print(f"Training model with {component} components and {epoch} epochs ({i}/{number_of_models})")
            model = LightFM(no_components=component, loss='warp')
            model.fit(train, epochs=epoch, num_threads=4,verbose=True)
            models[(epoch, component,i%4)] = model
            i += 1


    return models



In [31]:
epochs = [1]
no_components = [5,10]

models = fitModels(epochs, no_components, weights, item_features, train)

Training model with 5 components and 1 epochs (0/8)


Epoch: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Training model with 5 components and 1 epochs (1/8)


Epoch: 100%|██████████| 1/1 [00:02<00:00,  2.50s/it]


Training model with 5 components and 1 epochs (2/8)


Epoch: 100%|██████████| 1/1 [00:02<00:00,  2.89s/it]


Training model with 5 components and 1 epochs (3/8)


Epoch: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]


Training model with 10 components and 1 epochs (4/8)


Epoch: 100%|██████████| 1/1 [00:10<00:00, 10.06s/it]


Training model with 10 components and 1 epochs (5/8)


Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.42s/it]


Training model with 10 components and 1 epochs (6/8)


Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.90s/it]


Training model with 10 components and 1 epochs (7/8)


Epoch: 100%|██████████| 1/1 [00:02<00:00,  2.44s/it]


In [32]:
from lightfm.evaluation import auc_score

def evaluateModel(model, test, train, item_features):

    # Compute and print the AUC score
    train_auc = auc_score(model, train, item_features=item_features).mean()
    test_auc = auc_score(model, test, train_interactions=train, item_features=item_features).mean()

    return train_auc, test_auc

In [33]:
print(models)

{(1, 5, 0): <lightfm.lightfm.LightFM object at 0x7fe990c497f0>, (1, 5, 1): <lightfm.lightfm.LightFM object at 0x7fe9938e42f0>, (1, 5, 2): <lightfm.lightfm.LightFM object at 0x7fe9d94c3800>, (1, 5, 3): <lightfm.lightfm.LightFM object at 0x7fe9da8eb2f0>, (1, 10, 0): <lightfm.lightfm.LightFM object at 0x7fe9938e4170>, (1, 10, 1): <lightfm.lightfm.LightFM object at 0x7fe9da9b3b30>, (1, 10, 2): <lightfm.lightfm.LightFM object at 0x7fe9d9d49af0>, (1, 10, 3): <lightfm.lightfm.LightFM object at 0x7fe9da8f3bc0>}


In [35]:
import concurrent.futures

def evaluate_model_parallel(key):
    train_auc, test_auc = evaluateModel(models[key], test, train, item_features)
    return key, train_auc, test_auc

# Evaluate all models in parallel
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(evaluate_model_parallel, key) for key in models]
    for future in concurrent.futures.as_completed(futures):
        key, train_auc, test_auc = future.result()
        print(f"Model {key} - Train AUC: {train_auc:.4f}, Test AUC: {test_auc:.4f}")

ValueError: Incorrect number of features in item_features