In [17]:
import pandas as pd

# Read the four remapped CSV tables in a fixed order and bind each to its name.
# Later cells use artists['id'/'name'], tags['tagID'/'tagValue'],
# user_artists['userID'/'artistID'/'weight'] and user_tags['artistID'/'tagID'].
artists, tags, user_artists, user_tags = map(
    pd.read_csv,
    (
        'dataset/remapped/artists.csv',
        'dataset/remapped/tags.csv',
        'dataset/remapped/user_artists.csv',
        'dataset/remapped/user_tags.csv',
    ),
)

In [18]:
import numpy as np
from scipy.sparse import csr_matrix

# Requires the user_tags DataFrame from the loading cell to be in scope.

# 1. Tag distribution per artist: count the (artistID, tagID) rows of
#    user_tags, then pivot to a wide table with one row per artist and one
#    column per tagID (pairs that never occur are filled with 0).
artist_tag_distribution = (
    user_tags.groupby(['artistID', 'tagID'])
    .size()
    .unstack(fill_value=0)
    .rename_axis(columns=None)  # drop the 'tagID' label from the column axis
    .reset_index()              # artistID becomes a regular column
)

artist_tag_distribution.shape



(12133, 9719)

In [19]:

# 2. Build bag-of-words embeddings
# Attach the textual tagValue to every tag assignment; the left join keeps
# every row of user_tags even when its tagID is absent from `tags`.
user_tags_with_values = user_tags.merge(tags, how='left', on='tagID')


def _join_tag_values(values):
    """Return the given tag values, stringified, as one space-separated string."""
    return ' '.join(str(value) for value in values)


# One "document" per artist: all tag values assigned to that artist, joined.
artist_bow = (
    user_tags_with_values
    .groupby('artistID')['tagValue']
    .apply(_join_tag_values)
    .reset_index()
)


artist_bow.head()

Unnamed: 0,artistID,tagValue
0,0,weeabo jrock j-rock visual kei better than lad...
1,1,german seen live darkwave industrial german ge...
2,2,black metal black metal norwegian black metal ...
3,3,j-rock visual kei metal gothic japanese bazaro...
4,4,gothic gothic rock darkwave darkwave deathrock...


In [20]:

# Vectorize using CountVectorizer to build bag-of-words embeddings
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vocabulary on the per-artist tag documents and count token
# occurrences; the result has one row per artist_bow row and one column per token.
vectorizer = CountVectorizer()
tag_documents = artist_bow['tagValue']
bow_matrix = vectorizer.fit_transform(tag_documents)

# The embedding dimension is the vocabulary size, i.e. the matrix's column count.
embedding_dim = bow_matrix.shape[1]
print(f"Embedding dimension: {embedding_dim}")

Embedding dimension: 7592


In [21]:
# Wrap the sparse count matrix in a sparse-backed DataFrame:
# rows are labelled by artistID, columns by vocabulary token.
vocabulary = vectorizer.get_feature_names_out()
artist_embeddings = pd.DataFrame.sparse.from_spmatrix(
    bow_matrix,
    index=artist_bow['artistID'],
    columns=vocabulary,
)

artist_embeddings.shape


(12133, 7592)

In [22]:
# Normalize rows in the embedding matrix so that every artist's token weights
# sum to 1. (Dividing by `sum(axis=0)` along axis=1 would instead rescale each
# token *column* by its corpus-wide total, which is not a row normalisation.)
#
# The scaling is done on the scipy matrix rather than with DataFrame.div:
# dividing a sparse frame by a per-row Series gives a 0/0 fill value and
# effectively densifies every column.
token_counts = bow_matrix.tocsr().astype(np.float64)  # astype copies; bow_matrix is untouched
row_totals = np.asarray(token_counts.sum(axis=1)).ravel()
row_totals[row_totals == 0] = 1.0  # rows without any token stay all-zero instead of NaN
# Each stored value is divided by the total of the row it belongs to.
token_counts.data /= np.repeat(row_totals, np.diff(token_counts.indptr))

artist_embeddings = pd.DataFrame.sparse.from_spmatrix(
    token_counts,
    index=artist_bow['artistID'],
    columns=vectorizer.get_feature_names_out(),
)

In [23]:
# 3. Build a sparse matrix of shape (artist_count, embedding_dim)
# with one row per row of `artists` (in file order). Artists that have no row
# in `artist_embeddings` get an all-zero row.
#
# Rows are placed with a sparse selection matrix instead of merging the wide
# embedding frame and calling `.values`: that route materialises a dense
# (artist_count x embedding_dim) array, and the last recorded run of this cell
# ended in a KeyboardInterrupt. The dense intermediate frame is therefore no
# longer created.
# NOTE(review): downstream the rows are used as LightFM item features, which
# presumably requires row i to describe artist id i - confirm that artists.csv
# is ordered by id.
all_artists = artists[['id']].rename(columns={'id': 'artistID'})

# Sparse view of the embeddings. Fall back to a dense conversion if the frame
# does not hold zero-filled sparse columns (the accessor raises in that case).
try:
    embedded = artist_embeddings.sparse.to_coo().tocsr().astype(np.float64)
except (AttributeError, ValueError):
    embedded = csr_matrix(artist_embeddings.to_numpy(dtype=np.float64))
embedded.data[np.isnan(embedded.data)] = 0.0  # missing values count as 0

# (target_row, source_row) pairs: for each row of `all_artists`, the position
# of the matching artistID in `artist_embeddings`, when there is one.
target_rows = all_artists.assign(target_row=np.arange(len(all_artists)))
source_rows = pd.DataFrame({
    'artistID': artist_embeddings.index,
    'source_row': np.arange(len(artist_embeddings)),
})
row_pairs = target_rows.merge(source_rows, how='inner', on='artistID')

# selector[i, j] == 1 copies embedding row j into output row i; output rows
# without a pair remain empty (all zeros).
selector = csr_matrix(
    (
        np.ones(len(row_pairs)),
        (row_pairs['target_row'].to_numpy(), row_pairs['source_row'].to_numpy()),
    ),
    shape=(len(all_artists), embedded.shape[0]),
)
embedding_sparse_matrix = (selector @ embedded).tocsr()

# Output results (previews only; printing the full wide frames is slow and unreadable)
print("Tag distribution by artist:")
print(artist_tag_distribution.head())

print("\nBag-of-Words Embeddings for artists:")
print(artist_embeddings.head())

print("\nSparse matrix shape (artist_count x embedding_dim):", embedding_sparse_matrix.shape)


KeyboardInterrupt: 

In [8]:
def get_artist_name(artist_id):
    """Return the name of the first row in `artists` whose 'id' equals artist_id.

    Raises IndexError when no row matches.
    """
    is_match = artists['id'] == artist_id
    matching_names = artists.loc[is_match, 'name']
    return matching_names.to_numpy()[0]

In [9]:
from scipy.sparse import coo_matrix

# Create a sparse matrix for user-artist interactions:
# entry (userID, artistID) holds the corresponding 'weight' value.
interaction_weights = user_artists['weight']
row_col_index = (user_artists['userID'], user_artists['artistID'])
user_artist_matrix = coo_matrix((interaction_weights, row_col_index))

# Output the shape of the matrix
print(f"Sparse matrix shape: {user_artist_matrix.shape}")

# CSR copy of the artist embeddings, passed to LightFM as item features below.
item_features = csr_matrix(embedding_sparse_matrix)



Sparse matrix shape: (1892, 17632)


In [10]:
from lightfm.data import Dataset

# Initialize the Dataset object without the automatic per-user / per-item
# identity features.
dataset = Dataset(user_identity_features=False, item_identity_features=False)

# Fit the dataset with users and items.
# The id ranges come from the user-artist matrix, so every integer id in
# [0, num_users) and [0, num_artists) is registered in increasing order.
num_users, num_artists = user_artist_matrix.shape
dataset.fit(
    range(num_users),  # User IDs
    range(num_artists)  # Artist IDs
)

# Build interactions and weights matrices.
# itertuples(index=False, name=None) yields plain (userID, artistID, weight)
# tuples straight from the columns; iterrows() would build one Series per
# interaction and upcast the integer ids whenever 'weight' is a float.
(interactions, weights) = dataset.build_interactions(
    user_artists[['userID', 'artistID', 'weight']].itertuples(index=False, name=None)
)

# Output the shape of the interactions matrix
print(f"Interactions matrix shape: {interactions.shape}")

Interactions matrix shape: (1892, 17632)


In [11]:
seed = 42

from lightfm.cross_validation import random_train_test_split

# Split the interactions into training and testing datasets.
# Both matrices are split with identical settings (20% test, same seed).
split_options = dict(test_percentage=0.2, random_state=seed)
train, test = random_train_test_split(interactions, **split_options)
trainweighted, testweighted = random_train_test_split(weights, **split_options)

# Output the number of interactions in train and test
print(f"Training interactions: {train.getnnz()}")
print(f"Testing interactions: {test.getnnz()}")

Training interactions: 74267
Testing interactions: 18567


In [14]:
from lightfm import LightFM
from lightfm.evaluation import auc_score

# Train a model to check sanity: default WARP model, 10 epochs, item features on.
model = LightFM(loss='warp')
model.fit(
    train,
    item_features=item_features,
    epochs=10,
    num_threads=4,
    verbose=True,
)

# Mean per-user AUC on the held-out interactions, with training interactions
# passed in so they are excluded from the ranking.
per_user_auc = auc_score(
    model,
    test,
    train_interactions=train,
    item_features=item_features,
)
per_user_auc.mean()


Epoch: 100%|██████████| 10/10 [00:15<00:00,  1.56s/it]


0.84691435

In [15]:
%pip install optuna
import optuna

def objective(trial):
    """Optuna objective: mean validation AUC of a LightFM model for one trial.

    Splits the global `train` interactions 80/20 with a fixed seed, samples the
    model hyperparameters and the epoch count from `trial`, fits the model on
    the larger part using the global `item_features`, and returns the mean
    AUC on the smaller part.
    """
    seed = 123
    tr, val = random_train_test_split(train, test_percentage=0.2, random_state=seed)

    # Constructor arguments for LightFM, sampled in a fixed order.
    hyperparams = dict(
        no_components=trial.suggest_int("no_components", 5, 64),
        learning_schedule=trial.suggest_categorical("learning_schedule", ["adagrad"]),
        loss=trial.suggest_categorical("loss", ["bpr", "warp", "warp-kos"]),
        learning_rate=trial.suggest_float("learning_rate", 0.001, 1),
        item_alpha=trial.suggest_float("item_alpha", 1e-10, 1e-06, log=True),
        user_alpha=trial.suggest_float("user_alpha", 1e-10, 1e-06, log=True),
    )
    # The epoch count is tuned too, but it is an argument of fit(), not of LightFM().
    n_epochs = trial.suggest_int("epochs", 20, 50)

    candidate = LightFM(random_state=seed, **hyperparams)
    candidate.fit(
        tr,
        item_features=item_features,
        epochs=n_epochs,
        num_threads=4,
        verbose=True,
    )

    print('Evaluating AUC')
    per_user_auc = auc_score(
        candidate,
        val,
        train_interactions=tr,
        item_features=item_features,
        num_threads=4,
    )
    return per_user_auc.mean()

# Maximise the validation AUC returned by `objective`.
# The sampler is seeded with the notebook-wide `seed` so the sequence of
# proposed hyperparameters does not change arbitrarily between runs.
# NOTE(review): objective() trains with num_threads=4, so the scores themselves
# may still vary slightly from run to run - confirm if exact repeatability matters.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=seed),
)

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-12-15 18:59:16,756] A new study created in memory with name: no-name-57bdf4ca-a05c-4b66-88dd-f207e768704b


In [24]:
# Run 20 trials of the hyperparameter search and keep the winning configuration.
study.optimize(objective, n_trials=20)

best_params = study.best_params

# Save the best parameters to a file (the dict's textual representation).
with open('best_params.txt', 'w') as params_file:
    params_file.write(repr(best_params))

# Echo each tuned parameter on its own line.
for k in best_params:
    v = best_params[k]
    print(f"{k}: {v}")

[W 2024-12-15 19:00:11,694] Trial 1 failed with parameters: {'no_components': 7, 'learning_schedule': 'adagrad', 'loss': 'warp-kos', 'learning_rate': 0.155253189916375, 'item_alpha': 8.920029995144322e-09, 'user_alpha': 2.4154893481486756e-09, 'epochs': 42} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/andres/.local/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_48733/4138094250.py", line 21, in objective
    model.fit(tr,epochs=epochs, num_threads=4, item_features=item_features)
  File "/home/andres/.local/lib/python3.12/site-packages/lightfm/lightfm.py", line 550, in fit
    return self.fit_partial(
           ^^^^^^^^^^^^^^^^^
  File "/home/andres/.local/lib/python3.12/site-packages/lightfm/lightfm.py", line 655, in fit_partial
    self._run_epoch(
  File "/home/andres/.local/lib/python3.12/site-packages/

KeyboardInterrupt: 

In [None]:
# Importance of each tuned hyperparameter as estimated by Optuna from the
# finished trials of `study`; shown as the cell output.
param_importances = optuna.importance.get_param_importances(study)
param_importances

In [None]:
# `epochs` is tuned together with the model hyperparameters, but it is an
# argument of fit(), not of LightFM(). Read it out and build the constructor
# arguments as a filtered copy: `best_params` itself is left untouched, so this
# cell can be re-run without hitting a KeyError on the already-removed key.
num_epochs = best_params['epochs']
model_params = {
    name: value for name, value in best_params.items() if name != 'epochs'
}

# Train with the best parameters
model = LightFM(**model_params, random_state=123)

model.fit(train,
          item_features=item_features,
          epochs=num_epochs,
          verbose=True)

In [None]:
# pickle is part of the Python standard library, so it only needs importing;
# there is nothing for `%pip install` to fetch.
import pickle

# Serialise the trained model to model.pkl with the newest pickle protocol.
# Reminder: only unpickle files from a trusted source - loading a pickle can
# execute arbitrary code.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Evaluate the trained model (mean per-user AUC).
train_auc = auc_score(model, train, item_features=item_features).mean()
# For the held-out score, pass the training interactions so that items a user
# already interacted with in training are excluded from the ranking. This is
# the same protocol as the sanity check and the Optuna objective; without it
# known positives are scored as negatives and the test AUC is biased downwards.
test_auc = auc_score(
    model,
    test,
    train_interactions=train,
    item_features=item_features,
).mean()

print(f"Train AUC: {train_auc:.2f}")
print(f"Test AUC: {test_auc:.2f}")