In [1]:
import pandas as pd

# Load the raw tables. Every file is tab-separated.
artists = pd.read_csv('dataset/artists.dat', sep='\t')
# The tags file is decoded as ISO-8859-1 rather than the default encoding.
tags = pd.read_csv('dataset/tags.dat', sep='\t', encoding='ISO-8859-1')
# user_artists holds the user/artist interactions that the model and the
# metrics below work on (columns used later: userID, artistID, weight).
user_artists = pd.read_csv('dataset/user_artists.dat', sep='\t')
user_tags = pd.read_csv('dataset/user_taggedartists.dat', sep='\t')

# Report the number of user-artist interaction rows before any processing.
print(f"Original length: {len(user_artists)}")
# Preview the first rows of the tag assignments as the cell output.
user_tags.head()

Original length: 92834


Unnamed: 0,userID,artistID,tagID,day,month,year
0,2,52,13,1,4,2009
1,2,52,15,1,4,2009
2,2,52,18,1,4,2009
3,2,52,21,1,4,2009
4,2,52,41,1,4,2009


In [3]:
class Baseline():
    """
    A non-personalised model that always recommends the most popular artists.

    Popularity is the total ``weight`` an artist accumulated over all users,
    so every user receives the same ranked list.

    Parameters
    ----------
    top : int
        Number of artists to keep and recommend.
    """
    def __init__(self, top):
        self.top = top

    def fit(self, user_artists):
        """
        Rank artists by summed ``weight`` and store the ``top`` best ones.

        Parameters
        ----------
        user_artists : pandas.DataFrame
            Interaction table with at least ``artistID`` and ``weight`` columns.
        """
        # Aggregate only the column we rank by: summing every column (userID,
        # timestamps, ...) is wasted work and fails on non-summable dtypes.
        total_weight = user_artists.groupby('artistID')['weight'].sum()
        ranked = total_weight.sort_values(ascending=False)
        self.popular_artists = ranked.index[:self.top]

    def recommend(self, user_id):
        """
        Return the most popular artist IDs, best first.

        ``user_id`` is accepted for interface compatibility with personalised
        models but is ignored. ``fit`` must have been called first.
        """
        return self.popular_artists[:self.top]

Next, we build functions to evaluate the performance of the model. We will use Precision@k, Recall@k, F1 score, Coverage and AUC as the evaluation metrics.

In [4]:
# Sanity check: fit the popularity baseline and look at its top-10 list.
# The baseline ignores the user, so any user id yields the same artists.
base = Baseline(top=10)
base.fit(user_artists)
base.recommend(user_id=2)

Index([289, 72, 89, 292, 498, 67, 288, 701, 227, 300], dtype='int64', name='artistID')

In [5]:
from sklearn.metrics import roc_auc_score

def is_relevant(recommended, user_id, user_artists):
    """
    Flag which recommended artists the user has actually listened to.

    An artist is relevant when ``user_artists`` contains at least one row for
    the (user, artist) pair; the listening count is not thresholded.

    Parameters
    ----------
    recommended : iterable
        Recommended artist IDs, in ranking order.
    user_id : int
        User the recommendations were made for.
    user_artists : pandas.DataFrame
        Interaction table with ``userID`` and ``artistID`` columns.

    Returns
    -------
    list of bool
        One flag per recommended artist, in the same order.
    """
    # Build the lookup once so each membership test is O(1) instead of a
    # linear scan over the user's artists.
    own_rows = user_artists[user_artists.userID == user_id]
    listened = set(own_rows.artistID.tolist())

    return [artist_id in listened for artist_id in recommended]

def precision(recommended, user_id, user_artists):
    """
    Returns the precision of the recommendations.

    Precision is the proportion of recommended artists that are relevant to
    the user, i.e. ``hits / len(recommended)``.

    Parameters
    ----------
    recommended : sequence
        Recommended artist IDs.
    user_id : int
        User the recommendations were made for.
    user_artists : pandas.DataFrame
        Interaction table with ``userID`` and ``artistID`` columns.

    Returns
    -------
    float
        Value in [0, 1]; 0.0 when nothing was recommended.
    """
    # An empty recommendation list has no hits; avoid dividing by zero.
    # len() is used (not truthiness) because `recommended` may be a pandas Index.
    if len(recommended) == 0:
        return 0.0

    relevant = is_relevant(recommended, user_id, user_artists)
    return sum(relevant) / len(recommended)

def recall(recommended, user_id, user_artists):
    """
    Returns the recall of the recommendations.

    Recall is the proportion of the user's relevant artists that were
    recommended. The denominator is the number of rows the user has in
    ``user_artists``.

    Parameters
    ----------
    recommended : sequence
        Recommended artist IDs.
    user_id : int
        User the recommendations were made for.
    user_artists : pandas.DataFrame
        Interaction table with ``userID`` and ``artistID`` columns.

    Returns
    -------
    float
        ``hits / n_relevant``; 0.0 when the user has no interactions.
    """
    n_relevant = len(user_artists[user_artists.userID == user_id])

    # A user without any interaction has nothing to retrieve; avoid a
    # ZeroDivisionError for unknown or cold-start users.
    if n_relevant == 0:
        return 0.0

    relevant = is_relevant(recommended, user_id, user_artists)
    return sum(relevant) / n_relevant

def f1_score(recommended, user_id, user_artists):
    """
    Compute the F1 score of a recommendation list for one user.

    F1 is the harmonic mean of precision and recall; it is defined as 0 when
    both of them are 0.
    """
    p = precision(recommended, user_id, user_artists)
    r = recall(recommended, user_id, user_artists)
    total = p + r

    # Guard the degenerate case where the harmonic mean would divide by zero.
    return 2 * p * r / total if total != 0 else 0

def coverage(recommended, user_artists):
    """
    Returns the coverage of the recommendations.

    Coverage is the proportion of the artist catalogue (distinct ``artistID``
    values in ``user_artists``) that appears among the recommendations.

    Parameters
    ----------
    recommended : iterable
        Recommended artist IDs; duplicates are counted once.
    user_artists : pandas.DataFrame
        Interaction table with an ``artistID`` column.

    Returns
    -------
    float
        ``n_distinct_recommended / n_distinct_artists``; 0.0 for an empty
        catalogue.
    """
    n_artists = len(user_artists.artistID.unique())
    if n_artists == 0:
        return 0.0

    # De-duplicate: the same artist recommended several times (e.g. lists of
    # many users concatenated) covers the catalogue only once.
    return len(set(recommended)) / n_artists

def auc(recommended, user_id, user_artists):
    """
    Returns the ROC AUC of the recommendation list for a specific user.

    The AUC is computed over the recommended artists only (not the whole
    catalogue): each recommended artist is labelled 1 if the user has
    interacted with it and 0 otherwise, and scored by its rank position.

    Parameters
    ----------
    recommended : sequence
        Recommended artist IDs, best first.
    user_id : int
        User the recommendations were made for.
    user_artists : pandas.DataFrame
        Interaction table with ``userID`` and ``artistID`` columns.

    Returns
    -------
    float
        AUC in [0, 1], or NaN when it is undefined, i.e. when the list is
        empty or contains only relevant or only non-relevant artists.
    """
    # Get all artists the user has interacted with
    user_rows = user_artists[user_artists.userID == user_id]
    relevant_artists = set(user_rows.artistID.values)

    # Create binary labels: 1 if relevant, 0 if not
    y_true = [1 if artist_id in relevant_artists else 0 for artist_id in recommended]

    # ROC AUC needs both classes. With only hits or only misses the metric is
    # undefined, so return NaN and let the caller drop it from the average
    # instead of passing a single-class input to roc_auc_score.
    if len(set(y_true)) < 2:
        return float('nan')

    y_scores = list(range(len(recommended), 0, -1))  # Earlier position -> higher score

    return roc_auc_score(y_true, y_scores)

def _mean_or_nan(values):
    """Arithmetic mean of ``values``, or NaN when the list is empty."""
    return sum(values) / len(values) if values else float('nan')


def evaluate(model, user_artists, user_ids):
    """
    Evaluates a model on a set of users.

    The model is fitted on ``user_artists`` and then scored against the same
    interactions, so the numbers describe how well it reproduces known
    history rather than held-out data.

    Parameters
    ----------
    model : object
        Anything exposing ``fit(user_artists)`` and ``recommend(user_id)``.
    user_artists : pandas.DataFrame
        Interaction table with ``userID`` and ``artistID`` columns (plus
        whatever ``model.fit`` needs).
    user_ids : iterable
        Users to generate and score recommendations for.

    Returns
    -------
    dict
        Keys ``precision``, ``recall``, ``f1_score`` and ``auc`` hold the mean
        over users (AUC skips users for whom it is NaN); ``coverage`` is the
        share of distinct artists recommended to at least one user. Means are
        NaN when there is nothing to average; coverage is 0.0 for an empty
        catalogue.
    """

    model.fit(user_artists)

    precisions = []
    recalls = []
    f1_scores = []
    aucs = []

    # Union of everything recommended to anyone, used for coverage.
    recommendations = set()

    for user_id in user_ids:
        recommended = model.recommend(user_id)
        precisions.append(precision(recommended, user_id, user_artists))
        recalls.append(recall(recommended, user_id, user_artists))
        f1_scores.append(f1_score(recommended, user_id, user_artists))
        aucs.append(auc(recommended, user_id, user_artists))

        recommendations.update(recommended)

    # Filter out NaN values from AUC (users for whom it is undefined)
    aucs = [a for a in aucs if not pd.isna(a)]

    n_artists = len(user_artists.artistID.unique())

    return {
        # _mean_or_nan avoids a ZeroDivisionError when user_ids is empty.
        'precision': _mean_or_nan(precisions),
        'recall': _mean_or_nan(recalls),
        'f1_score': _mean_or_nan(f1_scores),
        'coverage': len(recommendations) / n_artists if n_artists else 0.0,
        'auc': _mean_or_nan(aucs)
    }
    

In [6]:
# Score the popularity baseline over every user present in the interactions.
evaluate(
    model=base,
    user_artists=user_artists,
    user_ids=user_artists.userID.unique(),
)

{'precision': 0.2328752642706131,
 'recall': 0.048249036830513736,
 'f1_score': 0.07875021852213392,
 'coverage': 0.0005671506352087115,
 'auc': 0.4842685660715725}

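Our baseline model achieves a Precision@10 of 0.23 and an AUC of 0.48. Note that the AUC is computed within each user's top-10 list and averaged over the users for whom it is defined, and that the model is evaluated on the same interactions it was fitted on.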