# Matrix Factorization
This notebook explores a matrix factorization approach for the recommender system.

In [55]:
import pandas as pd

# Every table is read from a tab-separated file under dataset/.
artists = pd.read_csv(
    'dataset/artists.dat',
    sep='\t',
)
# tags.dat is the only file decoded explicitly as ISO-8859-1.
tags = pd.read_csv(
    'dataset/tags.dat',
    sep='\t',
    encoding='ISO-8859-1',
)
user_artists = pd.read_csv(
    'dataset/user_artists.dat',
    sep='\t',
)
user_tags = pd.read_csv(
    'dataset/user_taggedartists.dat',
    sep='\t',
)

# Row count of the user-artist interaction table as loaded.
print(f"Original length: {len(user_artists)}")


Original length: 92834


In this notebook, we will build a baseline model that recommends the most popular items to all users. We will then build a matrix factorization model using the Alternating Least Squares (ALS) algorithm.

As a convention, we will deal with top 10 recommendations for each user.

In [56]:
class Baseline():
    """
    A model that always recommends the most popular artists.

    Popularity of an artist is the sum of the `weight` column over all rows
    of the interaction table passed to `fit`. Every user receives the same list.
    """
    def __init__(self, top):
        # Number of artists kept by `fit` and returned by `recommend`.
        self.top = top

    def fit(self, user_artists):
        """
        Ranks artists by total weight and stores the `top` highest ones.

        user_artists: DataFrame with at least `artistID` and `weight` columns.
        Sets `self.popular_artists` to an Index of artist IDs, most popular first.
        """
        # Aggregate only `weight`: summing every column would also add up the
        # userID values, which is wasted work and has no meaning.
        total_weight = user_artists.groupby('artistID')['weight'].sum()
        self.popular_artists = total_weight.sort_values(ascending=False).index[:self.top]

    def recommend(self, user_id):
        """
        Returns the popular artists stored by `fit` (at most `top` of them).

        `user_id` is ignored: the result does not depend on the user.
        `fit` must have been called first, otherwise `popular_artists` does not exist.
        """
        return self.popular_artists[:self.top]

We also need functions to evaluate the performance of the model. We will use Precision@k, Recall@k, F1@k and Coverage as the evaluation metrics.

In [57]:
# Fit the popularity baseline on all interactions and show the list it returns for user 2.
base = Baseline(top=10)
base.fit(user_artists)
base.recommend(user_id=2)

Index([289, 72, 89, 292, 498, 67, 288, 701, 227, 300], dtype='int64', name='artistID')

In [58]:
def is_relevant(recommended, user_id, user_artists):
    """
    Flags which recommended artists are relevant to the user.

    An artist is relevant when `user_artists` contains at least one row pairing
    it with `user_id`. The `weight` column is not consulted, so no minimum
    listen count is applied.

    recommended: iterable of artist IDs.
    user_id: value matched against the `userID` column.
    user_artists: DataFrame with `userID` and `artistID` columns.

    Returns a list of booleans, one per recommended artist, in the same order.
    """

    # Collect the user's artists once. A set gives constant-time membership
    # tests instead of re-scanning the column for every recommendation.
    user_rows = user_artists.userID == user_id
    listened = set(user_artists.loc[user_rows, 'artistID'].tolist())

    return [artist_id in listened for artist_id in recommended]

def precision(recommended, user_id, user_artists):
    """
    Returns the precision of the recommendations.
    Precision is the proportion of recommended artists that are relevant
    (as decided by `is_relevant`).

    recommended: sized iterable of artist IDs.
    user_id: user the recommendations were made for.
    user_artists: DataFrame of user-artist interactions.

    Returns a float in [0, 1]; 0.0 when nothing was recommended.
    """

    n_recommended = len(recommended)
    # An empty recommendation list has no relevant items; return 0.0 instead
    # of dividing by zero.
    if n_recommended == 0:
        return 0.0

    relevant = is_relevant(recommended, user_id, user_artists)
    return sum(relevant) / n_recommended

def recall(recommended, user_id, user_artists):
    """
    Returns the recall of the recommendations.
    Recall is the number of relevant recommended artists divided by the number
    of rows `user_artists` holds for the user.

    recommended: iterable of artist IDs.
    user_id: user the recommendations were made for.
    user_artists: DataFrame with `userID` and `artistID` columns.

    Returns a float; 0.0 when the user has no rows in `user_artists`.
    """

    # Number of interaction rows belonging to this user (the denominator).
    n_user_rows = int((user_artists.userID == user_id).sum())
    # A user without any interactions has nothing to recall; return 0.0
    # instead of dividing by zero.
    if n_user_rows == 0:
        return 0.0

    relevant = is_relevant(recommended, user_id, user_artists)
    return sum(relevant) / n_user_rows

def f1_score(recommended, user_id, user_artists):
    """
    Computes the F1 score of a recommendation list for one user.
    F1 is the harmonic mean of precision (P) and recall (R): 2 * P * R / (P + R).
    When P + R is zero the score is defined as 0.
    """

    p = precision(recommended, user_id, user_artists)
    r = recall(recommended, user_id, user_artists)
    total = p + r

    # The conditional guards the division against a zero denominator.
    return 0 if total == 0 else 2 * p * r / total

def coverage(recommended, user_artists):
    """
    Returns the coverage of the recommendations.
    Coverage is the number of distinct recommended artists divided by the
    number of distinct values in `user_artists.artistID`.

    recommended: iterable of hashable artist IDs; duplicates are counted once.
    user_artists: DataFrame with an `artistID` column.

    Returns a float; 0.0 when `user_artists` contains no artists.
    """

    catalogue_size = len(user_artists.artistID.unique())
    # With no artists in the catalogue there is nothing to cover; return 0.0
    # instead of dividing by zero.
    if catalogue_size == 0:
        return 0.0

    # De-duplicate so an artist recommended several times counts once.
    return len(set(recommended)) / catalogue_size


def evaluate(model, user_artists, user_ids):
    """
    Evaluates a model on a set of users.

    The model is fitted on `user_artists` and then scored against the same
    table, so the reported numbers are in-sample (no held-out data is used).

    model: object exposing `fit(user_artists)` and `recommend(user_id)`;
        it is fitted in place by this call.
    user_artists: DataFrame of user-artist interactions.
    user_ids: iterable of user IDs to generate recommendations for.

    Returns a dictionary with the per-user average 'precision', 'recall' and
    'f1_score', plus the overall 'coverage' of all recommendations made.
    The averages are 0.0 when `user_ids` is empty.
    """

    model.fit(user_artists)

    precisions = []
    recalls = []
    f1_scores = []

    # Union of every artist recommended to any user, used for coverage.
    recommendations = set()

    for user_id in user_ids:
        recommended = model.recommend(user_id)
        precisions.append(precision(recommended, user_id, user_artists))
        recalls.append(recall(recommended, user_id, user_artists))
        f1_scores.append(f1_score(recommended, user_id, user_artists))

        recommendations.update(recommended)

    def _mean(values):
        # Average of a metric list; 0.0 for an empty list to avoid dividing by zero.
        return sum(values) / len(values) if values else 0.0

    return {
        'precision': _mean(precisions),
        'recall': _mean(recalls),
        'f1_score': _mean(f1_scores),
        # Reuse the shared helper rather than repeating its formula inline.
        'coverage': coverage(recommendations, user_artists)
    }
    

In [59]:
# Score the popularity baseline over every user present in the interaction table.
evaluate(
    model=base,
    user_artists=user_artists,
    user_ids=user_artists.userID.unique(),
)

{'precision': 0.2328752642706131,
 'recall': 0.048249036830513736,
 'f1_score': 0.07875021852213392,
 'coverage': 0.0005671506352087115}

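Our baseline model gets a Precision@10 of 0.23, Recall@10 of 0.05 and F1@10 of 0.08 (rounded to two decimals). Its coverage is only about 0.06% of the artists, because every user receives the same 10 recommendations.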

# Matrix factorization
Now, we build a matrix factorization model using LightFM.

In [61]:
# Start by generating a sparse matrix of user-artist interactions.
from scipy.sparse import coo_matrix

# Rows are indexed by userID, columns by artistID, and the stored values are the weights.
# NOTE(review): raw IDs are used directly as indices and no explicit shape is given,
# so an ID value that never occurs would still get an empty row or column. Confirm
# this is intended before interpreting the shape or the sparsity.
user_artist_matrix = coo_matrix(
    (
        user_artists['weight'],
        (user_artists['userID'], user_artists['artistID']),
    )
)
user_artist_matrix.shape

(2101, 18746)

In [63]:
def get_sparsity(matrix):
    """
    Computes the sparsity of a matrix.
    Sparsity is 1 minus the density, where density is `matrix.nnz` divided by
    the product of the first two entries of `matrix.shape`.
    """

    n_rows = matrix.shape[0]
    n_cols = matrix.shape[1]
    density = matrix.nnz / (n_rows * n_cols)
    return 1 - density

# Report the sparsity of the user-artist interaction matrix.
print(f"Sparsity: {get_sparsity(matrix=user_artist_matrix)}")

Sparsity: 0.9976429304442317
