In [1]:
import model_predict
import os
import numpy as np

from analyze_similars import show_similar
from scipy import sparse
from generate_mtrx import split

In [2]:
# Number of dimensions passed to the model-training call further down.
dims = 200

# Path components joined with the file-name templates below.
split_folder = 'lastfm'
model_folder = 'models'

# File-name templates. The `{}` placeholder is never filled in by this notebook
# (presumably the model_predict helpers format it -- TODO confirm).
predictions_filename = 'predicted_{}.npy'
item_features_filename = 'out_item_features_{}.feats'
user_features_filename = 'out_user_features_{}.feats'

# Train/test split

We start by splitting the dataset into train and test sets. The `split` method receives a parameter that sets the fraction of the data used for the split; in this case we use 0.2 (20%).

In [3]:
# Tab-separated input file handed to `split`.
dataset_location = 'data/usersha1-artmbid-artname-plays-part1.tsv'

# 0.2 is the split fraction; the remaining return values are the lookup
# structures consumed by the training, evaluation and similarity cells.
(
    fan_train,
    fan_test_data,
    fan_items_dict,
    fan_users_dict,
    fan_item_ids,
) = split(0.2, dataset_location)

# Rebind the training matrix to its CSR form (assumes `split` returns a
# SciPy sparse matrix -- TODO confirm).
fan_train = fan_train.tocsr()

300000it [00:00, 911381.23it/s]


# Train Matrix Factorization model

The next step is to train the model. For that we need to specify the number of dimensions used in the representation (`dims`, set to 200 above).

In [24]:
# Paths for the user/item feature files: <model_folder>/<split_folder>/<template>.
# The `{}` in each template is left unformatted here.
user_features_file = os.path.join(
    model_folder, split_folder, user_features_filename
)
item_features_file = os.path.join(
    model_folder, split_folder, item_features_filename
)

# Train on the CSR training matrix with `dims` dimensions. save_res=False is
# passed as-is (presumably it skips writing the feature files -- TODO confirm).
item_ids, item_vecs_reg, user_ids, user_vecs_reg = model_predict.train_als(
    fan_train,
    dims,
    fan_users_dict,
    fan_items_dict,
    user_features_file,
    item_features_file,
    save_res=False,
)

# Alternative to retraining: load previously stored features instead.
#user_ids, user_vecs_reg = model_predict.load_feats(user_features_file)
#item_ids, item_vecs_reg = model_predict.load_feats(item_features_file)

# Compute predictions

After obtaining the representations of the users and the items we can make the predictions.

In [9]:
# Path for the predictions array: <model_folder>/<split_folder>/<template>.
predictions_file = os.path.join(
    model_folder, split_folder, predictions_filename
)

# Compute predictions from the learned item/user vectors; the training matrix
# and a step of 500 are forwarded unchanged to the helper.
predicted = model_predict.predict(
    item_vecs_reg,
    user_vecs_reg,
    predictions_file,
    fan_train,
    step=500,
)

# Alternative to recomputing: load a stored predictions array instead.
#predicted = np.load(predictions_file)

# Evaluation

The next step is to evaluate the results. For that we compare the recommendations with the test set. In this case we use MAP@10, Precision@1, 3, 5 and 10, R-precision and nDCG@10.

In [13]:
# Report the ranking metrics for the predictions against the held-out test data.
model_predict.show_eval(
    predicted,
    fan_test_data,
    item_ids,
    fan_train,
    fan_item_ids,
)

{'map@10': 0.0732971508192089, 'precision@1': 0.23042579072039548, 'precision@3': 0.19046412867896412, 'precision@5': 0.1683572667225221, 'precision@10': 0.13602367081314143, 'r-precision': 0.13726744757201773, 'ndcg@10': 0.1606607908605957}


# Analysis of the recommendations

Now we take one user (i=10) and compare what the system recommends with what the user listened to.

In [12]:
# Print the test/train listening history and the recommendations for user i=10.
model_predict.show_recs(
    predicted,
    fan_test_data,
    item_ids,
    fan_train,
    fan_item_ids,
    i=10,
)

---------
Listened (test) ['mew', 'kent', 'the sounds', 'hot chip', 'r.e.m.', 'franz ferdinand', 'oasis', 'timo räisänen', 'placebo', 'lars winnerbäck', 'detektivbyrån']
---------
Listened (train) ['the killers', 'the strokes', 'arcade fire', 'patrick wolf', 'arctic monkeys', 'death cab for cutie', 'coldplay', 'radiohead', 'muse', 'snow patrol', 'the decemberists', 'the beatles', 'madonna', 'hoobastank', 'panic at the disco', 'markus krunegård', 'millencolin', 'tegan and sara', 'säkert!', 'raymond & maria', 'the ark', 'alice in videoland', 'the kooks', 'sahara hotnights', 'david fridlund', 'the knife', 'keane', 'håkan hellström', 'kings of convenience', 'the tough alliance', 'david & the citizens', 'shout out louds', 'slagsmålsklubben', 'mando diao', 'tiger lou', 'the cardigans', 'vapnet', 'bloc party', 'laleh', 'green day', 'deportees', 'firefox ak']
---------
Recommended [('franz ferdinand', True), ('kent', True), ('the shins', False), ('detektivbyrån', True), ('timo räisänen', True)

# Artists similarity

Finally, we compute the similarity between artists using the original play-counts matrix. In this case we show the difference between using Euclidean distance and cosine distance.

In [6]:
# Map each entry of fan_item_ids to its index (a repeated entry keeps its last index).
names = {artist: index for index, artist in enumerate(fan_item_ids)}

# Artist whose neighbours are inspected in the next cell.
artist_name = 'the beatles'

# Index of the first entry equal to artist_name; raises IndexError when absent.
position = [
    index
    for index, artist in enumerate(fan_item_ids)
    if artist == artist_name
][0]

In [7]:
# List the artists most similar to the one at `position`, computed from the
# training play-count matrix.
show_similar(
    fan_train,
    fan_item_ids,
    names,
    position,
)

SIMILARS ['the beatles', 'john lennon', 'paul mccartney', 'george harrison', 'the who', 'the kinks', 'simon & garfunkel', 'the pillbugs', 'creedence clearwater revival', 'ok go']
SIMILARS ['the beatles', 'john lennon', 'paul mccartney', 'george harrison', 'the who', 'led zeppelin', 'ringo starr', 'the rutles', 'bob dylan', 'paul mccartney & wings']


In [None]:
import pickle

# Reload split data stored under data/<split_folder>/ (appears to be an
# alternative to recomputing the split -- TODO confirm).
# SECURITY: pickle.load can execute arbitrary code; only load files that were
# produced by this project.


def _load_pickle(folder, filename):
    """Return the unpickled contents of data/<folder>/<filename>.

    The file is opened in a `with` block so the handle is closed right after
    loading instead of being left for the garbage collector.
    """
    with open(os.path.join('data', folder, filename), 'rb') as handle:
        return pickle.load(handle)


fan_train = sparse.load_npz(os.path.join('data', split_folder, 'fan_train_data.npz')).tocsr()
fan_test_data = _load_pickle(split_folder, 'fan_test_data.pkl')
fan_items_dict = _load_pickle(split_folder, 'fan_items_dict.pkl')
# NOTE(review): fan_item_ids.pkl is bound to `item_ids`, while the evaluation
# and similarity cells read `fan_item_ids` -- confirm which name is intended.
item_ids = _load_pickle(split_folder, 'fan_item_ids.pkl')
fan_users_dict = _load_pickle(split_folder, 'fan_users_dict.pkl')