In [1]:
import pandas as pd
import numpy as np

In [2]:
plays = pd.read_csv('data/lastfm/user_artists.dat', sep='\t')
artists = pd.read_csv('data/lastfm/artists.dat', sep='\t', usecols=['id','name'])

# Merge artist and user pref data
ap = pd.merge(artists, plays, how="inner", left_on="id", right_on="artistID")
ap = ap.rename(columns={"weight": "playCount"})

# Group artist by name
artist_rank = ap.groupby(['name']) \
    .agg({'userID' : 'count', 'playCount' : 'sum'}) \
    .rename(columns={"userID" : 'totalUsers', "playCount" : "totalPlays"}) \
    .sort_values(['totalPlays'], ascending=False)

artist_rank['avgPlays'] = artist_rank['totalPlays'] / artist_rank['totalUsers']
#print(artist_rank)
print(ap)

          id              name  userID  artistID  playCount
0          1      MALICE MIZER      34         1        212
1          1      MALICE MIZER     274         1        483
2          1      MALICE MIZER     785         1         76
3          2   Diary of Dreams     135         2       1021
4          2   Diary of Dreams     257         2        152
...      ...               ...     ...       ...        ...
92829  18741    Diamanda Galás     454     18741        301
92830  18742            Aya RL     454     18742        294
92831  18743       Coptic Rain     454     18743        287
92832  18744      Oz Alchemist     454     18744        286
92833  18745  Grzegorz Tomczak     585     18745        426

[92834 rows x 5 columns]


In [3]:
# Merge into ap matrix
#print(ap)
ap = ap.join(artist_rank, on="name", how="inner").sort_values(['playCount'], ascending=False)

# Preprocessing
pc = ap.playCount
play_count_scaled = (pc - pc.min()) / (pc.max() - pc.min())
ap = ap.assign(playCountScaled=play_count_scaled)
print(ap)

# Build a user-artist rating matrix 
ratings_df = ap.pivot(index='userID', columns='artistID', values='playCountScaled')
ratings = ratings_df.fillna(0).values

# Show sparsity
sparsity = float(len(ratings.nonzero()[0])) / (ratings.shape[0] * ratings.shape[1]) * 100


          id             name  userID  artistID  playCount  totalUsers  \
2800      72     Depeche Mode    1642        72     352698         282   
35843    792           Thalía    2071       792     324663          26   
27302    511               U2    1094       511     320725         185   
8152     203             Blur    1905       203     257978         114   
26670    498         Paramore    1664       498     227829         399   
...      ...              ...     ...       ...        ...         ...   
38688    913  Destiny's Child    1810       913          1          83   
32955    697              Sia    1290       697          1          56   
71811   4988   Chris Spheeris     510      4988          1           5   
91319  17080      Haylie Duff    1851     17080          1           1   
63982   3201        Kate Bush     344      3201          1          42   

       totalPlays      avgPlays  playCountScaled  
2800      1301308   4614.567376         1.000000  
35843    

In [22]:
print("sparsity: %.2f" % sparsity)

sparsity: 0.28


In [52]:
print(ratings_df)

artistID  1      2         3      4      5         6      7      8      9      \
userID                                                                          
2           NaN    NaN       NaN    NaN    NaN       NaN    NaN    NaN    NaN   
3           NaN    NaN       NaN    NaN    NaN       NaN    NaN    NaN    NaN   
4           NaN    NaN       NaN    NaN    NaN       NaN    NaN    NaN    NaN   
5           NaN    NaN       NaN    NaN    NaN       NaN    NaN    NaN    NaN   
6           NaN    NaN       NaN    NaN    NaN       NaN    NaN    NaN    NaN   
...         ...    ...       ...    ...    ...       ...    ...    ...    ...   
2095        NaN    NaN       NaN    NaN    NaN       NaN    NaN    NaN    NaN   
2096        NaN    NaN       NaN    NaN    NaN       NaN    NaN    NaN    NaN   
2097        NaN    NaN       NaN    NaN    NaN       NaN    NaN    NaN    NaN   
2099        NaN    NaN       NaN    NaN    NaN       NaN    NaN    NaN    NaN   
2100        NaN    NaN  0.00

In [5]:
from scipy.sparse import csr_matrix

# Build a sparse matrix
X = csr_matrix(ratings)

n_users, n_items = ratings_df.shape
print("rating matrix shape", ratings_df.shape)

user_ids = ratings_df.index.values
artist_names = ap.sort_values("artistID")["name"].unique()

rating matrix shape (1892, 17632)


In [6]:
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from lightfm.data import Dataset

# Build data references + train test
Xcoo = X.tocoo()
data = Dataset()
data.fit(np.arange(n_users), np.arange(n_items))
interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data)) 
train, test = random_train_test_split(interactions)

# Ignore that (weight seems to be ignored...)
#train = train_.tocsr()
#test = test_.tocsr()
#train[train==1] = X[train==1]
#test[test==1] = X[test==1]

interactions

<1892x17632 sparse matrix of type '<class 'numpy.int32'>'
	with 92198 stored elements in COOrdinate format>

### WARP loss function

In [7]:
# Train
model = LightFM(learning_rate=0.05, loss='warp')
model.fit(train, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7eff6ab908e0>

In [8]:
# Evaluate
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()
train_recall = recall_at_k(model, train).mean()
test_recall = recall_at_k(model, test, train_interactions=train).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('Recall: train %.2f, test %.2f.' % (train_recall, test_recall))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.37, test 0.13.
Recall: train 0.10, test 0.13.
AUC: train 0.96, test 0.85.


In [9]:
# Predict
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(top_items)

['The Beatles' 'Coldplay' 'Depeche Mode' ... 'Krusha' 'ScreamerClauz'
 'Peter Kurten']


# Loss function selection

In [10]:
def get_scores(loss):
    modelx = LightFM(learning_rate=0.05, loss=loss)
    modelx.fit(train, epochs=10, num_threads=2)
    train_precisionx = precision_at_k(modelx, train, k=10).mean()
    test_precisionx = precision_at_k(modelx, test, k=10, train_interactions=train).mean()
    train_recallx = recall_at_k(modelx, train).mean()
    test_recallx = recall_at_k(modelx, test, train_interactions=train).mean()
    train_aucx = auc_score(modelx, train).mean()
    test_aucx = auc_score(modelx, test, train_interactions=train).mean()
    scoresx = modelx.predict(0, np.arange(n_items))
    top_itemsx = artist_names[np.argsort(-scoresx)]
    print(loss,'Precision: \t\ttrain %.2f, test %.2f.' % (train_precisionx, test_precisionx))
    print(loss,'Recall: \t\ttrain %.2f, test %.2f.' % (train_recallx, test_recallx))
    print(loss,'AUC: \t\t\ttrain %.2f, test %.2f.' % (train_aucx, test_aucx))
    #print(top_itemsx)
    return top_itemsx

In [11]:
warpitems = get_scores('warp')
logitems = get_scores('logistic')
bpritems = get_scores('bpr')
warpkositems = get_scores('warp-kos')

warp Precision: 		train 0.39, test 0.13.
warp Recall: 		train 0.10, test 0.14.
warp AUC: 			train 0.97, test 0.86.
logistic Precision: 		train 0.20, test 0.07.
logistic Recall: 		train 0.05, test 0.07.
logistic AUC: 			train 0.89, test 0.81.
bpr Precision: 		train 0.36, test 0.12.
bpr Recall: 		train 0.09, test 0.12.
bpr AUC: 			train 0.85, test 0.78.
warp-kos Precision: 		train 0.35, test 0.13.
warp-kos Recall: 		train 0.09, test 0.13.
warp-kos AUC: 			train 0.89, test 0.82.


### get_recommendation

In [81]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import linear_kernel

def get_recommendation(userid):
    X = csr_matrix(ratings)
    svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=0)
    X_matrix_svd = svd.fit_transform(X)
    cosine_sim = linear_kernel(X_matrix_svd,X_matrix_svd[userid].reshape(1,-1))

    sim_scores = list(enumerate(cosine_sim))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    user_indices = [i[0] for i in sim_scores]
    artist_top = []
    for i in user_indices:
        idx_i = np.argmax(ratings[i])
        if len(ap['name'][ap['artistID'] == idx_i].index)== 1:            
            if ap['name'][ap['artistID'] == idx_i].unique()[0] not in artist_top:
                artist_top.append( ap['name'][ap['artistID'] == idx_i].unique()[0] )
    return artist_top

In [82]:
get_recommendation(1)

['The Easybeats',
 'Luxuria',
 'Bruce Faulconer',
 'One Day as a Lion',
 'Hellmouth',
 'Andrew Lawson',
 'Hird',
 'Groovie Ghoulies',
 'cokiyu',
 'James Cotton',
 'A Song For You My Dear',
 'Multigen',
 'Dikers']

In [19]:
#def get_ground_truth(user)
    

# Implicit Vs Explicit

Implicit model:

All interactions in the training matrix are treated as positive signals, and products that users did not interact with they implicitly do not like. The goal of the model is to score these implicit positives highly while assigining low scores to implicit negatives

Explicit model:



# Grid Search

In [20]:
def gridsearchLFM(loss, learning_rate, epoch, score):
    res = {}
    for l1 in loss:
        for lr in learning_rate:
            for ep in epoch:
                testID = score+'_'+str(l1)+'_'+str(lr)+'_'+str(ep)
                modelx = LightFM(learning_rate=lr, loss=l1)
                modelx.fit(train, epochs=ep, num_threads=2)
                precision = precision_at_k(modelx, train, k=10).mean()
                recall = recall_at_k(modelx, train).mean()
                auc = auc_score(modelx, train).mean()
                
                res[testID] = (precision, recall, auc)

                if score == 'precision':
                    print(testID,' \t\t\t %.2f' % (precision))
                elif score == 'recall':
                    print(testID,' \t\t\t %.2f' % (recall))
                else:
                    print(testID,' \t\t\t %.2f' % (auc))
    return res

In [21]:
result = gridsearchLFM(loss = ['warp','warp-kos','bpr'], learning_rate = [0.01,0.05,0.1,0.2], epoch = [5,10,20], score = 'auc')

auc_warp_0.01_5  			 0.87
auc_warp_0.01_10  			 0.89
auc_warp_0.01_20  			 0.90
auc_warp_0.05_5  			 0.94
auc_warp_0.05_10  			 0.96
auc_warp_0.05_20  			 0.98
auc_warp_0.1_5  			 0.97
auc_warp_0.1_10  			 0.98
auc_warp_0.1_20  			 0.99
auc_warp_0.2_5  			 0.76
auc_warp_0.2_10  			 0.86
auc_warp_0.2_20  			 0.86
auc_warp-kos_0.01_5  			 0.82
auc_warp-kos_0.01_10  			 0.84
auc_warp-kos_0.01_20  			 0.85
auc_warp-kos_0.05_5  			 0.87
auc_warp-kos_0.05_10  			 0.89
auc_warp-kos_0.05_20  			 0.90
auc_warp-kos_0.1_5  			 0.87
auc_warp-kos_0.1_10  			 0.89
auc_warp-kos_0.1_20  			 0.90
auc_warp-kos_0.2_5  			 0.74
auc_warp-kos_0.2_10  			 0.80
auc_warp-kos_0.2_20  			 0.83
auc_bpr_0.01_5  			 0.59
auc_bpr_0.01_10  			 0.61
auc_bpr_0.01_20  			 0.71
auc_bpr_0.05_5  			 0.73
auc_bpr_0.05_10  			 0.85
auc_bpr_0.05_20  			 0.91
auc_bpr_0.1_5  			 0.85
auc_bpr_0.1_10  			 0.92
auc_bpr_0.1_20  			 0.95
auc_bpr_0.2_5  			 0.91
auc_bpr_0.2_10  			 0.96
auc_bpr_0.2_20  			 0.97


precision_bpr_0.1_20  		 0.44

recall_bpr_0.1_20  			 0.11

auc_warp_0.1_20  			 0.99