In [None]:
import numpy as np
import scipy.sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit

In [None]:
import matplotlib.pyplot as plt
%matplotlib nbagg
import seaborn as sns
# Apply seaborn's 'darkgrid' theme to every figure drawn later in the notebook.
sns.set(style='darkgrid')

In [None]:
import lightfm

In [None]:
import pandas as pd

In [None]:
import gzip

In [None]:
import sklearn.metrics

In [None]:
from joblib import Parallel, delayed

In [None]:
import copy
def _score(model, user_id, pid_array, row, no_items):
    """Score one user's ranking of every item against a ground-truth row.

    Parameters
    ----------
    model : object
        Fitted model exposing ``item_embeddings``, ``user_embeddings`` and
        ``item_biases`` arrays.
    user_id : int
        Row of ``model.user_embeddings`` to score.
    pid_array : ignored
        Not used; kept only so existing callers keep working.
    row : sparse matrix
        Ground-truth interactions for this user (anything with ``.toarray()``);
        flattened to one label per item.
    no_items : ignored
        Not used; kept only so existing callers keep working.

    Returns
    -------
    (mean_ap, auc) : tuple
        ``sklearn.metrics.average_precision_score`` and
        ``sklearn.metrics.roc_auc_score`` of the predictions against ``row``.
    """
    # Scores come straight from the learned factors (item embeddings dotted with
    # the user's embedding, plus item biases) instead of going through
    # model.predict, so no per-call user-id array has to be built.
    predictions = model.item_embeddings.dot(model.user_embeddings[user_id]) + model.item_biases

    truth = row.toarray().ravel()

    mean_ap = sklearn.metrics.average_precision_score(truth, predictions)
    auc = sklearn.metrics.roc_auc_score(truth, predictions)

    return mean_ap, auc

def _val(model, user_id, pid_array, train_row, test_row):
    """Score one user's held-out interactions, ignoring items seen in training.

    Parameters
    ----------
    model : object
        Fitted model exposing ``item_embeddings``, ``user_embeddings`` and
        ``item_biases`` arrays.
    user_id : int
        Row of ``model.user_embeddings`` to score.
    pid_array : ignored
        Not used; kept only so existing callers keep working.
    train_row : sparse matrix
        This user's training interactions; every item with a non-zero entry is
        removed from both predictions and labels before scoring.
    test_row : sparse matrix
        This user's held-out interactions, used as the labels.

    Returns
    -------
    (mean_ap, auc) : tuple
        ``sklearn.metrics.average_precision_score`` and
        ``sklearn.metrics.roc_auc_score`` computed over the items that do not
        appear in ``train_row``.
    """
    predictions = model.item_embeddings.dot(model.user_embeddings[user_id]) + model.item_biases

    truth = test_row.toarray().ravel()

    # Mask of items already present in the training row. The builtin ``bool``
    # is used because the ``np.bool`` alias is deprecated/removed in several
    # NumPy releases.
    seen_in_training = train_row.toarray().ravel().astype(bool)
    keep = ~seen_in_training

    predictions = predictions[keep]
    truth = truth[keep]

    mean_ap = sklearn.metrics.average_precision_score(truth, predictions)
    auc = sklearn.metrics.roc_auc_score(truth, predictions)

    return mean_ap, auc


def pscore_model(model, ground_truth, num_threads=4):
    """Score every user of ``ground_truth`` in parallel with ``_score``.

    ``ground_truth`` is converted to CSR and iterated row by row; each row is
    handed to ``_score`` as a separate joblib task, with the row position used
    as the user id. ``num_threads`` is forwarded to joblib as ``n_jobs``.

    Returns a DataFrame with one row per user and the columns
    ``mean_ap`` and ``auc``.
    """
    user_rows = ground_truth.tocsr()
    _, n_items = user_rows.shape
    item_ids = np.arange(n_items, dtype=np.int32)

    # Lazily describe one task per user; joblib consumes the generator.
    tasks = (
        delayed(_score)(model, uid, item_ids, user_row, n_items)
        for uid, user_row in enumerate(user_rows)
    )
    per_user = Parallel(n_jobs=num_threads, verbose=1)(tasks)

    return pd.DataFrame(data=per_user, columns=['mean_ap', 'auc'])

def validate_model(model, train, test, num_threads=4):
    """Evaluate ``model`` on held-out data, one joblib task per user.

    Both matrices are converted to CSR and walked in lockstep (``zip`` stops at
    the shorter of the two); each ``(train_row, test_row)`` pair is scored by
    ``_val`` with the row position used as the user id. ``num_threads`` is
    forwarded to joblib as ``n_jobs``.

    Returns a DataFrame with one row per user and the columns
    ``mean_ap`` and ``auc``.
    """
    seen_rows = train.tocsr()
    held_out_rows = test.tocsr()

    _, n_items = held_out_rows.shape
    item_ids = np.arange(n_items, dtype=np.int32)

    # Lazily describe one task per user; joblib consumes the generator.
    row_pairs = zip(seen_rows, held_out_rows)
    tasks = (
        delayed(_val)(model, uid, item_ids, seen, held_out)
        for uid, (seen, held_out) in enumerate(row_pairs)
    )
    per_user = Parallel(n_jobs=num_threads, verbose=1)(tasks)

    return pd.DataFrame(data=per_user, columns=['mean_ap', 'auc'])

def score_model(model, ground_truth, num_threads=4):
    """Score every user of ``ground_truth`` sequentially via ``model.predict``.

    For each row of the CSR-converted ``ground_truth`` the model is asked to
    predict a score for every item (``num_threads`` is forwarded to
    ``model.predict``), and the scores are compared with the row's entries.

    Returns a DataFrame with one row per user and the columns
    ``mean_ap`` and ``auc``.
    """
    user_rows = ground_truth.tocsr()
    _, n_items = user_rows.shape
    item_ids = np.arange(n_items, dtype=np.int32)

    collected = {'mean_ap': [], 'auc': []}

    for uid, user_row in enumerate(user_rows):
        # model.predict wants one user id per item id, so repeat the id.
        user_ids = np.full(n_items, uid, dtype=np.int32)
        item_scores = model.predict(user_ids, item_ids, num_threads=num_threads)

        labels = np.asarray(user_row.todense()).ravel()

        collected['mean_ap'].append(
            sklearn.metrics.average_precision_score(labels, item_scores))
        collected['auc'].append(
            sklearn.metrics.roc_auc_score(labels, item_scores))

    return pd.DataFrame(data=collected)

In [None]:
# Load in the cf data: gzip-compressed, tab-separated triplets with no header row.
# The handle is opened in a `with` block so it is closed as soon as pandas has
# finished reading (it was previously left open).
with gzip.open('/home/bmcfee/data/1mil/cf/train_triplets.txt.gz', 'r') as triplets_file:
    cfdata_all = pd.read_csv(triplets_file, sep='\t', header=None)

cfdata_all.columns = ['user_id', 'song_id', 'count']

In [None]:
# Song ids to exclude (presumably the known song/track mismatches, going by the
# file name). The file has no header row; only its first column is kept.
bad_songs = pd.read_csv(
    '/home/bmcfee/data/1mil/AdditionalFiles/sid_mismatches.csv',
    header=None,
).iloc[:, 0]

In [None]:
# Load the song<->track mapping
# '<SEP>' is a multi-character delimiter, which only pandas' Python parsing
# engine supports; request that engine explicitly instead of relying on the
# implicit fallback (and the ParserWarning that comes with it).
unique_tracks = pd.read_csv('/home/bmcfee/data/1mil/AdditionalFiles/unique_tracks.txt',
                            index_col=0, header=None, sep='<SEP>', engine='python')

# The first field became the index (track id); name the remaining three fields.
unique_tracks.columns = ['song_id', 'artist', 'title']
unique_tracks.index.name = 'track_id'

In [None]:
# Load the artist mapping
# '<SEP>' is a multi-character delimiter, which only pandas' Python parsing
# engine supports; request that engine explicitly instead of relying on the
# implicit fallback (and the ParserWarning that comes with it).
unique_artists = pd.read_csv('/home/bmcfee/data/1mil/AdditionalFiles/unique_artists.txt',
                             index_col=0, header=None, sep='<SEP>', engine='python')

# The first field became the index (artist id); name the remaining three fields.
unique_artists.columns = ['mbid', 'track_id', 'artist']
unique_artists.index.name = 'artist_id'

In [None]:
# Load the test split: a header-less file whose first column becomes the
# index, labelled 'artist_id'.
test_artists = pd.read_csv(
    '/home/bmcfee/data/1mil/AdditionalFiles/artists_test.txt',
    header=None,
    index_col=0,
).rename_axis('artist_id')

In [None]:
# Index-on-index join (both frames are indexed by artist_id): attach the
# mbid / track_id / artist columns of unique_artists to each test-split artist.
test_tracks = test_artists.join(unique_artists)
# Look up each row's 'track_id' value in unique_tracks' index to pick up
# song_id / artist / title. how='inner' drops rows without a match, and the
# right-hand 'artist' column, which clashes with the left one, is renamed
# with a trailing space ('artist ').
test_songs = test_tracks.join(unique_tracks, on='track_id', rsuffix=' ', how='inner')

In [None]:
# Extend the exclusion list with the song ids found for the test-split artists.
# pd.concat is used because Series.append was removed in pandas 2.0; the
# result is the same concatenation with the original index labels kept.
bad_songs = pd.concat([bad_songs, test_songs['song_id']])

In [None]:
# Next, filter the listening data so that only songs from the MSD training split remain, with the known mismatches removed

In [None]:
# Keep only the rows whose song is NOT on the exclusion list.
cfdata_all = cfdata_all.loc[
    ~cfdata_all['song_id'].isin(bad_songs)
]

In [None]:
# Total play count per song, sorted with pandas' default (ascending) order.
song_counts = (
    cfdata_all
    .groupby('song_id')['count']
    .sum()
    .sort_values()
)

In [None]:
# Keep the last 10000 entries of the sorted play counts.
songs = song_counts.iloc[-10000:]

In [None]:
# Listening records restricted to the selected songs.
good_records = cfdata_all.loc[
    cfdata_all['song_id'].isin(songs.index)
]

In [None]:
# Number of (selected) songs each user has a record for, sorted with pandas'
# default (ascending) order.
users = (
    good_records
    .groupby('user_id')['song_id']
    .count()
    .sort_values()
)

In [None]:
# Users with at least 100 records among the selected songs.
good_users = users.loc[users >= 100]

In [None]:
# Records of the selected songs, restricted to the retained users.
good_data = good_records.loc[
    good_records['user_id'].isin(good_users.index)
]

In [None]:
# Number of distinct users left after filtering (a missing value, if any, counts once).
good_data['user_id'].nunique(dropna=False)

In [None]:
# Number of distinct songs left after filtering (a missing value, if any, counts once).
good_data['song_id'].nunique(dropna=False)

In [None]:
# Replace the index inherited from the earlier filtering steps with a fresh
# 0..n-1 range; drop=True discards the old labels instead of keeping them as a
# column. The split cells later select rows with good_data.loc[train_idx] /
# good_data.loc[test_idx]; presumably those index arrays are positional, in
# which case they only line up with .loc labels after this reset -- verify.
good_data = good_data.reset_index(drop=True)

In [None]:
# Convert to a sparse matrix

In [None]:
# ux: encoder for user ids, fitted on the distinct user ids present in good_data.
ux = LabelEncoder()
ux.fit(good_data['user_id'].unique())

# ix: encoder for song ids, fitted on the distinct song ids present in good_data.
ix = LabelEncoder()
ix.fit(good_data['song_id'].unique())

In [None]:
# Add the encoded user / song ids as 'user_index' and 'item_index' columns.
good_data = good_data.assign(
    user_index=ux.transform(good_data['user_id']),
    item_index=ix.transform(good_data['song_id']),
)

In [None]:
# Randomly partition ratings
# A single split (n_iter=1) stratified on user_index, holding out 20% of the
# records (test_size=0.2) with a fixed seed. The loop body is intentionally
# empty: iterating once simply leaves train_idx / test_idx bound to that
# split's index arrays for the cells below.
# NOTE(review): this is the legacy sklearn.cross_validation call signature
# (labels passed to the constructor, n_iter=...); the
# sklearn.model_selection.StratifiedShuffleSplit class has a different API --
# confirm which scikit-learn version this notebook is expected to run on.
for train_idx, test_idx in StratifiedShuffleSplit(good_data['user_index'], n_iter=1, test_size=0.2, random_state=0):
    pass

In [None]:
# Binary user x item training matrix: one entry of 1 per training record.
# The shape is passed explicitly (largest encoded index + 1 over ALL of
# good_data). Without it, coo_matrix infers the shape from the largest index
# present in this split only, so the train and test matrices could end up with
# different numbers of rows/columns.
train_records = good_data.loc[train_idx]
train = scipy.sparse.coo_matrix(
    (np.ones(len(train_records), dtype=np.int32),
     (train_records['user_index'], train_records['item_index'])),
    shape=(int(good_data['user_index'].max()) + 1,
           int(good_data['item_index'].max()) + 1))

In [None]:
# Binary user x item test matrix: one entry of 1 per held-out record.
# The shape is passed explicitly (largest encoded index + 1 over ALL of
# good_data). Without it, coo_matrix infers the shape from the largest index
# present in this split only, so the train and test matrices could end up with
# different numbers of rows/columns.
test_records = good_data.loc[test_idx]
test = scipy.sparse.coo_matrix(
    (np.ones(len(test_records), dtype=np.int32),
     (test_records['user_index'], test_records['item_index'])),
    shape=(int(good_data['user_index'].max()) + 1,
           int(good_data['item_index'].max()) + 1))

In [None]:
# Fit one WARP-loss LightFM model per embedding size on the training matrix
# and collect the fitted models in `models`; `scores` starts out empty.
scores, models = [], []
for n_components in (8, 16, 32, 64, 128):
    print("Fitting d={:d}".format(n_components))
    model = lightfm.LightFM(
        no_components=n_components,
        loss='warp',
        item_alpha=1e-6,
        user_alpha=1e-6,
    )
    model.fit(train, epochs=20, num_threads=3, verbose=False)
    models += [model]

In [None]:
# Evaluate every fitted model on the held-out data and print its mean metrics.
# `scores` is reset here so that re-running this cell does not append a second
# copy of the results, and the CSR conversions are done once instead of once
# per model.
scores = []
train_csr, test_csr = train.tocsr(), test.tocsr()
for model in models:
    print("Evaluating d={:d}".format(model.item_embeddings.shape[1]))
    score_test = validate_model(model, train_csr, test_csr)
    print(score_test.mean())
    scores.append(score_test)
    print('---')

---

In [None]:
# Use the second-to-last fitted model for the visualisations below; with the
# component sizes fitted above ([8, 16, 32, 64, 128]) that is the d=64 model.
model = models[-2]

In [None]:
# Heat map of the transposed item embedding matrix (one row per latent
# dimension, one column per item), drawn on a fresh figure.
plt.figure()
plt.gca().imshow(
    model.item_embeddings.T,
    aspect='auto',
    interpolation='nearest',
    cmap='viridis',
)
plt.gcf().tight_layout();

In [None]:
# Rug plot of the item biases on a fresh figure.
plt.figure()
sns.rugplot(model.item_biases, ax=plt.gca())
plt.gcf().tight_layout()

In [None]:
from sklearn.manifold import TSNE

In [None]:
# t-SNE projector with a fixed seed and progress logging (verbose=1); every
# other setting is left at the library default.
T = TSNE(random_state=0, verbose=1)

In [None]:
# Project the selected model's item embeddings with t-SNE; the first two
# columns of Y are plotted in the next cell.
Y = T.fit_transform(model.item_embeddings)

In [None]:
# Scatter plot of the first two t-SNE coordinates of every item, semi-transparent
# so dense regions stand out, drawn on a fresh figure.
plt.figure()
plt.gca().scatter(Y[:, 0], Y[:, 1], alpha=0.3)
plt.gcf().tight_layout();