In [16]:
import os
import pandas as pd
import numpy as np
import datetime
import scipy.sparse as sparse
import random
import implicit
from sklearn import metrics

In [2]:
# Directory holding the Last.fm 1K data files. Set the LASTFM_DATA_DIR
# environment variable to run the notebook on another machine; the default
# is the location the notebook was originally written against.
DATA_DIR = os.environ.get(
    'LASTFM_DATA_DIR',
    'D:\\Columbia University\\Personalization\\Project\\lastfm-dataset-1K')
# The file reads further down use bare file names, so they resolve against
# this working directory.
os.chdir(DATA_DIR)

In [3]:
# User profile table; the first line of the file is used as the header row.
users = pd.read_table('userid-profile.tsv', header=0)
# Listening events file has no header row, so columns get the integer labels
# 0..5 (they are renamed in the next cell). header=None is the supported way
# to say "no header"; a negative header value is rejected by current pandas.
# Rows with the wrong number of fields are skipped instead of aborting the load.
# NOTE(review): error_bad_lines is deprecated in newer pandas in favour of
# on_bad_lines; kept here for the pandas version this notebook was run with.
data = pd.read_table('userid-timestamp-artid-artname-traid-traname.tsv', header=None, error_bad_lines=False)

b'Skipping line 2120260: expected 6 fields, saw 8\n'
b'Skipping line 2446318: expected 6 fields, saw 8\n'
b'Skipping line 11141081: expected 6 fields, saw 8\n'
b'Skipping line 11152099: expected 6 fields, saw 12\nSkipping line 11152402: expected 6 fields, saw 8\n'
b'Skipping line 11882087: expected 6 fields, saw 8\n'
b'Skipping line 12902539: expected 6 fields, saw 8\nSkipping line 12935044: expected 6 fields, saw 8\n'
b'Skipping line 17589539: expected 6 fields, saw 8\n'


In [4]:
# Give the six positional columns (0..5) descriptive names, in file order.
data = data.rename(columns=dict(enumerate(
    ['userid', 'timestamp', 'artistid', 'artistname', 'trackid', 'trackname'])))
# Parse the timestamp strings into pandas datetimes.
data = data.assign(timestamp=pd.to_datetime(data['timestamp']))

In [5]:
# Collapse the event log to one row per (user, artist) pair; 'plays' is the
# number of events recorded for that pair.
data = (
    data.groupby(['userid', 'artistname'])
        .size()
        .rename('plays')
        .reset_index()
)

In [6]:
# Reshape to a wide artist x user table: one row per artist, one column per
# user, cell value = play count, 0 where the user never played the artist.
# NOTE(review): this materialises the full dense table before it is converted
# to sparse; building the sparse matrix directly would avoid the large
# intermediate -- confirm nothing else needs wide_artist_data first.
wide_artist_data = data.pivot(index = 'artistname', columns = 'userid', values = 'plays').fillna(0)
# Sparse CSR copy of the same values (rows = artists, columns = users).
wide_artist_data_sparse = sparse.csr_matrix(wide_artist_data.values)

In [7]:
# Show the sparse matrix summary (shape, dtype and number of stored elements).
wide_artist_data_sparse

<173921x992 sparse matrix of type '<class 'numpy.float64'>'
	with 897419 stored elements in Compressed Sparse Row format>

The procedure below splits the original data set into a test set, which is a binarized copy of the original ratings data set (every non-zero entry is set to 1), and a training set in which a fraction of the non-zero entries (20% by default) is masked, i.e. set to zero.

The "ratings" parameter takes the original data set in the form of a sparse csr_matrix. The "pct_test" parameter controls the fraction of non-zero user-item interactions that are masked.

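The procedure outputs a training set, a test set and a list of the unique row indices that had at least one entry masked in the training data. (In this notebook the rows of the matrix are artists and the columns are users, so these are artist row indices.)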

In [10]:
def make_train(ratings, pct_test = 0.2, seed = 0):
    """Split an interaction matrix into a masked training set and a binary test set.

    parameters:

    ratings - The original interaction matrix as a scipy sparse csr_matrix.
    It is not modified.

    pct_test - Fraction (between 0 and 1) of the non-zero entries to hide from
    the training set. The number of hidden entries is rounded up.

    seed - Seed for the sampling of hidden entries. The default of 0 reproduces
    the split of the original implementation; pass None for a different split
    on every call.

    returns:

    training_set - A copy of ratings with the sampled entries removed.

    test_set - A copy of ratings in which every non-zero entry is set to 1.

    altered rows - A list of the unique row indices that had at least one
    entry removed from the training set.

    raises:

    ValueError - If pct_test is outside the range [0, 1].
    """
    if not 0 <= pct_test <= 1:
        raise ValueError('pct_test must be between 0 and 1, got %r' % (pct_test,))

    # Binary preference matrix: drop explicitly stored zeros first so that only
    # genuine interactions are set to 1.
    test_set = ratings.copy()
    test_set.eliminate_zeros()
    test_set.data[:] = 1

    training_set = ratings.copy() # Copy we are free to alter
    row_inds, col_inds = training_set.nonzero() # Coordinates of every existing interaction
    nonzero_pairs = list(zip(row_inds, col_inds))
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round up to a whole number of pairs

    # Use a private generator so the split is reproducible without reseeding
    # the global random module for the rest of the notebook.
    rng = random.Random(seed)
    samples = rng.sample(nonzero_pairs, num_samples) # Sample without replacement
    user_inds = [pair[0] for pair in samples] # Row index of each hidden pair
    item_inds = [pair[1] for pair in samples] # Column index of each hidden pair

    if samples: # Nothing to mask when pct_test is 0 or the matrix is empty
        training_set[user_inds, item_inds] = 0 # Hide the sampled interactions
    training_set.eliminate_zeros() # Drop the stored zeros to save space

    return training_set, test_set, list(set(user_inds)) # Unique rows that were altered

In [11]:
# Hold out 20% of the non-zero entries for evaluation.
# masked_user_ids holds row indices of wide_artist_data_sparse; that matrix was
# pivoted with index='artistname', so these rows correspond to artists.
training_set, test_set, masked_user_ids = make_train(wide_artist_data_sparse, pct_test = 0.2)

In [18]:
# Scaling factor applied to the raw play counts before factorization.
# NOTE(review): presumably the confidence weight of implicit-feedback ALS --
# confirm the value of 15 was chosen deliberately.
alpha = 15
# Factorize the scaled training matrix into two factor matrices with 20 latent
# factors each.
# NOTE(review): the cell output reports that this function is deprecated in
# favour of the AlternatingLeastSquares class; the order of the returned
# factor matrices should be confirmed against the installed implicit version
# before migrating.
# NOTE(review): user_vecs is later indexed by training_set row, and those rows
# are artists here -- the variable names follow the function's return order,
# not the orientation of this matrix.
user_vecs, item_vecs = implicit.alternating_least_squares((training_set*alpha).astype('double'), 
                                                            factors=20, regularization = 0.1, iterations = 50)

This method is deprecated. Please use the AlternatingLeastSquares class instead


The function below calculates the area under the Receiver Operating Characteristic (ROC) curve. It calculates the AUC for a single user and is used as a helper function by the procedure below.

In [20]:
def auc_score(predictions, test):
    """Return the area under the ROC curve for one set of scores.

    parameters:

    predictions - Predicted scores, one per candidate item.

    test - The true binary labels aligned with predictions.

    returns:

    The area under the ROC curve built from the two inputs.
    """
    # roc_curve also returns the thresholds, which are not needed here.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test, predictions)
    area = metrics.auc(false_pos_rate, true_pos_rate)
    return area

In [22]:
def calc_mean_auc(training_set, altered_users, predictions, test_set):
    '''
    Calculate the mean AUC over every row ("user") that had at least one
    interaction hidden from the training set, together with the mean AUC of a
    popularity benchmark over the same rows.

    parameters:

    training_set - The training set resulting from make_train, where a certain percentage of the original
    user/item interactions are reset to zero to hide them from the model

    altered_users - The row indices where at least one user/item pair was altered by the make_train function

    predictions - A two-element sequence: the row factor matrix (one row per row of training_set) as item zero
    and the column factor matrix, already transposed to shape (factors, number of columns), as item one.
    Each may be a dense array or a scipy sparse matrix.

    test_set - The binary test set constructed earlier by the make_train function

    returns:

    A tuple (model AUC, popularity AUC), each the mean over altered_users rounded to three decimal places.
    Only entries that are zero in the training set are scored, so the AUC measures how well the hidden
    interactions are ranked above items with no interaction. Both values are nan when altered_users is empty.
    '''

    def _as_dense(factors):
        # Factor matrices are dense by nature; accept either representation.
        return factors.toarray() if sparse.issparse(factors) else np.asarray(factors)

    altered_users = list(altered_users)
    if not altered_users:
        # No rows to score: report nan explicitly instead of averaging an empty list.
        return float('nan'), float('nan')

    row_factors = _as_dense(predictions[0])
    col_factors = _as_dense(predictions[1])

    # Column totals of the test set, used as the popularity score of each item.
    # NOTE(review): test_set still contains the hidden interactions, so the
    # benchmark sees part of the answer; consider summing training_set instead.
    pop_items = np.asarray(test_set.sum(axis = 0)).reshape(-1)

    store_auc = [] # Model AUC for each altered row
    popularity_auc = [] # Popularity benchmark AUC for each altered row

    for user in altered_users:
        training_row = training_set[user,:].toarray().reshape(-1)
        zero_inds = np.flatnonzero(training_row == 0) # Items with no interaction in training
        # Predicted score for every item, restricted to the candidate items
        pred = row_factors[user,:].dot(col_factors).reshape(-1)[zero_inds]
        # Binary ground truth for the same candidate items
        actual = test_set[user,:].toarray().reshape(-1)[zero_inds]
        pop = pop_items[zero_inds] # Popularity of the same candidate items
        store_auc.append(auc_score(pred, actual))
        popularity_auc.append(auc_score(pop, actual))

    # Mean AUC rounded to three decimal places for both the model and the popularity benchmark
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

The function returns two measures. The first is the mean AUC of the matrix-factorization model. The second measure is a popularity AUC that is based on a model that recommends the most popular items.

In [25]:
calc_mean_auc(training_set, masked_user_ids, 
              [sparse.csr_matrix(user_vecs), sparse.csr_matrix(item_vecs.T)], test_set)

(0.723, 0.784)