## Read dataset and ICM

In [2]:
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as plt
import seaborn
from sklearn import preprocessing

def readTrainingSet():
    file = open('../data/train_final.csv', 'r')
    ds = []
    next(file)

    for line in file:
        split = line.split("\t")
        split[1] = split[1].replace("\n","")
        split[0] = int(split[0])
        split[1] = int(split[1])
        ds.append(tuple(split))

    file.close()
    return ds

def readItemInfo():
    file = open('../data/tracks_final.csv', 'r')
    ds = []
    missing =[]
    next(file)

    for line in file:
        split = line.split("\t")
        split[5] = split[5].replace("\n","")
        split[5] = split[5].replace("[","")
        split[5] = split[5].replace("]","")
        split[5] = split[5].replace(",","")
        tags = split[5].split()

        if len(tags) == 0:
            missing.append(int(split[0]))
        else:
            for tag in tags:
                ds.append((int(split[0]), int(tag)))

    file.close()

    songWithTags, tags = zip(*ds)
    songWithTags = list(songWithTags)
    tags = list(tags)

    # Preprocess songs
    les = preprocessing.LabelEncoder()
    allSongs = songWithTags + missing
    les.fit(allSongs)
    songWithTags = les.transform(songWithTags)

    # Preprocess tags
    le = preprocessing.LabelEncoder()
    le.fit(tags)
    tags = le.transform(tags)
    indices = np.ones(len(tags))

    print('creating ICM')
    ICM = sps.coo_matrix((indices, (songWithTags, tags))).tocsc()

    print(ICM.shape)
    missingItems = np.zeros((1, len(set(tags))))
    missingItems = sps.csc_matrix(missingItems)
    ICM = sps.vstack((ICM, missingItems)).tocsc()
    print(ICM.shape)

    # return ICM.tocsr()
    # return ICM.tocsc(), les
    return ICM, les


def readTargetPlaylists():
    return np.genfromtxt('../data/target_playlists.csv', delimiter='\t', skip_header=1, dtype=int)


def splitTestTrainDS(ds, les):
    playlists, songs = zip(*ds)
    ps = np.array(playlists)
    s = list(songs)
    s = np.array(les.transform(s))
    rating = np.ones(s.size)

    train_split = 0.8
    num_interactions = len(ds)

    mask = np.random.choice([True, False], num_interactions, p=[train_split, 1-train_split])
    trainds = sps.coo_matrix((rating[mask], (ps[mask], s[mask]))).tocsr()
    mask = np.logical_not(mask)
    testds = sps.coo_matrix((rating[mask], (ps[mask], s[mask]))).tocsr()

    return trainds, testds


def readData():
    ICM, les = readItemInfo()
    ds = readTrainingSet()

    trainingSet, testSet = splitTestTrainDS(ds, les)

    playlists, _ = zip(*ds)
    targets = list(set(playlists))

    return trainingSet, testSet, targets, ICM, les


print('Read dataset')

trainds, testds, targets, ICM, les = readData()
ICM

Read dataset
creating ICM
(99999, 31900)
(100000, 31900)


<100000x31900 sparse matrix of type '<class 'numpy.float64'>'
	with 483482 stored elements in Compressed Sparse Column format>

## Fit model

In [11]:
import scipy

def check_matrix(X, format='csc', dtype=np.float32):
    if format == 'csc' and not isinstance(X, sps.csc_matrix):
        return X.tocsc().astype(dtype)
    elif format == 'csr' and not isinstance(X, sps.csr_matrix):
        return X.tocsr().astype(dtype)
    elif format == 'coo' and not isinstance(X, sps.coo_matrix):
        return X.tocoo().astype(dtype)
    elif format == 'dok' and not isinstance(X, sps.dok_matrix):
        return X.todok().astype(dtype)
    elif format == 'bsr' and not isinstance(X, sps.bsr_matrix):
        return X.tobsr().astype(dtype)
    elif format == 'dia' and not isinstance(X, sps.dia_matrix):
        return X.todia().astype(dtype)
    elif format == 'lil' and not isinstance(X, sps.lil_matrix):
        return X.tolil().astype(dtype)
    else:
        return X.astype(dtype)

class ISimilarity(object):
    """Abstract interface for the similarity metrics"""

    def __init__(self, shrinkage=10):
        self.shrinkage = shrinkage

    def compute(self, X):
        pass


class Cosine(ISimilarity):
    def compute(self, X):
        # convert to csc matrix for faster column-wise operations
        X = check_matrix(X, 'csc', dtype=np.float32)

        # 1) normalize the columns in X
        # compute the column-wise norm
        # NOTE: this is slightly inefficient. We must copy X to compute the column norms.
        # A faster solution is to  normalize the matrix inplace with a Cython function.
        Xsq = X.copy()
        Xsq.data **= 2
        norm = np.sqrt(Xsq.sum(axis=0))
        norm = np.asarray(norm).ravel()
        norm += 1e-6
        # compute the number of non-zeros in each column
        # NOTE: this works only if X is instance of sparse.csc_matrix
        col_nnz = np.diff(X.indptr)
        # then normalize the values in each column
        X.data /= np.repeat(norm, col_nnz)
        print("Normalized")

        # 2) compute the cosine similarity using the dot-product
        dist = X * X.T
        print("Computed")

        # zero out diagonal values
        dist = dist - sps.dia_matrix((dist.diagonal()[scipy.newaxis, :], [0]), shape=dist.shape)
        print("Removed diagonal")

        # and apply the shrinkage
        if self.shrinkage > 0:
            dist = self.apply_shrinkage(X, dist)
            print("Applied shrinkage")

        print('done i compute')

        return dist

    def apply_shrinkage(self, X, dist):
        # create an "indicator" version of X (i.e. replace values in X with ones)
        X_ind = X.copy()
        X_ind.data = np.ones_like(X_ind.data)
        # compute the co-rated counts
        co_counts = X_ind * X_ind.T
        # remove the diagonal
        co_counts = co_counts - sps.dia_matrix((co_counts.diagonal()[scipy.newaxis, :], [0]), shape=co_counts.shape)
        # compute the shrinkage factor as co_counts_ij / (co_counts_ij + shrinkage)
        # then multiply dist with it
        co_counts_shrink = co_counts.copy()
        co_counts_shrink.data += self.shrinkage
        co_counts.data /= co_counts_shrink.data
        dist.data *= co_counts.data
        return dist


class BasicItemKNNRecommender(object):
    """ ItemKNN recommender with cosine similarity and no shrinkage"""

    def __init__(self, URM, k=50, shrinkage=15, similarity='cosine'):
        self.dataset = URM
        self.k = k
        self.shrinkage = shrinkage
        self.similarity_name = similarity
        if similarity == 'cosine':
            self.distance = Cosine(shrinkage=self.shrinkage)
        # elif similarity == 'pearson':
        #     self.distance = Pearson(shrinkage=self.shrinkage)
        # elif similarity == 'adj-cosine':
        #     self.distance = AdjustedCosine(shrinkage=self.shrinkage)
        else:
            raise NotImplementedError('Distance {} not implemented'.format(similarity))

    def __str__(self):
        return "ItemKNN(similarity={},k={},shrinkage={})".format(
            self.similarity_name, self.k, self.shrinkage)

    def fit(self, X):
        item_weights = self.distance.compute(X)
        print('dist done')
        item_weights = check_matrix(item_weights, 'csr') # nearly 10 times faster
        print("Converted to csr")

        # for each column, keep only the top-k scored items
        # THIS IS THE SLOW PART, FIND A BETTER SOLUTION
        values, rows, cols = [], [], []
        nitems = self.dataset.shape[1]
        for i in range(nitems):
            if (i % 10000 == 0):
                print("Item %d of %d" % (i, nitems))

            this_item_weights = item_weights[i,:].toarray()[0]
            top_k_idx = np.argsort(this_item_weights) [-self.k:]

            values.extend(this_item_weights[top_k_idx])
            rows.extend(np.arange(nitems)[top_k_idx])
            cols.extend(np.ones(self.k) * i)
        self.W_sparse = sps.csc_matrix((values, (rows, cols)), shape=(nitems, nitems), dtype=np.float32)

    def recommend(self, user_id, at=5, exclude_seen=True):
        # compute the scores using the dot product
        user_profile = self.dataset[user_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        # rank items
        ranking = scores.argsort()[::-1]
        if exclude_seen:
            ranking = self._filter_seen(user_id, ranking)

        return ranking[:at]

    def _filter_seen(self, user_id, ranking):
        user_profile = self.dataset[user_id]
        seen = user_profile.indices
        unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True)
        return ranking[unseen_mask]
    
    
recommender = BasicItemKNNRecommender(trainds)
print('Fit model')
recommender.fit(ICM)

Fit model
Normalized
Computed
Removed diagonal
done i compute
dist done
Converted to csr
Item 0 of 100000
Item 1000 of 100000
Item 2000 of 100000
Item 3000 of 100000
Item 4000 of 100000
Item 5000 of 100000
Item 6000 of 100000
Item 7000 of 100000
Item 8000 of 100000
Item 9000 of 100000
Item 10000 of 100000
Item 11000 of 100000
Item 12000 of 100000
Item 13000 of 100000
Item 14000 of 100000
Item 15000 of 100000
Item 16000 of 100000
Item 17000 of 100000
Item 18000 of 100000
Item 19000 of 100000
Item 20000 of 100000
Item 21000 of 100000
Item 22000 of 100000
Item 23000 of 100000
Item 24000 of 100000
Item 25000 of 100000
Item 26000 of 100000
Item 27000 of 100000
Item 28000 of 100000
Item 29000 of 100000
Item 30000 of 100000
Item 31000 of 100000
Item 32000 of 100000
Item 33000 of 100000
Item 34000 of 100000
Item 35000 of 100000
Item 36000 of 100000
Item 37000 of 100000
Item 38000 of 100000
Item 39000 of 100000
Item 40000 of 100000
Item 41000 of 100000
Item 42000 of 100000
Item 43000 of 100000


## eval

In [1]:
import numpy as np
import scipy as sp
import scipy.sparse as sps
import matplotlib.pyplot as plt
import seaborn
import sys
import time

def precision(recommended_items, relevant_items):
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)

    return precision_score


def recall(recommended_items, relevant_items):
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]

    return recall_score


def MAP(recommended_items, relevant_items):
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    map_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score


def evaluate(ds, targets, recommender):
    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_MAP = 0.0

    num_eval = 0

    numTargets = len(targets)

    for t in targets:
        if num_eval % 100 == 0:
            print('targets done: ', num_eval, numTargets)

        rel_songs = ds[t].indices

        if len(rel_songs) > 0:
            rec_songs = recommender.recommend(t)
            num_eval += 1
            cumulative_precision += precision(rec_songs, rel_songs)
            cumulative_recall += recall(rec_songs, rel_songs)
            cumulative_MAP += MAP(rec_songs, rel_songs)

    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    cumulative_MAP /= num_eval

    print("Recommender performance is: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
    cumulative_precision, cumulative_recall, cumulative_MAP))

        
print('Evaluate recommender')
evaluate(testds, targets, recommender)


Evaluate recommender


NameError: name 'testds' is not defined

## Predict

In [None]:
def predict(targets, recommender, les):
    recommendations = []
    i = 0
    numTargets = len(targets)
    for t in targets:
        if i % 1000 == 0:
            print('targets done: ', i, numTargets)
        rec = recommender.recommend(t)
        res = les.inverse_transform(rec)
        recommendations.append(' '.join(str(p) for p in res))
        i += 1

    result = list(zip(targets, recommendations))
    t = int(time.time())
    np.savetxt('../result/'+ str(t) +'.csv', result, fmt="%s", delimiter=',', header='playlist_id,track_ids')
    return recommendations

def readTargetPlaylists():
    return np.genfromtxt('../data/target_playlists.csv', delimiter='\t', skip_header=1, dtype=int)

def recommend(recommender, les):
    t = readTargetPlaylists()
    predict(t, recommender, les)

print('Recommend')
recommend(recommender, les)
print('Done')