In [42]:
from __future__ import print_function

import logging
import argparse
import time

import numpy
import pandas
from scipy.sparse import coo_matrix
import annoy

from implicit import alternating_least_squares

In [43]:
def read_data(filename):
    """ Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
    and a sparse matrix of artist/user/playcount """
    # read in triples of user/artist/playcount from the input dataset
    data = pandas.read_table(filename,
                             usecols=[0, 2, 3],
                             names=['user', 'artist', 'plays'])

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")
    #print (data)

    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(float),
                       (data['artist'].cat.codes.copy(),
                        data['user'].cat.codes.copy())))
    #print (plays)

    return data, plays

In [44]:
def bm25_weight(X, K1=100, B=0.8):
    """ Weighs each row of the sparse matrix of the data by BM25 weighting """
    # calculate idf per term (user)
    X = coo_matrix(X)
    N = X.shape[0]
    #print (N)
    idf = numpy.log(float(N) / (1 + numpy.bincount(X.col)))
    #print (idf)
    # calculate length_norm per document (artist)
    row_sums = numpy.ravel(X.sum(axis=1))
    average_length = row_sums.mean()
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25
    X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
    return X


In [45]:
class TopRelated(object):
    def __init__(self, artist_factors):
        # fully normalize artist_factors, so can compare with only the dot product
        norms = numpy.linalg.norm(artist_factors, axis=-1)
        self.factors = artist_factors / norms[:, numpy.newaxis]

    def get_related(self, artistid, N=10):
        scores = self.factors.dot(self.factors[artistid])
        best = numpy.argpartition(scores, -N)[-N:]
        return sorted(zip(best, scores[best]), key=lambda x: -x[1])

In [46]:
class ApproximateTopRelated(object):
    def __init__(self, artist_factors, treecount=20):
        index = annoy.AnnoyIndex(artist_factors.shape[1], 'angular')
        for i, row in enumerate(artist_factors):
            index.add_item(i, row)
        index.build(treecount)
        self.index = index

    def get_related(self, artistid, N=10):
        neighbours = self.index.get_nns_by_item(artistid, N)
        return sorted(((other, 1 - self.index.get_distance(artistid, other))
                      for other in neighbours), key=lambda x: -x[1])

In [55]:
def calculate_similar_artists(input_filename, output_filename,
                              factors=50, regularization=0.01,
                              iterations=15,
                              exact=False, trees=20,
                              use_native=True,
                              dtype=numpy.float64,
                              cg=False):
    print("Calculating similar artists. This might take a while")
    print("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    print("read data file in %s", time.time() - start)
    #print (plays)
    print("weighting matrix by bm25")
    weighted = bm25_weight(plays)
    print (weighted)

    print("calculating factors")
    start = time.time()
    artist_factors, user_factors = alternating_least_squares(weighted,
                                                             factors=factors,
                                                             regularization=regularization,
                                                             iterations=iterations,
                                                             use_native=use_native,
                                                             dtype=dtype,
                                                             use_cg=cg)
    print (artist_factors)
    print (type(artist_factors))
    print (artist_factors.shape)
    print (user_factors)
    print (type(user_factors))
    print (user_factors.shape)
    
    print("calculated factors in %s", time.time() - start)

    # write out artists by popularity
    print("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    if exact:
        calc = TopRelated(artist_factors)
    else:
        calc = ApproximateTopRelated(artist_factors, trees)

    print("writing top related to %s", output_filename)
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in calc.get_related(artistid):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))

In [56]:
if __name__ == "__main__":
    inputfile = 'usersha1-artmbid-artname-plays_6000.tsv'
    outputfile = 'out.tsv'

    logging.basicConfig(level=logging.DEBUG)
    calculate_similar_artists(inputfile, outputfile,
                              factors=5,
                              regularization=0.8)

Calculating similar artists. This might take a while
reading data from %s usersha1-artmbid-artname-plays_6000.tsv
read data file in %s 0.0164721012115
weighting matrix by bm25
  (374, 0)	345.03329032
  (885, 0)	333.56395928
  (2072, 0)	341.504020869
  (1041, 0)	339.999361805
  (1667, 0)	336.337380574
  (2556, 0)	176.65213423
  (1977, 0)	337.659591326
  (3059, 0)	248.823345868
  (3223, 0)	310.667340838
  (1950, 0)	334.280651927
  (3503, 0)	300.930499187
  (1858, 0)	332.14236971
  (1284, 0)	292.784551084
  (1433, 0)	332.674603615
  (3079, 0)	331.417654673
  (1516, 0)	330.782934381
  (304, 0)	330.472917944
  (963, 0)	150.33775415
  (137, 0)	329.231525594
  (1834, 0)	236.725791526
  (2700, 0)	258.279331582
  (998, 0)	273.084480529
  (2051, 0)	324.986885094
  (132, 0)	324.802098495
  (1515, 0)	117.886845006
  :	:
  (3099, 123)	40.8354570127
  (34, 123)	140.471651045
  (429, 123)	3.79542020807
  (435, 123)	45.1581971898
  (2828, 123)	52.164724359
  (3182, 123)	20.9045656852
  (3282, 123)	140