# CSE 881 MyDJ artist recommender system

### Helper functions

In [15]:
from pathlib import Path
import pandas as pd
import numpy as np


def load_data(num_artists=1000):
    '''
    Loads the listening table from ``./lastfm.csv``, keeping at most
    ``num_artists`` randomly chosen artist columns.

    The first column of the file is treated as the user identifier and is
    always kept; every other column is treated as one artist.

    Args:
        num_artists(int or None): Maximum number of artist columns to keep.
            ``None``, or a value at least as large as the number of artist
            columns in the file, keeps every artist. Default is 1000.

    Returns:
        A DataFrame holding the user column followed by the selected artist
        columns, in their original file order.

    Raises:
        ValueError: If ``num_artists`` is negative.
    '''
    df = pd.read_csv(Path('./lastfm.csv'))

    if num_artists is None:
        return df
    if num_artists < 0:
        raise ValueError('num_artists must be non-negative')

    total_artists = df.shape[1] - 1
    if num_artists >= total_artists:
        # Nothing to drop: the file has no more artists than requested.
        return df

    # Sample artist positions without replacement; np.random is used so that
    # a preceding np.random.seed(...) call makes the selection repeatable.
    picked = np.random.choice(total_artists, size=num_artists, replace=False)
    # Shift by one to skip the user column, and sort to keep file order.
    keep = [0] + sorted(int(pos) + 1 for pos in picked)
    return df.iloc[:, keep]

In [17]:
df = load_data()

### Calculate cosine similarity of users

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse


# Listening matrix without the user-id column: one row per user, one column
# per artist. The keyword form is used because pandas 2.0 no longer accepts
# the axis as a positional argument to drop().
data_items = df.drop(columns='user')

# Euclidean length of every user's row vector.
magnitude = np.sqrt(np.square(data_items).sum(axis=1))

# unitvector = (x / magnitude, y / magnitude, z / magnitude, ...)
# A row with magnitude 0 is divided by 1 instead, so it stays all zeros
# rather than becoming NaN (which cosine_similarity refuses to process).
data_items = data_items.divide(magnitude.replace(0, 1), axis='index')

def calculate_similarity(data_items):
    """Compute cosine similarity between every pair of columns.

    The frame is converted to a sparse matrix and transposed so that each
    original column becomes a row, which is the orientation
    ``cosine_similarity`` compares. The result is returned as a square
    DataFrame labelled on both axes with the input's column names.
    """
    labels = data_items.columns
    columns_as_rows = sparse.csr_matrix(data_items).transpose()
    scores = cosine_similarity(columns_as_rows)
    return pd.DataFrame(data=scores, index=labels, columns=labels)

# Square similarity table over the columns of the row-normalised data_items
# (labelled by column name on both axes).
data_matrix = calculate_similarity(data_items)

### Build df that can be fed into Surprise

In [66]:
# Every column label after the leading user-id column is an artist name.
artist_names = df.columns.values[1:]

item_ids = []
user_ids = []
ratings = []

# Flatten the wide user x artist table into three parallel lists with one
# entry per cell, walking users in row order and artists in column order.
for _, row in df.iterrows():
    vals = list(row)
    user_id = vals.pop(0)  # first value is the user id; the rest are ratings
    item_ids.extend(artist_names)
    ratings.extend(vals)
    user_ids.extend([user_id] * len(vals))
print(len(item_ids), len(user_ids), len(ratings))
items_df = pd.DataFrame({'artistName': item_ids, 'userID': user_ids, 'rating': ratings})

349410 349410 349410


In [96]:
from surprise import SVD
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from surprise import Reader
from surprise import Dataset
from surprise import NormalPredictor
from surprise.model_selection import train_test_split

import random

# Seed both Python's and NumPy's global generators before splitting: the
# Surprise FAQ recommends setting both to get reproducible experiments.
random.seed(1)  # call this before you call split!
np.random.seed(1)

print(items_df.shape)

# Declare the rating range as 0..1 and hand Surprise the columns in the
# (user, item, rating) order that load_from_df expects.
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(items_df[['userID', 'artistName', 'rating']], reader)

# Hold out 25% of the ratings for testing and fit KNN on the rest.
algo = KNNBasic()
trainset, testset = train_test_split(data, test_size=.25)
algo.fit(trainset)

# cross_validate(KNNBasic(), data, cv=2)

(349410, 3)
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f5b29b5fe10>

In [101]:
from collections import defaultdict


def get_top_n(predictions, n=10):
    '''Group predictions by user and keep each user's n best-rated items.

    Args:
        predictions(list of Prediction objects): Five-field records of the
            form (user id, item id, true rating, estimated rating, details),
            as returned by the test method of an algorithm.
        n(int): How many items to keep per user. Default is 10.

    Returns:
    A dict keyed by user (raw) id whose values are lists of tuples
        [(raw item id, rating estimation), ...], ordered from highest to
        lowest estimate and holding at most n entries.
    '''

    # Bucket every (item, estimate) pair under the user it belongs to.
    per_user = defaultdict(list)
    for user, item, _true_rating, estimate, _details in predictions:
        per_user[user].append((item, estimate))

    # Replace each bucket with its n highest estimates, best first.
    for user in per_user:
        ranked = sorted(per_user[user], key=lambda pair: pair[1], reverse=True)
        per_user[user] = ranked[:n]

    return per_user

# Estimate ratings for the held-out (user, item) pairs in the test set,
# then keep the ten highest estimates per user.
predictions = algo.test(testset)
top_n = get_top_n(predictions, n=10)

# Show each user id followed by the ids of its recommended items.
for uid, user_ratings in top_n.items():
    print(uid, [pair[0] for pair in user_ratings])

# algo.predict(33, 'adam green', r_ui=1, verbose=True)

15694 ['clueso', 'ac/dc', 'frank sinatra', 'blind guardian', 'justice', 'jay-z', 'iron maiden', 'tori amos', 'system of a down', 'guns n roses']
2186 ['the prodigy', 'clueso', 'air', 'peter fox', 'tom waits', 'die apokalyptischen reiter', 'marilyn manson', 'tori amos', 'norah jones', 'nofx']
3598 ['coldplay', 'ac/dc', 'the kooks', 'rammstein', 'abba', 'linkin park', 'jack johnson', 'arctic monkeys', 'johnny cash', 'alexisonfire']
10114 ['parkway drive', 'opeth', 'air', 'the prodigy', 'in flames', 'seeed', 'scooter', 'groove coverage', 'the streets', 'cypress hill']
14003 ['madonna', 'justin timberlake', 'snow patrol', 'howard shore', 'queens of the stone age', 'alexisonfire', 'black sabbath', 'groove coverage', 'sufjan stevens', 'depeche mode']
2912 ['marilyn manson', 'schandmaul', 'manowar', 'oomph!', 'the beatles', 'motorhead', 'seeed', 'nofx', 'led zeppelin', 'him']
8757 ['rammstein', 'evanescence', 'michael jackson', 'bob marley & the wailers', 'madonna', 'mika', 'billy talent', 's

9666 ['the beatles', 'red hot chili peppers', 'the prodigy', 'nelly furtado', 'die toten hosen', 'metallica', 'mando diao', 'nirvana', 'beatsteaks', 'abba']
10461 ['abba', 'the prodigy', 'boards of canada', 'the beatles', 'beck', 'air', 'foo fighters', 'good charlotte', 'coldplay', 'ramones']
14147 ['beastie boys', 'bob marley', 'michael jackson', 'depeche mode', 'air', 'queen', 'ac/dc', 'pearl jam', 'system of a down', 'aphex twin']
9444 ['johnny cash', 'die toten hosen', 'amy macdonald', 'cypress hill', 'rammstein', 'thievery corporation', 'aphex twin', 'ac/dc', 'scooter', 'norah jones']
9150 ['the prodigy', 'jay-z', 'joy division', 'blink-182', 'aphex twin', 'equilibrium', 'the chemical brothers', 'eluveitie', 'depeche mode', 'portishead']
5592 ['depeche mode', 'the prodigy', 'abba', 'kanye west', 'thievery corporation', 'three days grace', 'schandmaul', 'elvis presley', 'the doors', 'nightwish']
2657 ['the doors', 'snow patrol', 'bob dylan', 'eric clapton', 'pink floyd', 'jimi hend