In [None]:
import ast
import tarfile

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy
import surprise
from dask.delayed import delayed
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import DistanceMetric
from sklearn.neighbors import NearestNeighbors
from surprise import Dataset, Reader
from surprise import SVD, SVDpp, NMF
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from tqdm.notebook import tqdm

## Reading Data

In [None]:
# Load the per-track feature table and the user/track play records from local CSV files.
famous_tracks = pd.read_csv('data/features.csv')
df_1kfamous   = pd.read_csv('data/df_1kfamous.csv')

## SVD Using Surprise library


Set up the dataframe so that it only contains the famous tracks computed before.

Let's check how many unique users and tracks we have!

In [None]:
# (number of distinct user ids, number of distinct track ids)
len(df_1kfamous['user-id'].unique()), len(df_1kfamous['track-id'].unique())

Compute a score for each (user, track) pair by normalizing the play count with the mean and standard deviation of that user's plays.

In [None]:
# Per-user mean and std of the play counts, used for normalization.
# The 'plays' column is selected *before* aggregating so that the other
# (possibly non-numeric) columns are never aggregated; aggregating them
# raises a TypeError on recent pandas versions.
plays_by_user = df_1kfamous.groupby('user-id')['plays']
user_means = plays_by_user.mean()
# A user whose plays are all equal has std 0; use 1 so the division is defined.
user_std   = plays_by_user.std(ddof=0).replace(0, 1)

# normalize plays: z-score of each play count against its own user's statistics
# (vectorized lookup through `map` instead of a row-wise `apply`)
user_ids = df_1kfamous['user-id']
df_1kfamous['norm_plays'] = (df_1kfamous['plays'] - user_ids.map(user_means)) / user_ids.map(user_std)

# linear (equal-width) binning of the normalized plays into 10 labels
df_1kfamous['cat_plays'] = pd.cut(df_1kfamous['norm_plays'], bins=10, labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# decile (equal-frequency) binning of the normalized plays into 10 labels
df_1kfamous['qcat_plays'] = pd.qcut(df_1kfamous['norm_plays'], 10, labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

In [None]:
# Histogram of the decile labels (cast to int so they can be plotted as numbers).
df_1kfamous['qcat_plays'].astype(int).plot(kind='hist')

In [None]:
# Store the decile label as a plain integer so it can be averaged.
df_1kfamous['qcat_plays'] = df_1kfamous['qcat_plays'].astype(int)
# Mean decile label of every track. The column is selected before aggregating so
# that only 'qcat_plays' is averaged (averaging the non-numeric columns raises on
# recent pandas versions).
mean_per_track = df_1kfamous.groupby('track-id')['qcat_plays'].mean()

In [None]:
# Centre each rating on the mean rating of its track (vectorized lookup of the
# track mean through `map` instead of a Python-level row-wise `apply`).
df_1kfamous['test_plays'] = df_1kfamous['qcat_plays'] - df_1kfamous['track-id'].map(mean_per_track)

In [None]:
# Histogram of the track-centred score; the int cast drops the fractional part before plotting.
df_1kfamous['test_plays'].astype(int).plot(kind='hist')

In [None]:
# linear (equal-width) binning of the track-centred score into 10 labels
df_1kfamous['new_plays'] = pd.cut(df_1kfamous['test_plays'], bins=10, labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

In [None]:
# Histogram of the final 1-10 rating labels.
df_1kfamous['new_plays'].astype(int).plot(kind='hist')

In [None]:
# Replace the categorical bin labels by plain integers.
df_1kfamous['new_plays'] = df_1kfamous['new_plays'].astype(int)

## Removing Users with low number of listens

In [None]:
# Number of (non-null) play records of each user, broadcast back onto every row.
# Only the 'plays' column is counted, using the built-in 'count' aggregation,
# instead of running a Python lambda over every column of the frame.
df_1kfamous['user_count'] = df_1kfamous.groupby('user-id')['plays'].transform('count')

# Keep only the users that have more than 10 play records.
df_reduced = df_1kfamous[df_1kfamous['user_count'] > 10]

## Setup surprise datastructures

In [None]:
# Initialize the Reader class.
# Our rating scale is from 1 to 10, matching the ten `new_plays` bin labels.
reader = Reader(rating_scale=(1, 10))

# Build the Surprise dataset from (user, track, rating) triples of the filtered
# frame, using the binned `new_plays` column as the rating.
ndata = Dataset.load_from_df(df_reduced[['user-id', 'track-id', 'new_plays']], reader)

# Hold out 25% of the ratings as a test set.
# NOTE(review): no random_state is passed, so the split presumably differs between runs — confirm this is intended.
trainset, testset = surprise.model_selection.train_test_split(ndata, test_size=.25)

Computing the SVD and obtaining the RMSE

In [None]:
%%time

# Build a trainset out of every rating in `ndata` (nothing is held out).
full_set = ndata.build_full_trainset()

# Biased SVD with 1000 latent factors, trained for 20 epochs on the full set.
final_algorithm = SVD(n_factors=1000, n_epochs=20, biased=True)
final_algorithm.fit(full_set)

# Hold-out evaluation, currently disabled:
#test_predictions = final_algorithm.test(testset)

# Get the accuracy
#print(f"The RMSE is {accuracy.rmse(test_predictions)}")

In [None]:
# Shorter alias for the fitted model.
algo = final_algorithm

In [None]:
def compute_precision(user_id, predict_df, tracks_df=famous_tracks, plays_df=df_1kfamous, k=10):
    """Artist-level precision@k of a user's predicted tracks.

    Parameters
    ----------
    user_id : value compared against ``plays_df['user-id']``.
    predict_df : DataFrame whose column ``0`` holds track ids and whose
        ``'prediction'`` column holds the estimated rating of each track.
    tracks_df : DataFrame mapping ``'musicbrainz-track-id'`` to
        ``'musicbrainz-artist-id'``.
    plays_df : DataFrame of play records with ``'user-id'`` and ``'artist-id'``.
    k : number of best-scored tracks to consider.

    Returns
    -------
    float
        Share of the artists of the k best-scored tracks that the user has
        listened to; ``0.0`` when none of those tracks is found in
        ``tracks_df`` (previously a ZeroDivisionError).
    """
    # ids of the k best-scored tracks (column 0 holds the track id)
    predicted_tracks = list(predict_df.sort_values(by='prediction', ascending=False).head(k)[0])

    is_predicted = tracks_df['musicbrainz-track-id'].isin(predicted_tracks)
    predicted_artists = set(tracks_df.loc[is_predicted, 'musicbrainz-artist-id'])

    # Nothing to score against: avoid dividing by zero.
    if not predicted_artists:
        return 0.0

    listened_artists = set(plays_df.loc[plays_df['user-id'] == user_id, 'artist-id'])

    return len(predicted_artists & listened_artists) / len(predicted_artists)

def compute_map(user_id, predict_df,tracks_df=famous_tracks, plays_df=df_1kfamous, k=10):
    """Mean of the artist-level precision@j for j = 1..k.

    ``tracks_df`` and ``plays_df`` are forwarded to ``compute_precision``
    (they used to be ignored in favour of the module-level frames, so a
    caller passing its own frames silently got results for the wrong data).

    Returns
    -------
    float
        ``np.mean`` of the k precision values.
    """
    precisions = [
        compute_precision(user_id, predict_df, tracks_df=tracks_df, plays_df=plays_df, k=cutoff)
        for cutoff in range(1, k + 1)
    ]

    return np.mean(precisions)

def compute_average_precision_map(df,algo,k=10):
    """Precision@k and mean-precision statistics over every user of ``df``.

    For each distinct ``'user-id'`` the model's estimate is computed for every
    distinct ``'track-id'`` of ``df``; the resulting table is scored with
    ``compute_precision`` and ``compute_map``.

    Returns
    -------
    tuple
        ``((mean_precision, std_precision), (mean_map, std_map))``.
    """
    track_ids = df['track-id'].unique()
    per_user_precision, per_user_map = [], []

    for uid in tqdm(df['user-id'].unique()):
        # one row per track: column 0 is the track id, 'prediction' the estimate
        scores = pd.DataFrame({
            0: track_ids,
            'prediction': [algo.predict(uid=uid, iid=track_id).est for track_id in track_ids],
        })

        per_user_precision.append(compute_precision(uid, scores, k=k))
        per_user_map.append(compute_map(uid, scores, k=k))

    precision_stats = (np.mean(per_user_precision), np.std(per_user_precision))
    map_stats = (np.mean(per_user_map), np.std(per_user_map))
    return precision_stats, map_stats

# Grid search

In [None]:
from surprise.model_selection import KFold
from surprise.prediction_algorithms.knns import KNNBaseline
from surprise.prediction_algorithms.co_clustering import CoClustering
import pickle 
import seaborn as sns

def launch_grid_search(algos, algo_names, data=None):
    """Cross-validate several Surprise algorithms and collect their metrics.

    Parameters
    ----------
    algos : list of Surprise algorithm instances.
    algo_names : list of labels, paired with ``algos`` by position.
    data : Surprise dataset to split; defaults to the notebook-level ``ndata``
        (the function used to read an undefined global called ``data``).

    Returns
    -------
    dict
        ``{name: {'rmse': [...], 'mae': [...], 'mean_precision@k': float,
        'mean_map@k': float, 'std_precision@k': float, 'std_map@k': float}}``
        with one rmse/mae entry per fold. The rank-based metrics are computed
        once, with the model fitted on the first fold.

    Raises
    ------
    ValueError
        If ``algos`` and ``algo_names`` differ in length (``zip`` would
        otherwise drop the unmatched algorithms without any warning).
    """
    if len(algos) != len(algo_names):
        raise ValueError(
            f'got {len(algos)} algorithms but {len(algo_names)} names; they must be paired one to one'
        )
    if data is None:
        data = ndata

    kf = KFold(n_splits=3)
    results = {}

    for algo, name in tqdm(zip(algos, algo_names),total=len(algos)):

        results[name] = {'rmse':[], 'mae':[], 'mean_precision@k':None, 'mean_map@k': None, 'std_precision@k':None, 'std_map@k':None}

        for trainset, testset in kf.split(data):

            # train and test algorithm.
            algo.fit(trainset)
            predictions = algo.test(testset)

            # Compute RMSE, MAE
            results[name]['rmse'].append(accuracy.rmse(predictions, verbose=False))
            results[name]['mae'].append(accuracy.mae(predictions, verbose=False))

            # The rank-based metrics are expensive: only compute them on the first fold.
            if results[name]['mean_precision@k'] is None:
                (mean_precision, std_precision), (mean_map, std_map) = compute_average_precision_map(df_1kfamous,algo)
                results[name]['mean_precision@k'] = mean_precision
                results[name]['mean_map@k']       = mean_map
                results[name]['std_precision@k']  = std_precision
                results[name]['std_map@k']        = std_map

    return results


# Candidate algorithms; each one is paired by position with a label in `algo_names`.
algos = [SVD(),
         SVD(biased=False),
         NMF(),
         KNNBaseline(sim_options={'user_based':True, 'name':'pearson_baseline'}),
         KNNBaseline(sim_options={'user_based':False, 'name':'pearson_baseline'}),
         CoClustering(n_cltr_u=5, n_cltr_i=20)
        ]

# One label per algorithm above. The CoClustering entry had no label, so it was
# silently dropped when the two lists were zipped together.
algo_names = ['SVD', 'PMF', 'NMF', 'User-based KNN with Baseline', 'Item-based KNN with Baseline', 'Co-Clustering']

# results = launch_grid_search(algos, algo_names)
# f = open('gridresults.pkl', 'wb')   # Pickle file is newly created where foo1.py is
# pickle.dump(results, f)          # dump data to f
# f.close()  

## Reading results

In [None]:
# Load the grid-search results previously saved to gridresults.pkl.
# NOTE(review): unpickling can execute arbitrary code — only load a file you produced yourself.
with open('gridresults.pkl', 'rb') as handle:
    results = pickle.load(handle)

# One column per top-level key of `results`; np.mean is applied to every cell
# (presumably collapsing the per-fold rmse / mae lists to their mean — verify against the pickle content).
results_df = pd.DataFrame(results).applymap(lambda x : np.mean(x))
results_df

In [None]:
# Two side-by-side bar charts: the 'mean_map@k' row (left) and the 'mean_precision@k'
# row (right), each with the matching 'std_*' row as error bars.
fig, axs = plt.subplots(ncols=2, figsize=(10,5))

results_df.loc[['mean_map@k']].T.plot.bar(yerr=results_df.loc['std_map@k'],ax=axs[0])

results_df.loc[['mean_precision@k']].T.plot.bar(yerr=results_df.loc['std_precision@k'], ax=axs[1])

In [None]:
# One bar chart per row of results_df, titled with the row label.
for col in results_df.index:
    plt.figure(figsize=(15,5))
    sns.barplot(data=results_df.loc[col].to_frame().T)
    plt.gca().set(title=col)
    plt.show()

## Testing on sampled users

In [None]:
# sample user
sampled_user = df_1kfamous['user-id'].sample(n=1).iloc[0]

# compute predictions
predicts = [final_algorithm.predict(uid=sampled_user, iid=x).est for x in df_1kfamous['track-id'].unique()]

# create df of tracks
predicts_df = pd.Series(df_1kfamous['track-id'].unique()).to_frame()

# add predictions to previous df
predicts_df['prediction'] = predicts

# get best songs predicted from svd
predicted_best = predicts_df.sort_values(by='prediction', ascending=False).head(5)[0]

# get the best songs by track
single_user = df_1kfamous[df_1kfamous['user-id'] == sampled_user]
target_best = single_user.sort_values(by='plays',ascending=False).head(5)['track-id']
print('Target')
print(famous_tracks[famous_tracks['musicbrainz-track-id'].isin(target_best)][['track-name','artist-name']].to_markdown())
print('Predicted')
print(famous_tracks[famous_tracks['musicbrainz-track-id'].isin(predicted_best)][['track-name','artist-name']].to_markdown())

## Finding best tracks for a given user

In [None]:
# get the best songs by track
single_user = df_1kfamous[df_1kfamous['user-id'] == 'user_000016']
target_best = single_user.sort_values(by='plays',ascending=False).head(5)['track-id']

In [None]:
# estimated rating of every distinct track for user_000016
# (`.est` is the field at position 3 of a Surprise prediction)
track_ids = df_1kfamous['track-id'].unique()
predicts = [final_algorithm.predict(uid='user_000016', iid=track_id).est for track_id in track_ids]

# one row per track: column 0 is the track id, 'prediction' the estimate
predicts_df = pd.DataFrame({0: track_ids, 'prediction': predicts})

# ids of the five tracks with the highest estimate
predicted_best = predicts_df.sort_values(by='prediction', ascending=False).head(5)[0]

**Actual most listened tracks**

In [None]:
# Track / artist names of the ids in target_best, without duplicates.
df_1kfamous[df_1kfamous['track-id'].isin(target_best)][['track-name','artist-name']].drop_duplicates()

**Best tracks retrieved by SVD**

In [None]:
# Track / artist names of the ids in predicted_best, without duplicates.
df_1kfamous[df_1kfamous['track-id'].isin(predicted_best)][['track-name','artist-name']].drop_duplicates()

## Full-Set SVD++

In [None]:
# SVD++ with 100 latent factors, 20 epochs and a regularization term of 0.05.
final_algorithm = SVDpp(n_factors=100, n_epochs=20, reg_all=0.05)
# `data` is not defined anywhere in this notebook; train on every rating of
# `ndata` instead, and rebind `full_set` to the exact trainset the model was
# fitted on so that inner user ids can later be mapped back to raw ids.
full_set = ndata.build_full_trainset()
final_algorithm.fit(full_set)

## User clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

from yellowbrick.cluster import KElbowVisualizer

# Project the model's `pu` matrix (presumably the learned user factors — confirm
# against the Surprise documentation) onto its first two principal components.
pca = PCA(n_components=2)
x2d = pca.fit_transform(final_algorithm.pu)

# Elbow plot for KMeans; k=(2,12) is the range of cluster counts that is scanned.
model = KMeans()
visualizer = KElbowVisualizer(model, k=(2,12))

visualizer.fit(x2d)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

In [None]:
from sklearn.decomposition import PCA
# 2-D PCA projection of the model's `pu` matrix.
pca = PCA(n_components=2)
x2d = pca.fit_transform(final_algorithm.pu)

# Five clusters. The seed is fixed so that cluster labels and sizes are the same on
# every run: later cells single out the users of cluster 0, which is only
# meaningful when the labelling is reproducible.
model = KMeans(n_clusters=5, random_state=42)
model.fit(x2d)

## Visualizing our clustering

In [None]:
# Map every row position of `pu` (used as an inner user id) back to the raw user id.
userids = [full_set.to_raw_uid(uid) for uid in range(final_algorithm.pu.shape[0])]

# One row per user: raw id, cluster label and the two PCA coordinates.
userlabels = pd.DataFrame({'userid':userids,'label':model.labels_, 'x': x2d[:,0], 'y':x2d[:,1]} )

# Scatter plot coloured by cluster label; the raw user id is shown in the hover text.
fig = px.scatter(userlabels,'x','y', color='label', custom_data=['userid'])
fig.update_traces(
    hovertemplate="<br>".join([
        "x: %{x}",
        "y: %{y}",
        "user-id: %{customdata[0]}"
    ])
)

Number of users per cluster :

In [None]:
# Occurrences of each cluster label, i.e. the size of every cluster.
np.bincount(model.labels_)

### Aggregation techniques

In [None]:
def sample_k_users(df, k, num_clusters=5):
    """Sample ``k`` user ids, spreading them as evenly as possible over clusters.

    The k slots are dealt out round-robin over the cluster labels
    ``0 .. num_clusters-1``; each cluster then contributes that many users,
    drawn without replacement from the rows of ``df`` carrying its label.

    Parameters
    ----------
    df : DataFrame with a ``'label'`` and a ``'userid'`` column.
    k : total number of users to draw.
    num_clusters : number of cluster labels to cycle over.

    Returns
    -------
    list
        The sampled user ids, grouped by increasing cluster label.
    """
    # slot j goes to cluster j % num_clusters; count the slots of each cluster
    quota_per_cluster = np.bincount(np.arange(k) % num_clusters)

    chosen = []
    for cluster_label, quota in enumerate(quota_per_cluster):
        members = df[df['label'] == cluster_label]['userid']
        chosen.extend(members.sample(n=quota).values)
    return chosen
    
# Draw five users with the cluster-balanced sampler.
users = sample_k_users(userlabels, 5)

In [None]:
def predictions_from_users(df, users, algo=None):
    """Estimated rating of every track of ``df`` for each of the given users.

    Parameters
    ----------
    df : DataFrame with a ``'track-id'`` column; its distinct values are the
        tracks that get scored.
    users : sequence of raw user ids.
    algo : fitted model exposing ``predict(uid=..., iid=...)`` whose result has
        an ``est`` attribute. Defaults to the notebook-level
        ``final_algorithm``, which is what the function always used before.

    Returns
    -------
    DataFrame
        One row per track id and one column per user, holding the estimates.
    """
    if algo is None:
        algo = final_algorithm

    # get list of all tracks
    tracks = df['track-id'].unique()

    # compute prediction for all users, all tracks (outer list: tracks, inner list: users)
    predicts = [[algo.predict(uid=uid, iid=iid).est for uid in users] for iid in tracks]

    # label the rows with the track ids and the columns with the user ids
    return pd.DataFrame(predicts, index=tracks, columns=users)

In [None]:
def disagreement_variance(predicts_df):
    """Average pairwise disagreement between users, for every track.

    For each row (track) this is the mean of ``|r_u - r_v|`` over all
    *unordered* pairs of distinct columns (users), i.e.
    ``2 / (n * (n - 1)) * sum_{u < v} |r_u - r_v|``.

    The previous implementation summed over ordered pairs while normalizing
    for unordered ones, which doubled the result. It also raised
    ZeroDivisionError for a single user and, depending on the pandas version,
    returned a bare ndarray on which callers could not use ``.to_frame``.

    Parameters
    ----------
    predicts_df : DataFrame with one row per track and one column per user.

    Returns
    -------
    pandas.Series
        Indexed like ``predicts_df``; all zeros when there are fewer than two
        users (no pair can disagree).
    """
    n_users = predicts_df.shape[1]
    if n_users < 2:
        return pd.Series(0.0, index=predicts_df.index)

    ratings = predicts_df.to_numpy(dtype=float)
    total = np.zeros(ratings.shape[0])

    # visit every unordered pair once: column `first` against all later columns
    for first in range(n_users - 1):
        total += np.abs(ratings[:, first + 1:] - ratings[:, [first]]).sum(axis=1)

    n_pairs = n_users * (n_users - 1) / 2
    return pd.Series(total / n_pairs, index=predicts_df.index)

In [None]:
def compute_group_ratings(predicts_df, relevance_coeff = 0.5, max_rating=10):
    """Combine group relevance and group disagreement into one rating per track.

    ``rating = relevance_coeff * relevance + (1 - relevance_coeff) * (1 - variance)``

    Both terms are computed on ratings divided by ``max_rating`` so that they
    live on the same scale. Previously only the relevance was rescaled, so the
    disagreement was measured in raw rating points and ``1 - variance`` could
    become strongly negative and dominate the blend.

    Parameters
    ----------
    predicts_df : DataFrame with one row per track and one column per user.
    relevance_coeff : weight of the relevance term (0 = only minimize
        disagreement, 1 = only maximize average relevance).
    max_rating : upper end of the rating scale, used for rescaling.

    Returns
    -------
    DataFrame
        Indexed like ``predicts_df`` with columns ``'relevance'``,
        ``'variance'`` and ``'rating'``.
    """
    # compute relevance: mean estimate of the group, rescaled by the rating scale
    average_relevance = predicts_df.mean(axis=1).to_frame('relevance') / max_rating
    # compute variance on the rescaled ratings; the result is wrapped in a Series
    # because the helper may hand back either a Series or a bare array
    disagreement = disagreement_variance(predicts_df / max_rating)
    variance = pd.Series(np.asarray(disagreement), index=predicts_df.index).to_frame('variance')
    # join back variance and relevance in a single rating
    group_ratings = average_relevance.join(variance)
    group_ratings['rating'] = (relevance_coeff*group_ratings['relevance']) + (1-relevance_coeff)*(1-group_ratings['variance'])
    return group_ratings

## Effect of number of users on the relevance (when sampling as many from each cluster)

In [None]:
def uniform_sampler(df, k):
    """Draw k user ids at random from all rows of ``df``, ignoring the cluster labels."""
    return list(df['userid'].sample(n=k))


def first_cluster_sampler(df, k):
    """Draw k user ids at random from the rows of ``df`` whose label is 0."""
    cluster_zero = df[df['label'] == 0]
    return list(cluster_zero['userid'].sample(n=k))


# display names, paired by position with the sampling strategies below
sampler_names = ['Uniform per cluster sampler', 'Uniform Sampler', 'Single cluster sampler']
samplers      = [sample_k_users, uniform_sampler, first_cluster_sampler]

In [None]:
# For every sampling strategy and every group size, record the mean group rating
# of the 10 best-rated tracks.
mean_ratings = {}

# distinct integer group sizes, geometrically spaced between 5 and 100
all_ks = np.unique(np.geomspace(start=5, stop=100, num=10, dtype=int))
for sampler, sampler_name in zip(samplers, sampler_names):
    
    mean_ratings[sampler_name] = []
    
    for k in tqdm(all_ks):
        # sample k users with the current strategy
        users = sampler(userlabels,k)

        # compute predictions for the given users
        predicts_df = predictions_from_users(df_reduced, users)

        # compute group ratings
        group_ratings = compute_group_ratings(predicts_df)

        # mean rating of the 10 highest-rated tracks
        mean_ratings[sampler_name].append(group_ratings.sort_values(by='rating', ascending=False).head(10)['rating'].mean())

In [None]:
# grouped bar chart: one group per group size, one bar per sampling strategy
ratings_by_group_size = pd.DataFrame(mean_ratings).set_index(all_ks)
ax = ratings_by_group_size.plot(kind='bar')
ax.set(title='Relevance (in [0,1]) as a function of group size', xlabel='Group Size', ylabel='Relevance')

## Effect of Relevance coefficient (0 => Minimize Disagreement, 1 => Maximize Average relevance)

In [None]:
# group of three users
k = 3
# all taken from the users labelled 0
users = first_cluster_sampler(userlabels,k)

In [None]:
# Show the three most-played tracks of every user in the group.
for user in users:
    display(df_1kfamous[df_1kfamous['user-id'] == user].sort_values(by='plays',ascending=False).head(3)[['track-name', 'artist-name']])

In [None]:
# The per-user predictions do not depend on the relevance coefficient, so they are
# computed once instead of once per loop iteration.
predicts_df = predictions_from_users(df_reduced, users)

for relevance_coeff in [0,0.5,1]:
    # compute group ratings for this trade-off between relevance and agreement
    group_ratings = compute_group_ratings(predicts_df, relevance_coeff=relevance_coeff)

    # the five best-rated tracks (the variable keeps its historical name
    # `top10_ratings` because a later cell reads it)
    top10_ratings = group_ratings.sort_values(by='rating', ascending=False).head(5)

    # show the names of those tracks
    display(famous_tracks.merge(top10_ratings.reset_index(), right_on='index', left_on='musicbrainz-track-id')[['track-name', 'artist-name']])

In [None]:
# Selection left over from the last loop iteration above (relevance_coeff = 1), with the track id as a column.
top10_ratings.reset_index()

## Per user top-k from svd

In [None]:
# Feature rows of the 10 tracks with the highest value in the user_000378 column of ratings_df.
# NOTE(review): `ratings_df` is not defined in any cell visible here — presumably the output of
# predictions_from_users(...) for some user sample; confirm and add the defining cell.
famous_tracks[famous_tracks['musicbrainz-track-id'].isin((list(ratings_df['user_000378'].sort_values(ascending=False).index)[:10]))]

In [None]:
# Position of the highest value in the user_000657 column.
ratings_df['user_000657'].argmax()

In [None]:
# Index label at position 173.
# NOTE(review): hard-coded position, presumably the argmax found in the previous cell — confirm.
ratings_df.index[173]

In [None]:
# Row of ratings_df for one hard-coded index label.
ratings_df.loc['c697b759-2ef6-43bb-a97a-2c56409abade']

In [None]:
# track id -> {column name: value} for every row of the feature table
dic1 = famous_tracks.set_index('musicbrainz-track-id').to_dict(orient='index')

# flat lookups derived from it: track id -> track name, track id -> artist id
dic_trackname, dic_artist_id = {}, {}
for track_id, attributes in dic1.items():
    dic_trackname[track_id] = attributes['track-name']
    dic_artist_id[track_id] = attributes['musicbrainz-artist-id']

In [None]:
# Long format: one row per (former column label of ratings_df, 'variable', 'value'),
# grouped by that former column label (the users, judging by the column lookups above).
gb = ratings_df.T.melt(ignore_index=False).reset_index().groupby('index')
# For each group, the set of the 10 'variable' entries with the highest 'value'.
per_user_pref = gb.apply(lambda x : set(x.sort_values(by='value',ascending=False).head(10)['variable']))
# One column per group, one row per kept entry (sets are unordered, so row order carries no ranking).
user_pref = pd.DataFrame([list(x) for x in list(per_user_pref)]).T
# NOTE(review): `user_sample` is not defined in any visible cell, and this assignment assumes
# its order matches the group order produced by groupby — confirm.
user_pref.columns = user_sample
user_pref

In [None]:
# For every user, the set of artist ids appearing in their play records.
user_artists = df_1kfamous.groupby('user-id').apply(lambda x : set(x['artist-id']))

In [None]:
# user id -> set of listened artist ids, for the users listed in user_sample.
per_user_listens = {a:b for a,b in zip(user_sample,list(map(lambda x: user_artists.loc[x], user_sample)))}

In [None]:
# Replace every track id of user_pref by its artist id ...
user_art = user_pref.applymap(dic_artist_id.get)
# ... then flag, column by column, whether that artist is among the ones the user listened to.
for column in user_art:
    user_art[column] = user_art[column].isin(per_user_listens[column])

In [None]:
# Same table with every track id replaced by its track name.
user_pref.applymap(dic_trackname.get)

## Content-Based Recommendations

In [None]:
# Separate copy of the feature table for the content-based experiments (same file as famous_tracks).
famous_tracks_rec = pd.read_csv('data/features.csv')

In [None]:
# Feature columns handled as categorical (encoded below) and as numeric, respectively.
discrete_data = ['key', 'mode', 'time_signature', 'type']
continious_data = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', ]

In [None]:
# Replace every categorical column by its integer category codes.
for c in discrete_data :
    famous_tracks_rec[c] = famous_tracks_rec[c].astype('category').cat.codes

In [None]:
# Feature matrix indexed by track id: numeric columns first, then the encoded categorical ones.
filtered_attrs = famous_tracks_rec.set_index(['musicbrainz-track-id'])[continious_data + discrete_data]

In [None]:
# Standardize every feature column (zero mean, unit std) before measuring distances.
# On the raw values the columns live on very different scales (for instance
# `duration_ms` versus features such as `danceability`), so the Euclidean distance
# would be driven almost entirely by the largest-scale column.
attr_std = filtered_attrs.std(ddof=0).replace(0, 1)   # constant columns: divide by 1
scaled_attrs = (filtered_attrs - filtered_attrs.mean()) / attr_std

# Pairwise Euclidean distance between the standardized feature rows of all tracks.
dist = DistanceMetric.get_metric('euclidean')
euclidean_genres_distance = dist.pairwise(scaled_attrs)

In [None]:
# Rescale by the largest pairwise distance so that the maximum becomes 1.
euclidean_genres_distance = euclidean_genres_distance / np.max(euclidean_genres_distance)

In [None]:
# Display the distance matrix as a table (rows / columns are track positions).
pd.DataFrame(euclidean_genres_distance)

In [None]:
# Raw 'genres' column of the feature table; it is turned into Python lists in a later cell.
genres_tracks = famous_tracks_rec['genres']
genres_tracks

The handling of the set of genres was inspired by the following work : https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173

In [None]:
def _parse_genres(raw):
    """Turn the textual form of a genre collection into a list.

    ``ast.literal_eval`` only accepts Python literals, so unlike ``eval`` it
    cannot execute code that happens to be stored in the CSV. ``'set()'`` (the
    textual form of an empty set) is not a literal and is handled explicitly.
    Values that are no longer strings are passed through ``list`` unchanged,
    which makes re-running this cell harmless.
    """
    if not isinstance(raw, str):
        return list(raw)
    text = raw.strip()
    if text == 'set()':
        return []
    return list(ast.literal_eval(text))


# Parse every genre entry into a list of genre names.
genres_tracks = genres_tracks.apply(_parse_genres)

In [None]:
# every distinct genre that appears in at least one track's genre list
unique_genres = {genre for track_genres in genres_tracks for genre in track_genres}

unique_genres

In [None]:
def generate_col_for_genre(tracks, unique_genres) :
    """Build a boolean track-by-genre membership table.

    Parameters
    ----------
    tracks : Series whose elements are collections of genres.
    unique_genres : iterable of genre names; each becomes one column.

    Returns
    -------
    DataFrame
        Indexed like ``tracks``; cell (t, g) is True when genre g is in the
        collection of track t.
    """
    membership = {
        genre: tracks.apply(lambda track_genres, genre=genre: genre in track_genres)
        for genre in unique_genres
    }
    return pd.DataFrame(membership)
    

In [None]:
# Boolean membership table: one row per track, one column per distinct genre.
genres_bool = generate_col_for_genre(genres_tracks, unique_genres)

In [None]:
# Same table displayed as 0 / 1 integers (display only, genres_bool itself is unchanged).
genres_bool.astype(int)

In [None]:
# Pairwise Hamming distance between the genre membership rows of all tracks.
dist = DistanceMetric.get_metric('hamming')
hamming_genres_distance = dist.pairwise(genres_bool)

In [None]:
# Rescale in place by the largest pairwise distance so that the maximum becomes 1.
hamming_genres_distance /= np.max(hamming_genres_distance)

In [None]:
# Display the genre distance matrix as a table.
pd.DataFrame(hamming_genres_distance)

In [None]:
# Equal-weight blend of the genre distance and the feature distance.
global_distance = hamming_genres_distance*0.5 + euclidean_genres_distance*0.5

In [None]:
# Wrap the matrix in a DataFrame so that rows and columns are labelled by track position.
global_distance = pd.DataFrame(global_distance)

In [None]:
def find_n_closest_songs(song, n, distances=None) :
    """Labels of the ``n`` tracks closest to ``song``, preceded by the closest entry overall.

    The function used to ignore ``song`` and always rank against column 0 of
    the distance table, so every query returned the neighbours of the first
    track.

    Parameters
    ----------
    song : either a row taken from the track table (its ``.name`` is used as
        the column label of the distance table) or that label itself.
    n : number of neighbours wanted.
    distances : square DataFrame of pairwise distances; defaults to the
        notebook-level ``global_distance``.

    Returns
    -------
    list
        The ``n + 1`` row labels with the smallest distance to ``song``, in
        increasing order of distance. As before, the queried track is part of
        the result (its distance to itself is the smallest one).
    """
    if distances is None:
        distances = global_distance

    # a row taken with .iloc / .loc carries its label in `.name`; a bare label is used as is
    song_label = getattr(song, 'name', song)

    return list(distances.nsmallest(n + 1, song_label).index)

In [None]:
# Query with the first track of the table and show artist, name and genres of the returned tracks.
target_track = famous_tracks_rec.iloc[0]
songs = find_n_closest_songs(target_track, 10)
famous_tracks_rec.loc[songs][['artist-name', 'track-name', 'genres']]

--- 

# Non-conclusive attempts

# Tensorflow recommender

In [None]:
from sklearn.metrics import roc_auc_score
import scipy.sparse as sp
import tensorflow as tf
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np

from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

import datetime
import time 
import math
import os

### Important information

In [None]:
# Number of distinct users, distinct tracks and rating rows.
no_users, no_items, no_ratings = (len(df_1kfamous['user-id'].unique()),
                                  len(df_1kfamous['track-id'].unique()),
                                  len(df_1kfamous.index))

### Rescaling ids

In [None]:
# Keep a copy of the original ids, then add integer category codes for users and
# tracks; these codes are what the Keras models below receive as input.
df_1kfamous['original_user_id'] = df_1kfamous['user-id']
df_1kfamous['original_item_id'] = df_1kfamous['track-id']
df_1kfamous['user_id'] = df_1kfamous['user-id'].astype('category').cat.codes
df_1kfamous['item_id'] = df_1kfamous['track-id'].astype('category').cat.codes

### Train test split

In [None]:
# NOTE: this import rebinds the name `train_test_split`, which the first cell
# imported from surprise.model_selection.
from sklearn.model_selection import train_test_split
# 80 / 20 random split of the rating rows, with a fixed seed.
train_ratings, test_ratings = train_test_split(df_1kfamous,test_size=0.2, random_state=42)

In [None]:
# (distinct users, distinct tracks, rating rows)
no_users, no_items, no_ratings

In [None]:
# Shape of the training split and number of distinct users it contains.
train_ratings.shape, train_ratings['user_id'].nunique()

In [None]:
# Shape of the test split and number of distinct users it contains.
test_ratings.shape, test_ratings['user_id'].nunique()

### Latent factor model

In [None]:
def create_shallow_model(no_factors, no_users, no_items):
    """Build a latent-factor model: score = dot(user embedding, item embedding).

    Parameters
    ----------
    no_factors : size of each embedding vector.
    no_users : number of users; the user embedding table has ``no_users + 1`` rows.
    no_items : number of items; the item embedding table has ``no_items + 1`` rows.

    Returns
    -------
    tf.keras.models.Model
        Model named ``'shallow_model'`` with inputs ``[user_id, item_id]`` and
        the dot product as its single output.
    """
    keras_layers = tf.keras.layers

    # user side: id -> embedding row -> flat vector
    user_id = keras_layers.Input(shape=[1], name='user_id')
    user_embedding = keras_layers.Embedding(no_users+1, no_factors, name='user_matrix')(user_id)
    user_vector = keras_layers.Flatten(name='user_vector')(user_embedding)

    # item side: id -> embedding row -> flat vector
    item_id = keras_layers.Input(shape=[1], name='item_id')
    item_embedding = keras_layers.Embedding(no_items+1, no_factors, name='item_matrix')(item_id)
    item_vector = keras_layers.Flatten(name='item_vector')(item_embedding)

    # predicted score: un-normalized dot product of the two vectors
    score = keras_layers.dot([user_vector, item_vector], axes=1, normalize=False)

    return tf.keras.models.Model(inputs=[user_id, item_id], outputs=[score], name='shallow_model')

In [None]:
def create_deep_model(no_factors, no_users, no_items):
    """Build a feed-forward model on top of concatenated user / item embeddings.

    The two embedding vectors are concatenated, passed through dropout (0.2),
    a 16-unit dense layer without activation, dropout (0.2), an 8-unit ReLU
    layer and a final 1-unit ReLU layer.

    Parameters
    ----------
    no_factors : size of each embedding vector.
    no_users : number of users; the user embedding table has ``no_users + 1`` rows.
    no_items : number of items; the item embedding table has ``no_items + 1`` rows.

    Returns
    -------
    tf.keras.models.Model
        Model named ``'deep_model'`` with inputs ``[user_id, item_id]``.
    """
    keras_layers = tf.keras.layers

    # user side: id -> embedding row -> flat vector
    user_id = keras_layers.Input(shape=[1], name='user_id')
    user_embedding = keras_layers.Embedding(no_users+1, no_factors, name='user_matrix')(user_id)
    user_vector = keras_layers.Flatten(name='user_vector')(user_embedding)

    # item side: id -> embedding row -> flat vector
    item_id = keras_layers.Input(shape=[1], name='item_id')
    item_embedding = keras_layers.Embedding(no_items+1, no_factors, name='item_matrix')(item_id)
    item_vector = keras_layers.Flatten(name='item_vector')(item_embedding)

    # joint representation followed by the dense stack
    hidden = keras_layers.Concatenate()([user_vector, item_vector])
    hidden = keras_layers.Dropout(0.2)(hidden)
    hidden = keras_layers.Dense(16,name='fc3')(hidden)
    hidden = keras_layers.Dropout(0.2,name='d3')(hidden)
    hidden = keras_layers.Dense(8,name='fc4', activation='relu')(hidden)
    rating = keras_layers.Dense(1, activation='relu', name='activation')(hidden)

    return tf.keras.models.Model(inputs=[user_id, item_id], outputs=[rating], name='deep_model')

In [None]:
# Build the deep model with 100-dimensional embeddings and compile it with a mean squared error loss.
# NOTE(review): this rebinds `model`, which earlier held the fitted KMeans instance — the
# clustering cells that read `model.labels_` would fail if re-run after this cell.
no_factors=100
model = create_deep_model(no_factors, no_users, no_items)
model.compile(loss=tf.keras.losses.MeanSquaredError())

model.summary()

In [None]:
# Inputs are the integer user / item codes; the target is the decile label `qcat_plays`.
X_train = [train_ratings.user_id, train_ratings.item_id]
y_train = train_ratings.qcat_plays
# 20 epochs, batches of 256, with 20% of the training rows kept aside for validation.
model.fit(X_train, y_train, validation_split=0.2,epochs=20, batch_size=256, shuffle=True)

In [None]:
# Predict the rating of every held-out (user, item) row.
X_test = [test_ratings.user_id, test_ratings.item_id]
y_test = test_ratings.qcat_plays
y_test_pred = model.predict(X_test, batch_size=2048)

In [None]:
# Every (user code, item code) pair: once flattened, `b` repeats each user code
# no_items times while `a` cycles through the item codes.
a,b = np.meshgrid(range(no_items),range(no_users))
all_preds = model.predict([np.ravel(b), np.ravel(a)], batch_size=2048)
# Back to a (no_users, no_items) matrix of predicted scores.
all_preds = all_preds.reshape((no_users, no_items))

In [None]:
# Hard-coded user code to inspect.
selected_user = 600

# Item codes of the 5 highest predicted scores for that user, printed as artist / track names ...
predicted_best = all_preds[selected_user].argsort()[::-1][:5]
print(df_1kfamous[df_1kfamous['item_id'].isin(predicted_best)][['artist-name','track-name']].drop_duplicates())
# ... followed by the 5 tracks the user actually played most.
df_1kfamous[df_1kfamous['user_id'] == selected_user].sort_values(ascending=False, by='plays')[['artist-name','track-name']].drop_duplicates().head(5)

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE computed as the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error is deprecated and no longer accepted by recent scikit-learn
# releases, whereas this form works with every version.
print('Test RMSE:', np.sqrt(mean_squared_error(y_test.values, y_test_pred)))

### Evaluation