### Load Libraries

In [1]:
import numpy as np
import pandas as pd
import random
import csv

import timeit
from tqdm import tqdm

In [2]:
from model import mpd_ndb, model_popularity
from model import evaluate_model

### Initialize NDB object to build Base Model - Popular Tracks

In [3]:
ndb = mpd_ndb.spotify_mpd()

In [4]:
ndb.get_data(True)

Tracks summary: There are 1124914 tracks from 172112 artists
Track Dictionaries summary: There are 1124914 unique tracks in the dictionaries per tid and spid
   pid  tid  rate
0    0    0     1
1    0    2     1
2    0    3     1
3    0    4     1
4    0    5     1
Train summary: There are 171069 playlist, 170068 tracks and 12864850 rates in the train set
   pid  tid  rate
0    0    1     1
1    0    7     1
2    0    8     1
3    0   11     1
4    0   14     1
Test summary: There are 171069 playlist, 167581 tracks and 5513508 rates in the test set
(171069, 170117)
Train Matrix summary: 171069x170117 sparse matrix of dtype int8 with 12864850 stored elements in Compressed Sparse Row format
(171069, 170117)
Test Matrix summary: 171069x170117 sparse matrix of dtype int8 with 5513508 stored elements in Compressed Sparse Row format


### Initialize Popularity Model

In [5]:
# Define input variables

name_base = 'popularity'
train = ndb.train
#X = random.randint(0, train.pid.nunique()-1)   # Select Random Playlist ID
topk = 50

In [6]:
# Most rated - Popular tracks
model_base = model_popularity.base_model(name_base, train)
model_base.mostRated()
model_base.model.head()

tid
361     10781
1755     9921
1579     9471
586      9361
7278     9276
Name: pid, dtype: int64

### Predict Topk tracks using Popularity Model

In [7]:
# PID - Playlist ID - keep
X = 157638

In [8]:
# Get titles from random Playlist - Train

[(x, ndb.get_title(x)) for x in train[train['pid']==X].tid.to_list()]

[(197, 'Break Apart - Bonobo'),
 (282, 'Goshen - Beirut'),
 (302, 'Pale Blue Eyes - The Velvet Underground'),
 (2938, 'Lua - Bright Eyes'),
 (3162, 'The Less I Know The Better - Tame Impala'),
 (4239, 'Heartbeats - José González'),
 (4424, '3 Rounds and a Sound - Blind Pilot'),
 (4722, 'Chamber Of Reflection - Mac Demarco'),
 (4938, 'Warm On A Cold Night - HONNE'),
 (5787, 'Cranes in the Sky - Solange'),
 (6170, 'River - Leon Bridges'),
 (6873, 'Between The Bars - Elliott Smith'),
 (6876, 'Timshel - Mumford & Sons'),
 (6885, 'Home - Bruno Major'),
 (6918,
  'Wicked Game - Live at Killkenny Arts Festival, Ireland/2011 - James Vincent McMorrow'),
 (7839,
  'This Night Has Opened My Eyes - 2011 Remastered Version - The Smiths'),
 (8081, 'On The Level - Mac Demarco'),
 (8539, 'Space Song - Beach House'),
 (10066,
  'Please, Please, Please Let Me Get What I Want - 2011 Remastered Version - The Smiths'),
 (11666, 'Song For Zula - Phosphorescent'),
 (12425, 'Eyes - Rogue Wave'),
 (12440, 'If 

In [9]:
recommends_base = model_base.predict(X, topk)

In [10]:
# Get titles from Recommended tracks using Base Model - Popularity - Most Rated

recommends_base_rank = [[ndb.get_title(x), x] for x in recommends_base]

In [11]:
df_popularity_recs = pd.DataFrame.from_records(recommends_base_rank, 
                               columns=['title', 'tid'],
                               index=pd.RangeIndex(start=1, stop=len(recommends_base)+1, name='rank'))

In [12]:
df_popularity_recs

Unnamed: 0_level_0,title,tid
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Riptide - Vance Joy,361
2,Ophelia - The Lumineers,1755
3,Don't Stop Believin' - Journey,1579
4,Bohemian Rhapsody - Remastered 2011 - Queen,586
5,Midnight City - M83,7278
6,Ho Hey - The Lumineers,55
7,"September - Earth, Wind & Fire",1078
8,Little Talks - Of Monsters and Men,1889
9,Mr. Brightside - The Killers,4055
10,Brown Eyed Girl - Van Morrison,621


### Evaluate Popularity Model

In [13]:
# Run this cell only once, to create score file, it deletes everything with any new execution

with open('./results/model_scores.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['model_type', 'similarity_option', 'k', 'topk', 'recall', 'map'])

In [14]:
# Calculate Predictions for Base Model at Topk

list_of_topks = [5, 10, 25, 50, 100]

trainPlaylists = ndb.train.groupby('pid')['tid'].apply(list).reset_index().sort_index()
train_ids = trainPlaylists.tid.values

testPlaylists = ndb.test.groupby('pid')['tid'].apply(list).reset_index().sort_index()
test_ids = testPlaylists.tid.values

In [15]:
base_predictions = trainPlaylists.tid.apply(lambda x: model_base.predict(x, np.max(topk))).reset_index()
base_predictions_ids = base_predictions.tid.values

In [16]:
# Calulate metrics: Precision and Recall at k (5, 10, 25, 50, 100)  for each playlist

start = timeit.default_timer()

evaluation = evaluate_model.Metrics(train_ids[:1000], test_ids[:1000], base_predictions_ids[:1000])
base_results_recall = evaluation.recall_at_n(list_of_topks, True)
base_results_map = evaluation.map_at_n(list_of_topks, True)

stop = timeit.default_timer()
print(f'Calculate Popularity model results for subset_mpdNDB dataset for {len(train_ids)} Playlists', stop - start)

recall@5=0.025
recall@10=0.023
recall@25=0.020
recall@50=0.027
recall@100=0.027
map@5=0.015
map@10=0.010
map@25=0.005
map@50=0.005
map@100=0.005
Calculate Popularity model results for subset_mpdNDB dataset for 171069 Playlists 9.457629299999994


In [17]:
for i, topk in enumerate(list_of_topks):
    with open('./results/model_scores.csv', 'a') as f:
        writer = csv.writer(f)
        writer.writerow(
            [model_base.model_name, '-', 0,str(topk) , 
             str("%.3f" %(base_results_recall[i])), str("%.3f" %(base_results_map[i]))])  