In [1]:
import time
import pickle
import pandas as pd
import numpy as np
import scipy.sparse

In [2]:
SUBMISSION = False
SUBMISSION_FILENAME = 'submission.csv'
TEST_FILENAME = 'test.csv'

In [3]:
target_playlists = pd.read_csv('../datasets/target_playlists.csv', sep='\t')
target_tracks = pd.read_csv('../datasets/target_tracks.csv', sep='\t')
tracks_final = pd.read_csv('../datasets/tracks_final.csv', sep='\t')
playlists_final = pd.read_csv('../datasets/playlists_final.csv', sep='\t')
train_final = pd.read_csv('../datasets/train_final.csv', sep='\t')

print('Successfully read data')

Successfully read data


In [4]:
# Playlist and tracks that belong to them
target_playlists_and_tracks = pd.merge(target_playlists, train_final, on='playlist_id')
print('target_playlists_and_tracks {}'.format(target_playlists_and_tracks.shape))
print(target_playlists_and_tracks.head(10))

target_playlists_and_tracks (362661, 2)
   playlist_id  track_id
0     10024884   2879006
1     10024884   1532328
2     10024884   3027673
3     10024884   3236144
4     10024884   1563134
5     10024884    435345
6     10024884    353291
7     10024884    247331
8     10024884    161455
9     10024884   3338954


In [5]:
def split_training_data(train_final, target_playlists_and_tracks, random_state):
    validation_set = target_playlists_and_tracks.groupby(['playlist_id'])\
                        .apply(lambda x: x.sample(n=3, random_state=random_state))\
                        .reset_index(drop=True)
    df_concat = pd.concat([train_final, validation_set])
    training_set = df_concat.drop_duplicates(keep=False)
    return training_set, validation_set

# Split dataset - from all target playlists remove randomly 3 tracks
training_set, validation_set = split_training_data(train_final, target_playlists_and_tracks, random_state=0)
test_target_tracks = validation_set['track_id'].drop_duplicates(keep='first').to_frame()

print('training_set: {} validation_set: {}'.format(training_set.shape, validation_set.shape))
print(training_set.head(5))
print('training_set: {} validation_set: {}'.format(training_set.shape, validation_set.shape))
print(validation_set.head(5))
print('test_target_tracks: {}'.format(test_target_tracks.shape))
print(test_target_tracks.head(5))

training_set: (1010522, 2) validation_set: (30000, 2)
   playlist_id  track_id
0      3271849   2801526
1      5616275    727878
2     11267488   2805283
3     10103900   1515105
4      3836898   2945623
training_set: (1010522, 2) validation_set: (30000, 2)
   playlist_id  track_id
0         7614   2141817
1         7614   3833025
2         7614   3711434
3         7692   2053595
4         7692   2172361
test_target_tracks: (21958, 1)
   track_id
0   2141817
1   3833025
2   3711434
3   2053595
4   2172361


In [6]:
class TopPopularRecommender:
    def __init__(self, training_playlists_and_tracks, target_tracks, n):
        self.training_playlists_and_tracks = training_playlists_and_tracks

        # Count popularity of items in training set
        training_playlists_and_tracks['count'] = training_playlists_and_tracks \
                                                    .groupby(['track_id']) \
                                                    .transform('count')
        print('training_playlists_and_tracks[\'count\'] {}'.format(training_playlists_and_tracks['count'].shape))
        print(training_playlists_and_tracks.head(3))
        
        tracks_with_popularity = training_playlists_and_tracks\
                                    .groupby(['track_id', 'count'])\
                                    .head(1)\
                                    .sort_values('count', ascending=False)
        print('tracks_with_popularity {}'.format(tracks_with_popularity.shape))
        print(tracks_with_popularity.head(3))
        
        # Select target items
        target_tracks_with_popularity = pd.merge(target_tracks, tracks_with_popularity, on='track_id').groupby('track_id').head(1)
        self.target_top_popular_tracks = target_tracks_with_popularity.sort_values('count', ascending=False)        
        print('target_top_popular_tracks {}'.format(self.target_top_popular_tracks.shape))
        print(self.target_top_popular_tracks.head(3))

        self.n = n

    def recommend(self, target_playlists):
        def make_recommendation(playlist):
            # Get all tracks that are on the playlist
            tracks_on_playlist = self.training_playlists_and_tracks.loc[
                self.training_playlists_and_tracks['playlist_id'] == playlist['playlist_id']]

            # Take n + number of playlists from top popular tracks.
            # Then remove all duplications so that tracks that are already on playlist are not selected
            temp = self.target_top_popular_tracks.head(tracks_on_playlist.shape[0] + self.n)
            df_concat = pd.concat([temp, tracks_on_playlist])
            top_n = df_concat.drop_duplicates(subset=['track_id', 'count'], keep=False)

            playlist['recommendation'] = top_n['track_id'].head(self.n).reset_index(drop=True)
            return playlist

        recommended_items = target_playlists.apply(lambda playlist: make_recommendation(playlist), axis=1)
        return recommended_items

In [7]:
print('Building model...')
begin = time.time()
if SUBMISSION:
    recommender = TopPopularRecommender(train_final, target_tracks, n=5)
else:
    recommender = TopPopularRecommender(training_set, test_target_tracks, n=3)
print('Took {0:.{digits}f}s'.format(time.time() - begin, digits=5))

Building model...
training_playlists_and_tracks['count'] (1010522,)
   playlist_id  track_id  count
0      3271849   2801526    140
1      5616275    727878     60
2     11267488   2805283     10
tracks_with_popularity (99992, 3)
      playlist_id  track_id  count
483       3098690   1563309    458
133      10772937   1363985    422
1458      5858765   3705881    418
target_top_popular_tracks (21951, 3)
      track_id  playlist_id  count
263    1563309      3098690    458
9179   1363985     10772937    422
3816   3705881      5858765    418
Took 0.64623s


In [8]:
print('Recommending...')
begin = time.time()
recommended_items = recommender.recommend(target_playlists)
print('Took {0:.{digits}f}s'.format(time.time() - begin, digits=5))

print('recommended_items {}'.format(recommended_items.shape))
print(recommended_items.head(3))

Recommending...
Took 58.73378s
recommended_items (10000, 2)
   playlist_id                                     recommendation
0     10024884  0    1563309
1    1363985
2    3705881
Name: t...
1     10624787  0    1563309
1    1363985
2    3705881
Name: t...
2      4891851  0    1563309
1    1363985
2    3705881
Name: t...


In [9]:
def is_relevant(recommendation_item, validation_set):
    validation_item = validation_set.loc[validation_set['playlist_id'] == recommendation_item['playlist_id']]
    recommendation_item['recommendation'] = recommendation_item['recommendation']\
                                                .isin(list(validation_item['track_id']))
    return recommendation_item


def precision(recommended_items_relevance):
    precision_scores = recommended_items_relevance.sum(axis=1) / recommended_items_relevance.shape[1]
    return precision_scores.mean()


def mAP(recommended_items_relevance):
    p_at_k = recommended_items_relevance.cumsum(axis=1) / (1 + np.arange(recommended_items_relevance.shape[1]))
    recommended_items_mAP = p_at_k.sum(axis=1) / recommended_items_relevance.shape[1]
    return recommended_items_mAP.mean()


def evaluate_recommendations(recommended_items, validation_set):
    items_relevance = recommended_items.apply(lambda recommendation_item: is_relevant(recommendation_item, validation_set), axis=1)
    recommended_items_relevance = pd.DataFrame(list(items_relevance['recommendation']), index=items_relevance['recommendation'].index)
    precision_score = precision(recommended_items_relevance)
    mAP_score = mAP(recommended_items_relevance)
    return precision_score, mAP_score

def evaluate(recommended_items, validation_set):
    print('Evaluating...')
    begin = time.time()
    precision_score, mAP_score = evaluate_recommendations(recommended_items, validation_set)
    print('Precision: {0:.{digits}f}, mAP: {1:.{digits}f}, took {2:.{digits}f}s'
          .format(precision_score, mAP_score, time.time() - begin, digits=5))

In [10]:
if not SUBMISSION:
    evaluate(recommended_items, validation_set)

Evaluating...
Precision: 0.00120, mAP: 0.00148, took 11.52334s
