In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy as sc
from sklearn import preprocessing
from sklearn import model_selection
import functools

from recsys.preprocess import *
from recsys.utility import *

RANDOM_STATE = 2342

%matplotlib inline

In [2]:
train = pd.read_csv('data/train_final.csv', delimiter='\t')
playlists = pd.read_csv('data/playlists_final.csv', delimiter='\t')
tracks = pd.read_csv('data/tracks_final.csv', delimiter='\t')
tracks = tracks[tracks['duration'] != -1]
tracks['tags'] = tracks['tags'].apply(lambda x: np.array(eval(x)))
tracks.index = range(len(tracks))

#target_playlists = pd.read_csv('data/target_playlists.csv', delimiter='\t')
#target_tracks = pd.read_csv('data/target_tracks.csv', delimiter = '\t')

In [3]:
train, test, target_playlists, target_tracks = train_test_split(train, test_size=0.3, min_playlist_tracks=7)

In [4]:
tracks_in_playlist = get_playlist_track_list2(train)

In [None]:
tracks_in_playlist

In [None]:
train

In [None]:
len(train)

In [None]:
tracks.head()

In [None]:
tracks.tail()

In [None]:
len(tracks)

In [None]:
playlists.head()

In [None]:
len(playlists)

In [None]:
train.head()

In [None]:
train.head()

In [None]:
len(train)

In [None]:
test.head()

In [None]:
len(test)

In [None]:
target_playlists

In [None]:
len(target_playlists)

In [None]:
target_tracks

In [None]:
len(target_tracks)

## Most popular

In [5]:
# most popular tracks: the ones we want to recommend
most_popular = get_most_popular_tracks(train)

# we don't want to recommend tracks already in the playlists...
tracks_in_playlist = get_playlist_track_list2(train)

In [None]:
tracks_in_playlist

In [None]:
tracks_in_playlist.loc[3271849]['track_ids']

In [None]:
most_popular

In [6]:
tracks_to_suggest = most_popular.index.values

In [None]:
tracks_to_suggest[:10].tolist()

In [None]:
target_playlists

In [None]:
predictions = pd.DataFrame(target_playlists)
predictions.index = target_playlists['playlist_id']
predictions['track_ids'] = [np.array([]) for i in range(len(predictions))]

In [None]:
predictions

In [None]:
"""for pl_id in target_playlists['playlist_id']:
    count = 0
    i = 0
    pred = []
    while count < 5:
        if tracks_to_suggest[i] not in tracks_in_playlist.loc[pl_id]['track_ids']:
            # Predict track i
            # IMPORTANT: should we check if the track to suggest is in target_tracks?
            pred.append(tracks_to_suggest[i])
            count += 1
        i += 1
    predictions[pl_id] = pred
    """
for it,row in target_playlists.iterrows():
    count = 0
    i = 0
    pred = []
    while count < 5:
        if tracks_to_suggest[i] not in tracks_in_playlist.loc[row['playlist_id']]['track_ids']:
            # Predict track i
            # IMPORTANT: should we check if the track to suggest is in target_tracks?
            pred.append(tracks_to_suggest[i])
            count += 1
        i += 1
    predictions.loc[row['playlist_id']] = predictions.loc[row['playlist_id']].set_value('track_ids', np.array(pred))

In [None]:
predictions

In [None]:
evaluate(test, predictions)

## Content based

In [7]:
counter = 0
def reduceCount(prev, l):
    global counter
    for el in l:
        if el not in prev:
            prev[el] = [counter]
        else:
            prev[el] += [counter]
    counter += 1
    return prev

# key: tag_id, value: [track_idx...]
distinct_tags = functools.reduce(reduceCount, tracks['tags'], dict())

In [None]:
len(distinct_tags)

In [None]:
len(tracks)

In [8]:
most_popular_tags = [k for k,v in sorted([(k, len(v)) for k, v in distinct_tags.items()], key=lambda tup: tup[1], reverse=True)]
most_popular_tags

[205245,
 115355,
 70618,
 11056,
 81223,
 189631,
 154891,
 46208,
 11242,
 115752,
 43212,
 11957,
 218701,
 3982,
 117167,
 122769,
 84597,
 54087,
 103394,
 35060,
 226723,
 76913,
 70625,
 3668,
 4425,
 107401,
 6120,
 195173,
 193464,
 105199,
 31015,
 64978,
 3424,
 227682,
 255208,
 186445,
 107398,
 116047,
 170251,
 194264,
 50604,
 115684,
 198998,
 177424,
 195456,
 70251,
 153432,
 89467,
 204710,
 24358,
 144067,
 92799,
 144192,
 97480,
 191177,
 191251,
 31253,
 201327,
 100728,
 50247,
 57528,
 103055,
 237214,
 193395,
 97598,
 109806,
 169455,
 35233,
 72354,
 87197,
 194413,
 239459,
 45739,
 83064,
 254186,
 235879,
 154941,
 236955,
 38403,
 112283,
 249989,
 191642,
 100238,
 42894,
 271235,
 216938,
 153731,
 157349,
 136980,
 175331,
 50764,
 122106,
 60046,
 68687,
 68498,
 3838,
 64267,
 55854,
 29723,
 116155,
 190991,
 61837,
 97621,
 66729,
 81211,
 11191,
 215342,
 80989,
 65509,
 71028,
 190049,
 84911,
 189007,
 190112,
 91383,
 219398,
 33826,
 212982,

In [9]:
from scipy.sparse import *
BEST_TAGS = 30
most_popular_tags_best = most_popular_tags[0:BEST_TAGS]

In [None]:
"""ICM = lil_matrix((BEST_TAGS,len(tracks)))
for tag_it, tag_id in enumerate(most_popular_tags_best):
    l = distinct_tags[tag_id]
    for track_idx in l:
        ICM[tag_it,track_idx] = 1
        
# ICM["index of tag in most_popular_tags_best", "index of track in tracks"]"""

In [10]:
tracks = get_track_tags_binary(tracks, cut_off=BEST_TAGS, relevant_tags=most_popular_tags_best)

In [11]:
bt = [ar.tolist() for ar in tracks['binary_tags'].values]
bt

[[0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [12]:
%%time
from sklearn.neighbors import NearestNeighbors

X = np.array(bt)#np.array([[0, 1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree', n_jobs=-1).fit(X)

CPU times: user 39.1 s, sys: 433 ms, total: 39.5 s
Wall time: 39.6 s


In [None]:
distances, indices = nbrs.kneighbors(np.array([tracks['binary_tags'][0].tolist()]))
indices

In [13]:
a = tracks['binary_tags']
a = a.apply(lambda l: l.tolist())

In [14]:
a = a.tolist()

In [15]:
b = a

In [16]:
%%time
r1,r2 = nbrs.kneighbors(np.array(b))

CPU times: user 13min 16s, sys: 2.32 s, total: 13min 19s
Wall time: 3min 36s


In [None]:
"""S = lil_matrix((len(target_playlists),len(tracks)))
for tag_it, tag_id in enumerate(most_popular_tags_best):
    l = distinct_tags[tag_id]
    for track_idx in l:
        ICM[tag_it,track_idx] = 1
        
# ICM["index of tag in most_popular_tags_best", "index of track in tracks"]"""

In [None]:
14*8493/200

In [None]:
target_playlists.loc[1863751]

In [None]:
predictions = pd.DataFrame(target_playlists)
predictions.index = target_playlists['playlist_id']
predictions['track_ids'] = [np.array([]) for i in range(len(predictions))]

for it,row in target_playlists.iterrows():
    # find most similar tracks
    probable_tracks = {}
    for tr_id in tracks_in_playlist.loc[row['playlist_id']]['track_ids']:
        if len(tracks[tracks["track_id"] == tr_id].index) > 0:
            for i in r2[tracks[tracks["track_id"] == tr_id].index[0]]:
                if i not in probable_tracks:
                    probable_tracks[i] = 1
                else:
                    probable_tracks[i] += 1
            #probable_tracks[] = np.union1d(probable_tracks, r2[tracks[tracks["track_id"] == tr_id].index[0]].tolist())
    for k,v in probable_tracks.items():
        probable_tracks[k] = v/len(tracks_in_playlist.loc[row['playlist_id']]['track_ids'])
    probable_tracks = [k for k,v in sorted([(k, v) for k, v in probable_tracks.items()], key=lambda tup: tup[1], reverse=True)]
    
    i = 0
    k = 0
    pred=[]
    count = 0
    while count < 5:
        if len(probable_tracks) > i and probable_tracks[i] not in tracks_in_playlist.loc[row['playlist_id']]['track_ids']:
            # Predict track i
            # IMPORTANT: should we check if the track to suggest is in target_tracks?
            pred.append(tracks.iloc[probable_tracks[i]]['track_id'])
            i += 1
        else:
            pred.append(tracks_to_suggest[k])
            k += 1
        count += 1
    
    predictions.loc[row['playlist_id']] = predictions.loc[row['playlist_id']].set_value('track_ids', np.array(pred))

In [None]:
predictions.iloc[:3250]

In [18]:
probable_tracks = {}
#probable_tracks = []
for tr_id in tracks_in_playlist.loc[6805224]['track_ids']:
    if len(tracks[tracks["track_id"] == tr_id].index) > 0:
        for i in r2[tracks[tracks["track_id"] == tr_id].index[0]]:
            if i not in probable_tracks:
                probable_tracks[i] = 1
            else:
                probable_tracks[i] += 1
        #probable_tracks[] = np.union1d(probable_tracks, r2[tracks[tracks["track_id"] == tr_id].index[0]].tolist())
for k,v in probable_tracks.items():
    probable_tracks[k] = v/len(tracks_in_playlist.loc[6805224]['track_ids'])
probable_tracks = [k for k,v in sorted([(k, v) for k, v in probable_tracks.items()], key=lambda tup: tup[1], reverse=True)]
i = 0
k = 0
pred=[]
count = 0
while count < 5:
    if len(probable_tracks) > i and probable_tracks[i] not in tracks_in_playlist.loc[row['playlist_id']]['track_ids']:
        # Predict track i
        # IMPORTANT: should we check if the track to suggest is in target_tracks?
        pred.append(tracks.iloc[probable_tracks[i]]['track_id'])
        i += 1
    else:
        pred.append(tracks_to_suggest[k])
        k += 1
    count += 1
pred

[1563309, 1363985, 3705881, 1595978, 3779477]

In [None]:
tracks_to_suggest[0]

In [None]:
tracks[tracks["track_id"] == 1097177]

In [None]:
for tr_id in tracks_in_playlist.loc[6805224]['track_ids']:
    if len(tracks[tracks["track_id"] == tr_id].index) > 0:
        print(tr_id)