# Million Song Dataset Taste Profile

The [Taste profile dataset](http://labrosa.ee.columbia.edu/millionsong/tasteprofile) contains real user play counts from undisclosed users, with the following statistics:

* 1,019,318 unique users
* 384,546 unique MSD songs
* 48,373,586 user - song - play count triplets

This is the script that subsamples the full dataset and splits it into non-overlapping training, validation, and test sets. This subset is used in the paper: ["modeling user exposure in recommendation"](http://arxiv.org/abs/1510.07025).

This notebook was based on [this one](https://github.com/dawenl/expo-mf/blob/master/src/processTasteProfile.ipynb).

In [84]:
import json
import os
import sqlite3

import numpy as np
import pandas as pd

In [85]:
# Directory that holds the Taste Profile data. Set the TPS_DIR environment variable to point
# somewhere else without editing the notebook; the fallback is the path used originally.
TPS_DIR = os.environ.get(
    'TPS_DIR',
    '/home/cfragada/source/Postdoc/KL_screening/datasets/Recommendation/tasteprofile',
)

# The dataset can be obtained here:
# http://labrosa.ee.columbia.edu/millionsong/sites/default/files/challenge/train_triplets.txt.zip
TP_file = os.path.join(TPS_DIR, 'train_triplets.txt')

# track_metadata.db contains all the metadata, which is not required to subsample the data, but only used when
# referring to the actual information about particular pieces (e.g. artist, song name, etc.)
# Available here: http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/track_metadata.db
md_dbfile = os.path.join(TPS_DIR, 'track_metadata.db')

In [86]:
# Load the raw triplets: a tab-separated file without a header row, so the three
# column names are supplied here.
tp = pd.read_csv(
    TP_file,
    sep='\t',
    header=None,
    names=['uid', 'sid', 'count'],
)

## Filter out "inactive" users and songs
* Only keep the users who listened to at least 20 songs and the songs that are listened to by at least 50 users

### Get the user-playcount

In [87]:
# Activity thresholds for the filtering step:
#   MIN_USER_COUNT -- a user is kept only if they have listened to at least this many songs
#   MIN_SONG_COUNT -- a song is kept only if it was listened to by at least this many users

MIN_USER_COUNT = 20
MIN_SONG_COUNT = 50

In [88]:
def get_count(tp, id):
    """Count how many triplets (rows) each distinct value of column `id` has.

    Parameters
    ----------
    tp : pandas.DataFrame
        Triplets; must contain the column named by `id`.
    id : str
        Name of the column to group by (e.g. 'uid' or 'sid').

    Returns
    -------
    pandas.Series
        Number of rows per group, indexed by the distinct values of `id`
        (sorted by key).
    """
    # Group with the key as the index. With as_index=False, pandas >= 1.1 makes
    # size() return a DataFrame (columns [id, 'size']) instead of a Series, which
    # breaks callers that do `count.index[count >= k]`.
    return tp.groupby(id).size()

def filter_triplets(tp, min_uc=MIN_USER_COUNT, min_sc=MIN_SONG_COUNT):
    """Drop triplets that belong to rarely-played songs or to low-activity users.

    The filter is applied once, songs first and then users; it is not iterated
    until both conditions hold. Because users are removed after the song pass,
    a few of the surviving songs can end up with fewer than `min_sc` listeners.

    Parameters
    ----------
    tp : pandas.DataFrame
        Triplets with 'uid' and 'sid' columns.
    min_uc : int
        Minimum number of triplets a user needs in order to be kept.
    min_sc : int
        Minimum number of triplets a song needs in order to be kept.

    Returns
    -------
    tuple
        (filtered triplets, per-user counts, per-song counts), where both
        counts are recomputed on the filtered triplets with get_count().
    """
    # Pass 1: keep songs with at least min_sc triplets.
    per_song = get_count(tp, 'sid')
    popular_songs = per_song.index[per_song >= min_sc]
    tp = tp[tp['sid'].isin(popular_songs)]

    # Pass 2: among what is left, keep users with at least min_uc triplets.
    per_user = get_count(tp, 'uid')
    active_users = per_user.index[per_user >= min_uc]
    tp = tp[tp['uid'].isin(active_users)]

    # Counts are recomputed so they describe the returned triplets.
    return tp, get_count(tp, 'uid'), get_count(tp, 'sid')

In [89]:
# Apply the activity filter with the default thresholds. Note that this overwrites `tp`
# with the filtered triplets, so the raw table is no longer available afterwards.
tp, usercount, songcount = filter_triplets(tp)

In [90]:
# Fraction of all (user, song) pairs for which a triplet exists.
sparsity_level = float(tp.shape[0]) / (usercount.shape[0] * songcount.shape[0])
# Parenthesised single-argument print: valid on both Python 2 and Python 3
# (the bare print statement is a SyntaxError on Python 3).
print("After filtering, there are %d triplets from %d users and %d songs (sparsity level %.3f%%)" % (
    tp.shape[0], usercount.shape[0], songcount.shape[0], sparsity_level * 100))

After filtering, there are 39730795 triplets from 629112 users and 98485 songs (sparsity level 0.064%)


### CSV export

In [123]:
if True:  # set to False to skip exporting the full filtered dataset
    # Save original song and user identifiers, one per line. The 0-based line number
    # of an identifier is the integer id it gets in the exported CSV.
    with open(os.path.join(TPS_DIR, 'filtered_uid.txt'), 'w') as f:
        f.writelines('%s\n' % uid for uid in usercount.index)
    with open(os.path.join(TPS_DIR, 'filtered_sid.txt'), 'w') as f:
        f.writelines('%s\n' % sid for sid in songcount.index)

    # Replace song and user identifiers by numbers
    song2id = dict((sid, i) for (i, sid) in enumerate(songcount.index))
    user2id = dict((uid, i) for (i, uid) in enumerate(usercount.index))

    def numerize(tp):
        """Return a copy of `tp` whose 'uid'/'sid' columns hold integer ids.

        Identifiers are translated through the `user2id` / `song2id` dicts.
        The input frame is left untouched. Raises KeyError if an identifier
        has no entry in the corresponding dict.
        """
        # Series.map works on Python 2 and 3 alike; the builtin map() is lazy on
        # Python 3 and cannot be assigned to a DataFrame column.
        uid = tp['uid'].map(user2id)
        sid = tp['sid'].map(song2id)
        # Series.map yields NaN for unknown keys; fail loudly like a dict lookup would.
        if uid.isnull().any() or sid.isnull().any():
            raise KeyError('tp contains user or song identifiers missing from user2id/song2id')
        return tp.assign(uid=uid.astype('int64'), sid=sid.astype('int64'))

    # numerize() returns a new frame, so `tp` keeps its original identifiers.
    numerize(tp).to_csv(os.path.join(TPS_DIR, 'TasteProfile_train_triplets_filtered.csv'), index=False)

## Subsample ~N songs and ~M users:

* First sample ~1.1*M users based on listening count, only keep the data with those 1.1*M users
* Then sample ~1.1*N songs from the pre-selected user listening history based on listening count
* Only keep the users who listened to at least 20 songs and the songs that are listened to by at least 50 users

In [91]:
# Two options for choosing the subset of users and songs:
#  1) keep_max = True: keep the users and songs with the HIGHEST listening count
#  2) keep_max = False: sample at random, where the probability of picking a user or song
#     is proportional to its listening count

keep_max = False

In [92]:
# Available subsampling rates, keyed by mode_str (named nbSongs x nbUsers).
# Each preset maps keep_max -> (n_users, n_songs). The random mode (keep_max = False)
# draws more users and songs than the nominal target size.
SUBSAMPLING_PRESETS = {
    # ~ 20k x 200k
    '20k200k': {True: (200000, 20000), False: (250000, 25000)},
    # ~ 10k x 100k
    '10k100k': {True: (100000, 10000), False: (110000, 11000)},
    # ~ 5k x 50k
    '5k50k': {True: (50000, 5000), False: (80000, 6000)},
}

# Choose the subsampling rate here (must be a key of SUBSAMPLING_PRESETS).
mode_str = '5k50k'
n_users, n_songs = SUBSAMPLING_PRESETS[mode_str][bool(keep_max)]

# Seed numpy's global RNG so that the random subsampling is reproducible.
np.random.seed(98765)

In [93]:
# Select the n_users users that survive the subsampling.
if not keep_max:
    # Random draw without replacement; a user's chance of being picked is
    # proportional to their entry in usercount.
    p_users = usercount / usercount.sum()
    idx = np.random.choice(len(usercount.index), size=n_users, replace=False, p=p_users.tolist())
    unique_uid = usercount.index[idx]
else:
    # Deterministic choice: the n_users users with the largest counts.
    sorted_usercount = usercount.sort_values(ascending=False)
    unique_uid = sorted_usercount.index[:n_users]

In [94]:
# Keep only the triplets whose user was selected.
tp = tp.loc[tp['uid'].isin(unique_uid)]

In [95]:
# Select the n_songs songs that survive the subsampling.
# NOTE(review): `songcount` is not recomputed in this cell, so when the notebook is run
# top to bottom it still holds the counts from before the user subsampling -- confirm
# that sampling songs from the pre-subsampling counts is intended.
if not keep_max:
    # Random draw without replacement; a song's chance of being picked is
    # proportional to its entry in songcount.
    p_songs = songcount / songcount.sum()
    idx = np.random.choice(len(songcount.index), size=n_songs, replace=False, p=p_songs.tolist())
    unique_sid = songcount.index[idx]
else:
    # Deterministic choice: the n_songs songs with the largest counts.
    sorted_songcount = songcount.sort_values(ascending=False)
    unique_sid = sorted_songcount.index[:n_songs]

In [96]:
# Keep only the triplets whose song was selected.
tp = tp.loc[tp['sid'].isin(unique_sid)]

In [97]:
# Filter out users and songs below a minimum listening count.
# The thresholds come from the notebook-level constants rather than repeating the
# literals 20 / 50, so changing the constants affects both filtering passes.
tp, usercount, songcount = filter_triplets(tp, min_uc=MIN_USER_COUNT, min_sc=MIN_SONG_COUNT)
# From here on, the selected users/songs are exactly those left after filtering.
unique_uid = usercount.index
unique_sid = songcount.index

In [98]:
# Fraction of all (user, song) pairs for which a triplet exists.
sparsity_level = float(tp.shape[0]) / (usercount.shape[0] * songcount.shape[0])
# Parenthesised single-argument print: valid on both Python 2 and Python 3
# (the bare print statement is a SyntaxError on Python 3).
print("After subsampling and filtering, there are %d triplets from %d users and %d songs (sparsity level %.3f%%)" % (
    tp.shape[0], usercount.shape[0], songcount.shape[0], sparsity_level * 100))

After subsampling and filtering, there are 2131575 triplets from 44062 users and 4835 songs (sparsity level 1.001%)


### CSV export

In [99]:
# Save original song and user identifiers, one per line. The 0-based line number
# of an identifier is the integer id it gets in the exported CSV.
with open(os.path.join(TPS_DIR, 'reduced'+mode_str+'_uid.txt'), 'w') as f:
    f.writelines('%s\n' % uid for uid in unique_uid)

with open(os.path.join(TPS_DIR, 'reduced'+mode_str+'_sid.txt'), 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)

# Replace song and user identifiers by numbers
song2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
user2id = dict((uid, i) for (i, uid) in enumerate(unique_uid))

def numerize(tp):
    """Return a copy of `tp` whose 'uid'/'sid' columns hold integer ids.

    Identifiers are translated through the `user2id` / `song2id` dicts.
    The input frame is left untouched, so this cell can be re-run safely.
    Raises KeyError if an identifier has no entry in the corresponding dict.
    """
    # Series.map works on Python 2 and 3 alike; the builtin map() is lazy on
    # Python 3 and cannot be assigned to a DataFrame column.
    uid = tp['uid'].map(user2id)
    sid = tp['sid'].map(song2id)
    # Series.map yields NaN for unknown keys; fail loudly like a dict lookup would.
    if uid.isnull().any() or sid.isnull().any():
        raise KeyError('tp contains user or song identifiers missing from user2id/song2id')
    return tp.assign(uid=uid.astype('int64'), sid=sid.astype('int64'))

# The output name records the subsampling preset and, for keep_max, a '_max' suffix.
csv_suffix = '_max' if keep_max else ''
csv_path = os.path.join(TPS_DIR, 'TasteProfile_train_triplets_reduced'+mode_str+csv_suffix+'.csv')
numerize(tp).to_csv(csv_path, index=False)