In [1]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import uuid
import scipy.sparse as sp
import numpy as np

In [150]:
# Load the raw listening log: tab-separated, no header row, so the six column names are supplied here.
# on_bad_lines='skip' drops rows the parser cannot read without raising.
# NOTE(review): quote characters inside track names may be interpreted as field quoting by the
# default parser settings and merge or drop rows -- TODO confirm the loaded row count against
# the dataset's documented size.
df = pd.read_csv('../Data/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv', sep='\t',
                 header=None, names=['user', 'timestamp', 'musicbrainz_artist_id', 'art_name', 'track_id', 'track_name'], on_bad_lines='skip')

In [151]:
df.shape

(19098853, 6)

In [152]:
df.tail()

Unnamed: 0,user,timestamp,musicbrainz_artist_id,art_name,track_id,track_name
19098848,user_001000,2008-01-27T22:02:35Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,a490cabc-1e5c-4807-86c7-740c31a50009,Please Be Patient With Me
19098849,user_001000,2008-01-27T21:56:52Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,3e92e447-9e1f-440d-bc00-6734469880c5,Shake It Off
19098850,user_001000,2008-01-27T21:52:36Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,93d044e6-1bbb-46a6-ac8e-283382a89e6f,Side With The Seeds
19098851,user_001000,2008-01-27T21:49:12Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,5ac4386f-6146-4389-a762-4b43f362d2c8,Sky Blue Sky
19098852,user_001000,2008-01-27T21:43:14Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,3acc99bc-a349-420f-ad28-7095eb3533c9,Impossible Germany


In [153]:
# tqdm is already imported in the first cell; this re-import is redundant but harmless.
from tqdm import tqdm
# Registers the progress_* methods (e.g. progress_apply) on pandas objects.
tqdm.pandas()

In [154]:
def is_valid_uuid(val):
    """Tell whether ``val`` can be parsed as a UUID.

    The value is converted with ``str`` and handed to ``uuid.UUID``; a
    ``ValueError`` from the parser means "not a UUID".

    Returns
    -------
    bool
        True when parsing succeeds, False when it raises ``ValueError``.
    """
    try:
        uuid.UUID(str(val))
    except ValueError:
        return False
    else:
        return True

In [155]:
# Drop plays that carry no MusicBrainz artist id. This mutates df in place.
df.dropna(subset=['musicbrainz_artist_id'], inplace=True)

In [156]:
# Flag every row whose artist id parses as a UUID (progress_apply needs tqdm.pandas() to have run).
df['valid_uuid'] = df['musicbrainz_artist_id'].progress_apply(is_valid_uuid)

100%|██████████| 18498005/18498005 [00:31<00:00, 578808.28it/s]


In [157]:
# Keep only the rows whose artist id passed the UUID check.
valid_rows = df['valid_uuid'].eq(True)
df = df.loc[valid_rows]

In [158]:
# Group the play log by user: user id -> list of artist ids, one entry per play.
user_by_artist = defaultdict(list)
user_artist_pairs = zip(df['user'], df['musicbrainz_artist_id'])
for user, artist in tqdm(user_artist_pairs, total=len(df)):
    user_by_artist[user].append(artist)

  0%|          | 0/18498005 [00:00<?, ?it/s]

100%|██████████| 18498005/18498005 [00:06<00:00, 2885807.93it/s]


In [159]:
len(user_by_artist)

992

In [160]:
# Reduce each user's play list to its distinct artists (going through a set, so order is arbitrary).
for k in user_by_artist:
    user_by_artist[k] = list(set(user_by_artist[k]))

In [161]:
# Users whose artist list has at least 20 entries (the bound is inclusive despite the "gt" in the name).
users_gt_20 = {
    name: artists
    for name, artists in user_by_artist.items()
    if len(artists) >= 20
}

In [162]:
len(users_gt_20)

979

In [163]:
# Restrict the play log to the users retained in users_gt_20.
kept_user_rows = df['user'].isin(users_gt_20)
df = df.loc[kept_user_rows]

In [164]:
df.shape

(18497299, 7)

In [177]:
# Relabel users with consecutive integers starting at 1, numbered in order of first appearance.
user_label_dict = {}
user_label = []
for user in tqdm(df.user):
    if user not in user_label_dict:
        user_label_dict[user] = len(user_label_dict) + 1
    user_label.append(user_label_dict[user])
# Number of distinct users that received a label.
count = len(user_label_dict)

100%|██████████| 18497299/18497299 [00:06<00:00, 2687475.97it/s]


In [179]:
# Overwrite the user column with the integer labels (one label per row, in row order).
df['user'] = user_label

In [180]:
# Relabel artist ids with consecutive integers starting at 1, numbered in order of first appearance.
mb_dict = {}
mb_id_categorical = []
for r in tqdm(df.musicbrainz_artist_id, total=df.shape[0]):
    if r not in mb_dict:
        mb_dict[r] = len(mb_dict) + 1
    mb_id_categorical.append(mb_dict[r])
# Number of distinct artist ids that received a label.
count = len(mb_dict)

100%|██████████| 18497299/18497299 [00:07<00:00, 2586734.54it/s]


In [181]:
# Store the integer artist label in its own column; the MusicBrainz id column is left as is.
df['mb_id_categorical'] = mb_id_categorical

In [182]:
df.head()

Unnamed: 0,user,timestamp,musicbrainz_artist_id,art_name,track_id,track_name,valid_uuid,mb_id_categorical
0,1,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007,True,1
1,1,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15),True,2
2,1,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15),True,2
3,1,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15),True,2
4,1,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15),True,2


In [183]:
# Collect every play timestamp under its (user label, artist label) pair.
user_mb_timestamp = defaultdict(list)

play_rows = zip(df['user'], df['mb_id_categorical'], df['timestamp'])
for user, mb_id, timestamp in tqdm(play_rows, total=len(df)):
    user_mb_timestamp[user, mb_id].append(timestamp)

100%|██████████| 18497299/18497299 [00:13<00:00, 1383056.91it/s]


In [186]:
# Collapse the play log to one row per (user, artist) pair, held in four parallel lists:
# user label, artist label, timestamp of the most recent play, and number of plays.
user, mb_id, timestamp, plays_qtt = [], [], [], []
for (pair_user, pair_artist), stamps in tqdm(user_mb_timestamp.items()):
    user.append(pair_user)
    mb_id.append(pair_artist)
    # Timestamps are compared as strings; the fixed-width ISO-8601 format seen in the data
    # makes that ordering chronological, so max() is the most recent play.
    # An empty list raises ValueError here instead of silently truncating the output lists.
    timestamp.append(max(stamps))
    plays_qtt.append(len(stamps))

  0%|          | 0/819388 [00:00<?, ?it/s]

100%|██████████| 819388/819388 [00:00<00:00, 850321.53it/s]


In [209]:
# Zip the four parallel lists into (user, artist, latest timestamp, play count) tuples.
ratings = list(tqdm(zip(user, mb_id, timestamp, plays_qtt), total=len(user)))

  0%|          | 0/819388 [00:00<?, ?it/s]

100%|██████████| 819388/819388 [00:00<00:00, 1770701.88it/s]


In [211]:
ratings[-1]

(979, 10698, '2008-01-29T06:21:14Z', 23)

In [212]:
def make_test_ratings(ratings):
    """Pick one held-out (test) item per user for leave-one-out evaluation.

    For every user the item with the most recent timestamp is held out.
    When several items share that timestamp, the one seen first wins.

    Parameters
    ----------
    ratings : iterable of sequences
        Each entry starts with ``(user, item, timestamp, ...)``. ``user`` and
        ``item`` are 1-based ids and are shifted to 0-based indices here.
        Timestamps only need to be mutually comparable.

    Returns
    -------
    rating_list : list of [user, item]
        One 0-based ``[user, held_out_item]`` pair per user, in first-seen
        user order.
    test_ratings : dict
        Maps 0-based user -> ``(held_out_item, timestamp)``.
    """
    test_ratings = {}
    for rating in tqdm(ratings):
        user, item, timestamp = int(rating[0]) - 1, int(rating[1]) - 1, rating[2]
        held_out = test_ratings.get(user)
        # Replace only when strictly newer, so ties keep the first-seen item.
        if held_out is None or held_out[1] < timestamp:
            test_ratings[user] = (item, timestamp)

    rating_list = [[user, item] for user, (item, _) in test_ratings.items()]
    return rating_list, test_ratings

In [213]:
# Yields the [user, item] test pairs and the per-user (item, timestamp) lookup dict.
test_ratings, test_ratings_dict = make_test_ratings(ratings)

100%|██████████| 819388/819388 [00:00<00:00, 1489400.07it/s]


In [249]:
def make_train_ratings(ratings, test_ratings_dict):
    """Build the training interactions, leaving out each user's held-out test item.

    Parameters
    ----------
    ratings : sequence of (user, item, timestamp, plays)
        ``user`` and ``item`` are 1-based integer ids. The sequence is
        traversed more than once, so it must not be a one-shot iterator.
    test_ratings_dict : dict
        Maps 0-based user -> ``(held_out_item, ...)``; only element 0 is read.
        A user missing from the dict raises ``KeyError``.

    Returns
    -------
    matrix : scipy.sparse.dok_matrix (float32)
        Shape ``(max user id, max item id)``, with 1.0 at every training
        ``(user, item)`` cell.
    train_ratings : list of [user, item, plays]
        The same training interactions as 0-based rows. The held-out test
        interaction appears in neither output.
    """
    # Ids are 1-based, so the largest id equals the number of rows / columns needed.
    num_users = max((rating[0] for rating in ratings), default=0)
    num_items = max((rating[1] for rating in ratings), default=0)

    matrix = sp.dok_matrix((num_users, num_items), dtype=np.float32)
    train_ratings = []
    for rating in tqdm(ratings):
        user, item, plays_qtt = rating[0] - 1, rating[1] - 1, rating[3]
        if test_ratings_dict[user][0] == item:
            # Held-out interaction: writing it to the training rows would leak the test label.
            continue
        matrix[user, item] = 1.0
        train_ratings.append([user, item, plays_qtt])

    return matrix, train_ratings

In [251]:
# Yields the sparse user x item training matrix and the rows that are later written to the train file.
matrix_ratings, train_ratings = make_train_ratings(ratings, test_ratings_dict)

  0%|          | 0/819388 [00:00<?, ?it/s]

100%|██████████| 819388/819388 [00:00<00:00, 2420899.62it/s]
100%|██████████| 819388/819388 [00:08<00:00, 94054.44it/s] 


In [241]:
matrix_ratings

<Dictionary Of Keys sparse matrix of dtype 'float32'
	with 818409 stored elements and shape (979, 107293)>

In [242]:
import random

In [254]:
def make_test_negatives(train_matrix, test_ratings_dict, num_negatives=99, seed=None):
    """Sample negative items for each user's held-out test item.

    A negative is an item the user has no entry for in ``train_matrix`` and
    that is not the user's held-out test item. The test item is excluded
    explicitly because it is not stored in the training matrix.

    Parameters
    ----------
    train_matrix : scipy sparse matrix
        User x item matrix; row ``u`` is read via ``train_matrix[u].nonzero()``.
    test_ratings_dict : dict
        Maps user index -> ``(held_out_item, ...)``; only element 0 is read.
    num_negatives : int, default 99
        How many negatives to draw per user.
    seed : optional
        When given, sampling uses a private ``random.Random(seed)`` so the
        result is reproducible; when None the module-level ``random`` state
        is used.

    Returns
    -------
    list
        One ``[(user, held_out_item), negatives]`` entry per matrix row.

    Raises
    ------
    ValueError
        If a user has fewer than ``num_negatives`` eligible items.
    KeyError
        If a row index is missing from ``test_ratings_dict``.
    """
    rng = random if seed is None else random.Random(seed)
    num_users, num_items = train_matrix.shape

    test_negatives = []
    for user in tqdm(range(num_users), total=num_users):
        test_item = test_ratings_dict[user][0]
        excluded = set(train_matrix[user].nonzero()[1].tolist())
        # The positive is absent from the training row, so it must be excluded by hand.
        excluded.add(test_item)
        candidates = [item for item in range(num_items) if item not in excluded]
        negatives = rng.sample(candidates, num_negatives)

        test_negatives.append([(user, test_item), negatives])
    return test_negatives

In [255]:
# One entry per user: [(user, held-out item), list of sampled negative items].
test_negatives = make_test_negatives(matrix_ratings, test_ratings_dict)

  0%|          | 0/979 [00:00<?, ?it/s]

100%|██████████| 979/979 [00:32<00:00, 30.38it/s]


In [262]:
# Write the training rows, one tab-separated row per line.
with open('./output/last-fm.train.rating', 'w') as f:
    f.writelines('\t'.join(map(str, row)) + '\n' for row in train_ratings)

In [260]:
# Write the held-out test pairs, one tab-separated row per line.
with open('./output/last-fm.test.rating', 'w') as f:
    f.writelines('\t'.join(map(str, row)) + '\n' for row in test_ratings)

In [261]:
# Each line: the (user, item) tuple as text, a tab, then the tab-separated negative item ids.
with open('./output/last-fm.test.negative', 'w') as f:
    for entry in test_negatives:
        negatives_text = '\t'.join(map(str, entry[1]))
        f.write(str(entry[0]) + '\t' + negatives_text + '\n')

In [248]:
# Sparsity = share of user x item cells that hold no stored interaction.
n_rows, n_cols = matrix_ratings.shape
total_elements = n_rows * n_cols
non_zero_elements = matrix_ratings.count_nonzero()
sparsity = (total_elements - non_zero_elements) / total_elements
sparsity

np.float64(0.9922085853761763)

In [238]:
matrix_ratings

<Dictionary Of Keys sparse matrix of dtype 'float32'
	with 818409 stored elements and shape (979, 107293)>