In [89]:
import os
import shutil
import sys

import numpy as np
from scipy import sparse
import pandas as pd

In [90]:
DATA_DIR = "/opt/ml/input/data/train/"

In [91]:
raw_data = pd.read_csv(os.path.join(DATA_DIR, "train_ratings.csv"))

In [92]:
raw_data['rating'] = 1

In [93]:
raw_data.columns = ["userId", 'movieId', "timestamp", 'rating']

In [94]:
raw_data.head()

Unnamed: 0,userId,movieId,timestamp,rating
0,11,4643,1230782529,1
1,11,170,1230782534,1
2,11,531,1230782539,1
3,11,616,1230782542,1
4,11,2140,1230782563,1


In [95]:
raw_data['movieId'].nunique()

6807

In [82]:
len(raw_data)

5154471

In [83]:
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()
    return count

In [84]:
def filter_triplets(tp, min_uc=5, min_sc=0):
    # Only keep the triplets for items which were clicked on by at least min_sc users. 
    if min_sc > 0:
        itemcount = get_count(tp, 'movieId')
        tp = tp[tp['movieId'].isin(itemcount.index[itemcount >= min_sc])]
    
    # Only keep the triplets for users who clicked on at least min_uc items
    # After doing this, some of the items will have less than min_uc users, but should only be a small proportion
    if min_uc > 0:
        usercount = get_count(tp, 'userId')
        tp = tp[tp['userId'].isin(usercount.index[usercount['size'] >= min_uc])]
    
    # Update both usercount and itemcount after filtering
    usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'movieId') 
    return tp, usercount, itemcount

In [85]:
raw_data, user_activity, item_popularity = filter_triplets(raw_data, min_uc=5, min_sc=0)

In [86]:
sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
      (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))

After filtering, there are 1189122 watching events from 7177 users and 6807 movies (sparsity: 2.434%)


In [37]:
unique_uid = user_activity.index

np.random.seed(42)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

In [38]:
# create train/validation/test users
n_users = unique_uid.size
n_heldout_users = 1000

tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

In [39]:
print(len(tr_users), len(vd_users), len(te_users))

5177 1000 1000


In [40]:
train_plays = raw_data.loc[raw_data['userId'].isin(tr_users)]


In [41]:
unique_sid = pd.unique(train_plays['movieId'])


In [42]:
show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))


In [43]:
pro_dir = "./"

if not os.path.exists(pro_dir):
    os.makedirs(pro_dir)

with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)

In [44]:
def split_train_test_proportion(data, test_prop=0.2):
    data_grouped_by_user = data.groupby('userId')
    tr_list, te_list = list(), list()

    np.random.seed(42)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= 5:
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 10 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)
    
    return data_tr, data_te

In [45]:
vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]
vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]

In [46]:
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)


0 users sampled
10 users sampled
20 users sampled
30 users sampled
40 users sampled
50 users sampled
60 users sampled
70 users sampled
80 users sampled
90 users sampled
100 users sampled
110 users sampled
120 users sampled
130 users sampled
140 users sampled
150 users sampled
160 users sampled
170 users sampled
180 users sampled
190 users sampled
200 users sampled
210 users sampled
220 users sampled


In [47]:
test_plays = raw_data.loc[raw_data['userId'].isin(te_users)]
test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]

In [50]:
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

0 users sampled
10 users sampled
20 users sampled
30 users sampled
40 users sampled
50 users sampled
60 users sampled
70 users sampled
80 users sampled
90 users sampled
100 users sampled
110 users sampled
120 users sampled
130 users sampled
140 users sampled
150 users sampled
160 users sampled
170 users sampled
180 users sampled
190 users sampled
200 users sampled
210 users sampled
220 users sampled
230 users sampled


In [54]:
train_plays

Unnamed: 0,userId,movieId,timestamp,rating
0,11,4643,1230782529,1
1,11,170,1230782534,1
2,11,531,1230782539,1
3,11,616,1230782542,1
4,11,2140,1230782563,1
...,...,...,...,...
272458,7173,10,1228976448,1
272459,7173,2947,1228976451,1
272460,7173,3448,1228976479,1
272461,7173,1704,1228976482,1


In [52]:
def numerize(tp):
    uid = list(map(lambda x: profile2id[x], tp['userId']))
    sid = list(map(lambda x: show2id[x], tp['movieId']))
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

In [99]:
raw_data = pd.read_csv(os.path.join(DATA_DIR, "train_ratings.csv"))
raw_data.columns = ["userId", 'movieId', "timestamp"]

infe_plays = raw_data.loc[raw_data['movieId'].isin(unique_sid)]
infe_sid = list(map(lambda x: show2id[x], infe_plays['movieId']))
infe_data = pd.DataFrame(data={'uid': infe_plays['userId'], 'sid': infe_sid}, columns = ['uid', 'sid'])
infe_data.to_csv(os.path.join(pro_dir, 'inference.csv'), index=False)

In [53]:
train_data = numerize(train_plays)
train_data

Unnamed: 0,uid,sid
0,4942,0
1,4942,1
2,4942,2
3,4942,3
4,4942,4
...,...,...
195728,2615,1238
195729,2615,2324
195730,2615,1507
195731,2615,598


In [125]:
train_data = numerize(train_plays)
train_data.to_csv(os.path.join(pro_dir, 'train.csv'), index=False)

In [126]:
vad_data_tr = numerize(vad_plays_tr)
vad_data_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False)

In [127]:
vad_data_te = numerize(vad_plays_te)
vad_data_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False)

In [128]:
test_data_tr = numerize(test_plays_tr)
test_data_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False)

In [129]:
test_data_te = numerize(test_plays_te)
test_data_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)

In [130]:
def numerize(tp):
    uid = list(map(lambda x: profile2id[x], tp['userId']))
    sid = list(map(lambda x: show2id[x], tp['movieId']))
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

train_data = numerize(train_plays)
train_data.to_csv(os.path.join(pro_dir, 'train.csv'), index=False)

vad_data_tr = numerize(vad_plays_tr)
vad_data_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False)

vad_data_te = numerize(vad_plays_te)
vad_data_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False)

test_data_tr = numerize(test_plays_tr)
test_data_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False)

test_data_te = numerize(test_plays_te)
test_data_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)