In [2]:
import os
import sys

import bottleneck as bn
import numpy as np
import pandas as pd
import torch
from scipy import sparse

# Data Preparation - Preprocessing

**Note that only the ML-20M dataset is used here. For other datasets, please refer to the [paper](https://arxiv.org/pdf/1906.04281.pdf).**

For a fair performance comparison, the preprocessing is kept identical to the one used in the paper. The same preprocessing steps were used in [Variational Autoencoders for Collaborative Filtering](https://arxiv.org/abs/1802.05814).

The code below is taken from [this repository](https://github.com/dawenl/vae_cf), which contains the implementation of the methods proposed in Variational Autoencoders for Collaborative Filtering (Liang et al.).

Please refer to the paper and the original repository for commentary on Preprocessing steps.


First, download the dataset from http://files.grouplens.org/datasets/movielens/ml-20m.zip and extract it into the ./ml-20m directory.

In [3]:
# Directory that is expected to contain the extracted dataset files.
DATA_DIR = './ml-20m/'
# header=0: the first line of ratings.csv supplies the column names.
raw_data = pd.read_csv(os.path.join(DATA_DIR, 'ratings.csv'), header=0)

In [4]:
#  binarize the data (only keep ratings >= 4)
# The filter is written as `> 3.5`: every row rated 3.5 or lower is dropped.
# NOTE(review): this matches ">= 4" only if ratings come in half-star steps — confirm.
raw_data = raw_data[raw_data['rating'] > 3.5]

In [5]:
# Preview the first rows that survived the rating filter.
raw_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826
10,1,293,4.0,1112484703


In [6]:
def get_count(tp, id):
    """Count how many rows of ``tp`` share each distinct value of column ``id``.

    The result is always a Series whose index holds the distinct values of the
    column and whose values are the row counts. Grouping is done without
    ``as_index=False`` on purpose: with that flag, ``size()`` returns a
    DataFrame on pandas >= 1.1, which breaks callers that use
    ``count.index[count >= k]`` or read ids from ``count.index``.

    :param tp: DataFrame that contains the column named by ``id``.
    :param id: name of the column to group by.
    :return: Series of row counts indexed by the distinct values of ``tp[id]``.
    """
    return tp[[id]].groupby(id).size()

def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop rare items and inactive users, then recount both.

    The item filter runs first and the user filter second, so after the user
    filter some items may again fall below ``min_sc`` interactions.

    :param tp: DataFrame with 'userId' and 'movieId' columns.
    :param min_uc: keep users with at least this many rows; 0 disables the filter.
    :param min_sc: keep items with at least this many rows; 0 disables the filter.
    :return: ``(filtered_tp, usercount, itemcount)``, with both counts taken on the filtered data.
    """
    if min_sc > 0:
        per_item = get_count(tp, 'movieId')
        popular_items = per_item.index[per_item >= min_sc]
        tp = tp[tp['movieId'].isin(popular_items)]

    if min_uc > 0:
        per_user = get_count(tp, 'userId')
        active_users = per_user.index[per_user >= min_uc]
        tp = tp[tp['userId'].isin(active_users)]

    # Counts are recomputed because the filters above changed the data.
    usercount = get_count(tp, 'userId')
    itemcount = get_count(tp, 'movieId')
    return tp, usercount, itemcount

In [7]:
# Uses the defaults of filter_triplets (min_uc=5, min_sc=0): only the user filter is active.
raw_data, user_activity, item_popularity = filter_triplets(raw_data)

In [8]:
# Number of interactions divided by (number of users * number of items).
# NOTE(review): this ratio is the share of filled cells (a density), although it is labelled "sparsity".
sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
      (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))

After filtering, there are 9990682 watching events from 136677 users and 20720 movies (sparsity: 0.353%)


In [9]:
# NOTE(review): assumes get_count returned a Series indexed by userId, so this index holds the user ids — confirm.
unique_uid = user_activity.index

# Fixed seed so the shuffled user order, and the split derived from it, is repeatable.
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

In [10]:
# create train/validation/test users
n_users = unique_uid.size
n_heldout_users = 10000

tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

In [11]:
# Interactions that belong to training users only.
train_plays = raw_data.loc[raw_data['userId'].isin(tr_users)]

# The item vocabulary is built from training interactions only.
unique_sid = pd.unique(train_plays['movieId'])
# Raw movieId -> dense column index, numbered in the order produced by pd.unique.
show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
# Raw userId -> dense row index, following the shuffled order; covers every user, not just training users.
profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))

In [12]:
# Output directory for the processed splits; created if it does not exist yet.
pro_dir = os.path.join(DATA_DIR, 'pro_sg')
if not os.path.exists(pro_dir):
    os.makedirs(pro_dir)

# Persist the item vocabulary, one raw movieId per line, in vocabulary order.
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)

In [13]:
def split_train_test_proportion(data, test_prop=0.2):
    """Split every user's interactions into a fold-in part and a held-out part.

    For each user with at least 5 rows, ``int(test_prop * n_rows)`` rows are
    drawn without replacement and moved to the held-out frame; the remaining
    rows stay in the fold-in frame. Users with fewer than 5 rows are kept
    entirely in the fold-in frame.

    The global NumPy RNG is re-seeded with 98765 on every call, so the split
    is reproducible and every call starts from the same random stream.

    :param data: DataFrame with a 'userId' column.
    :param test_prop: fraction of each eligible user's rows to hold out.
    :return: ``(data_tr, data_te)``. When no rows fall into one of the two
        parts (no eligible user, or empty input) that part is an empty frame
        with the columns of ``data`` instead of ``pd.concat`` raising.
    """
    data_grouped_by_user = data.groupby('userId')
    tr_list, te_list = list(), list()

    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= 5:
            # Boolean mask over the user's rows: True marks a held-out row.
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # slice of the input, which keeps its columns and dtypes.
    data_tr = pd.concat(tr_list) if tr_list else data.iloc[0:0]
    data_te = pd.concat(te_list) if te_list else data.iloc[0:0]

    return data_tr, data_te
# Interactions of validation users, restricted to items present in the training vocabulary.
vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]
vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]

In [14]:
# Fold-in / held-out split of the validation users.
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)
# Same treatment for test users: keep only items from the training vocabulary, then split.
test_plays = raw_data.loc[raw_data['userId'].isin(te_users)]
test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled
0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [15]:
def numerize(tp):
    """Translate raw user and movie ids into the dense indices of the notebook.

    Looks every 'userId' up in the module-level ``profile2id`` and every
    'movieId' in ``show2id``; an id missing from either mapping raises KeyError.

    :param tp: DataFrame with 'userId' and 'movieId' columns.
    :return: new DataFrame with columns 'uid' and 'sid', with a fresh default index.
    """
    uid = list(map(profile2id.__getitem__, tp['userId']))
    sid = list(map(show2id.__getitem__, tp['movieId']))
    return pd.DataFrame({'uid': uid, 'sid': sid}, columns=['uid', 'sid'])


# Re-index every split with the dense uid/sid ids and write it as CSV under pro_dir.
# NOTE(review): train_data, vad_data_tr and vad_data_te are rebound further down to the
# sparse matrices returned by the loader functions, so these names change type between cells.
train_data = numerize(train_plays)
train_data.to_csv(os.path.join(pro_dir, 'train.csv'), index=False)

vad_data_tr = numerize(vad_plays_tr)
vad_data_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False)

vad_data_te = numerize(vad_plays_te)
vad_data_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False)

test_data_tr = numerize(test_plays_tr)
test_data_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False)

test_data_te = numerize(test_plays_te)
test_data_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)

In [16]:
# Reload the item vocabulary from disk; entries come back as stripped strings.
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'r') as f:
    unique_sid = [line.strip() for line in f]

# Number of item columns used by the sparse-matrix loaders.
n_items = len(unique_sid)

In [17]:
def load_train_data(csv_file):
    """Read a uid/sid CSV and return it as a user-by-item matrix of ones.

    The row count is ``max(uid) + 1``; the column count comes from the
    module-level ``n_items``.

    :param csv_file: path to a CSV with 'uid' and 'sid' columns.
    :return: scipy CSR matrix of dtype float64 with an entry for every (uid, sid) row.
    """
    interactions = pd.read_csv(csv_file)
    user_idx = interactions['uid']
    item_idx = interactions['sid']
    matrix_shape = (user_idx.max() + 1, n_items)

    ones = np.ones_like(user_idx)
    return sparse.csr_matrix((ones, (user_idx, item_idx)),
                             dtype='float64', shape=matrix_shape)

In [18]:
# Rebinds train_data from the uid/sid DataFrame to the sparse user-by-item matrix.
train_data = load_train_data(os.path.join(pro_dir, 'train.csv'))

In [19]:
def load_tr_te_data(csv_file_tr, csv_file_te):
    """Read a fold-in / held-out CSV pair as two aligned user-by-item matrices.

    User ids are shifted so that the smallest uid found in either file becomes
    row 0. Both matrices share the shape
    ``(max_uid - min_uid + 1, n_items)``, where ``n_items`` is the module-level
    item count, so row ``r`` refers to the same user in both.

    :param csv_file_tr: path to the fold-in CSV with 'uid' and 'sid' columns.
    :param csv_file_te: path to the held-out CSV with 'uid' and 'sid' columns.
    :return: ``(data_tr, data_te)`` as scipy CSR matrices of dtype float64.
    """
    fold_in = pd.read_csv(csv_file_tr)
    held_out = pd.read_csv(csv_file_te)

    first_uid = min(fold_in['uid'].min(), held_out['uid'].min())
    last_uid = max(fold_in['uid'].max(), held_out['uid'].max())
    matrix_shape = (last_uid - first_uid + 1, n_items)

    def to_matrix(frame):
        # One entry of value 1 per (shifted uid, sid) row of the frame.
        row_idx = frame['uid'] - first_uid
        col_idx = frame['sid']
        return sparse.csr_matrix((np.ones_like(row_idx), (row_idx, col_idx)),
                                 dtype='float64', shape=matrix_shape)

    return to_matrix(fold_in), to_matrix(held_out)

In [20]:
# Rebinds vad_data_tr / vad_data_te from the uid/sid DataFrames to aligned sparse matrices.
vad_data_tr, vad_data_te = load_tr_te_data(os.path.join(pro_dir, 'validation_tr.csv'),
                                           os.path.join(pro_dir, 'validation_te.csv'))

In [21]:

# NOTE(review): none of the names defined in this cell are referenced by the other cells
# visible in this notebook; the RaCT calls pass literal values instead — confirm whether they are still needed.
# Number of rows (users) in the training matrix.
N = train_data.shape[0]
idxlist = range(N)

# training batch size
batch_size = 500
# Number of batches needed to cover all N rows, rounded up.
batches_per_epoch = int(np.ceil(float(N) / batch_size))

# Number of rows in the validation fold-in matrix.
N_vad = vad_data_tr.shape[0]
idxlist_vad = range(N_vad)

# validation batch size (since the entire validation set might not fit into GPU memory)
batch_size_vad = 2000

# the total number of gradient updates for annealing
total_anneal_steps = 200000
# largest annealing parameter
anneal_cap = 0.2

# Evaluation Metric(s)

In the tests, only the NDCG@100 metric is used. The `NDCG_binary_at_k_batch` function is taken from the [RaCT-CF](https://github.com/samlobel/RaCT_CF/tree/master/utils) repository and modified to work with PyTorch.

In [22]:
# Sum of the row counts of the training matrix and of both validation matrices.
# NOTE(review): vad_data_tr and vad_data_te have the same shape (same users), so validation
# rows are counted twice and test users are not counted at all — confirm this is the intended check.
train_data.shape[0] + vad_data_te.shape[0] + vad_data_tr.shape[0]

136677

In [23]:
def spy_sparse2torch_sparse(data):
    """Convert a scipy sparse matrix into a sparse COO torch tensor.

    Indices and values are both taken from the same COO copy so they cannot
    get out of step, and the index tensor is built from one stacked NumPy
    array rather than a Python list of arrays (which is slow and triggers a
    torch warning). ``torch.sparse_coo_tensor`` replaces the legacy
    ``torch.sparse.FloatTensor`` constructor.

    :param data: a scipy sparse csr matrix
    :return: a sparse torch tensor of dtype float32 with the same shape as ``data``
    """
    coo_data = data.tocoo()
    # torch requires int64 indices; scipy usually stores them as int32.
    indices = torch.from_numpy(
        np.vstack((coo_data.row, coo_data.col)).astype(np.int64))
    values = torch.as_tensor(coo_data.data, dtype=torch.float32)
    samples, features = coo_data.shape
    return torch.sparse_coo_tensor(indices, values, (samples, features))

# Tests using our implementation

Our RaCT implementation is tested below. Note that the implementation follows the pseudo-code in the paper.

Pre-training of the actor -> pre-training of the critic -> alternating training of both

In [40]:
from RaCT import RaCT
# Initialize the RaCT algorithm object
# The first argument is the number of item columns of the training matrix.
# NOTE(review): the meaning of the remaining positional values (600, 200, 600, 3, 0.01) is
# defined by RaCT.__init__, which is not shown here — confirm, and consider keyword arguments.
ract = RaCT(train_data.shape[1],600,200,600,3,0.01)

## Pre-training of Actor

This part seems to more or less work.

We expect to see: 

1. The ELBO decreases after the 50th (annealing) epoch. (Until epoch 50, beta increases, so the ELBO may increase too.)
2. The NLLL should decrease too. A decrease in NLLL indicates that we can better predict and construct the interaction distribution.
3. NDCG should go as high as 0.4, which is the result reported in the paper.

The results below show that expectation 1 is satisfied, whereas 2 and 3 are problematic: the NLLL oscillates between 0.01 and 0.02, and although NDCG increases, which shows that the model is learning, it does not go above 0.25.

In [41]:
# Actor pre-training on the training matrix; the validation fold-in matrix is the last argument.
# NOTE(review): 500, 0.2, 150 and 50 are unlabeled positional values whose meaning is defined by
# RaCT.pretrain_actor (not shown here) — presumably batch size, anneal cap and epoch counts; confirm.
ract.pretrain_actor(train_data,500,0.2,150,50,vad_data_tr)

NLLL :  tensor(0.0201)
Elbo :  tensor(0.0201)




NDCG mean : tensor(0.0073)
NLLL :  tensor(0.0168)
Elbo :  tensor(0.0183)
NLLL :  tensor(0.0168)
Elbo :  tensor(0.2355)
NLLL :  tensor(0.0204)
Elbo :  tensor(0.3640)
NLLL :  tensor(0.0162)
Elbo :  tensor(2.4178)
NLLL :  tensor(0.0206)
Elbo :  tensor(0.1866)
NLLL :  tensor(0.0195)
Elbo :  tensor(1.0939)
NLLL :  tensor(0.0168)
Elbo :  tensor(0.4005)
NLLL :  tensor(0.0204)
Elbo :  tensor(0.6019)
NLLL :  tensor(0.0209)
Elbo :  tensor(1.0105)
NLLL :  tensor(0.0183)
Elbo :  tensor(0.2020)
NLLL :  tensor(0.0196)
Elbo :  tensor(0.5585)
NLLL :  tensor(0.0170)
Elbo :  tensor(0.3699)
NLLL :  tensor(0.0177)
Elbo :  tensor(0.4677)
NLLL :  tensor(0.0163)
Elbo :  tensor(0.4078)
NLLL :  tensor(0.0186)
Elbo :  tensor(0.6348)
NLLL :  tensor(0.0172)
Elbo :  tensor(0.3782)
NLLL :  tensor(0.0177)
Elbo :  tensor(0.5322)
NLLL :  tensor(0.0184)
Elbo :  tensor(0.5639)
NLLL :  tensor(0.0192)
Elbo :  tensor(0.4658)
NLLL :  tensor(0.0183)
Elbo :  tensor(0.3878)
NDCG mean : tensor(0.0891)
NLLL :  tensor(0.0166)
Elb

## Pre-training of Critic

This part works.

We expect to see:

1. A decrease in MSE Loss.

This is satisfied.

In [42]:
# Critic pre-training on the training matrix.
# NOTE(review): 500 and 50 are unlabeled positional values defined by RaCT.pretrain_critic
# (not shown here) — presumably batch size and number of epochs; confirm.
ract.pretrain_critic(train_data,500,50)



NDCG mean : tensor(0.2195)
MSE :  tensor(0.0954)
NDCG mean : tensor(0.2183)
MSE :  tensor(0.0925)
NDCG mean : tensor(0.2264)
MSE :  tensor(0.0834)
NDCG mean : tensor(0.2349)
MSE :  tensor(0.0794)
NDCG mean : tensor(0.2233)
MSE :  tensor(0.0814)
NDCG mean : tensor(0.2274)
MSE :  tensor(0.0746)
NDCG mean : tensor(0.2186)
MSE :  tensor(0.0775)
NDCG mean : tensor(0.2240)
MSE :  tensor(0.0728)
NDCG mean : tensor(0.2219)
MSE :  tensor(0.0709)
NDCG mean : tensor(0.2328)
MSE :  tensor(0.0655)
NDCG mean : tensor(0.2327)
MSE :  tensor(0.0660)
NDCG mean : tensor(0.2361)
MSE :  tensor(0.0626)
NDCG mean : tensor(0.2284)
MSE :  tensor(0.0585)
NDCG mean : tensor(0.2305)
MSE :  tensor(0.0574)
NDCG mean : tensor(0.2199)
MSE :  tensor(0.0591)
NDCG mean : tensor(0.2242)
MSE :  tensor(0.0503)
NDCG mean : tensor(0.2208)
MSE :  tensor(0.0507)
NDCG mean : tensor(0.2348)
MSE :  tensor(0.0460)
NDCG mean : tensor(0.2381)
MSE :  tensor(0.0456)
NDCG mean : tensor(0.2250)
MSE :  tensor(0.0449)
NDCG mean : tensor(0

## Alternative Training

This part is problematic.

We expect to see:

1. A decrease in NLLL, towards 0.
2. A decrease in the negative critic score. (Note that the reported values are the negative critic score, i.e. actor_loss.)
3. An increase in NDCG values.

The training is not stable. Sometimes it collapses and we see the opposite of the expectations above, as the results below show.

However, the performance can also be stable and increasing. This is believed to depend on the random seed.

In [43]:
# Joint actor/critic phase, run on the object pre-trained in the two cells above.
# NOTE(review): 500, 0.2 and 50 are unlabeled positional values defined by RaCT.alternative_training
# (not shown here) — presumably batch size, anneal cap and number of epochs; confirm.
ract.alternative_training(train_data,500,0.2,50)

Critic  0  ,  tensor(-0.2132)
NLLL :  tensor(0.0163)




NDCG mean : tensor(0.2234)
Critic  1  ,  tensor(-0.2131)
NLLL :  tensor(0.0148)
NDCG mean : tensor(0.2218)
Critic  2  ,  tensor(-0.2089)
NLLL :  tensor(0.0132)
NDCG mean : tensor(0.1846)
Critic  3  ,  tensor(-0.2127)
NLLL :  tensor(0.0125)
NDCG mean : tensor(0.1795)
Critic  4  ,  tensor(-0.2142)
NLLL :  tensor(0.0146)
NDCG mean : tensor(0.1620)
Critic  5  ,  tensor(-0.2106)
NLLL :  tensor(0.0146)
NDCG mean : tensor(0.1670)
Critic  6  ,  tensor(-0.2045)
NLLL :  tensor(0.0133)
NDCG mean : tensor(0.1630)
Critic  7  ,  tensor(-0.2042)
NLLL :  tensor(0.0156)
NDCG mean : tensor(0.1604)
Critic  8  ,  tensor(-0.2022)
NLLL :  tensor(0.0164)
NDCG mean : tensor(0.1495)
Critic  9  ,  tensor(-0.1901)
NLLL :  tensor(0.0191)
NDCG mean : tensor(0.1477)
Critic  10  ,  tensor(-0.1881)
NLLL :  tensor(0.0251)
NDCG mean : tensor(0.1452)
Critic  11  ,  tensor(-0.1852)
NLLL :  tensor(0.0374)
NDCG mean : tensor(0.1433)
Critic  12  ,  tensor(-0.1741)
NLLL :  tensor(0.0458)
NDCG mean : tensor(0.1487)
Critic  13

## Result from Stable Case

In [24]:
# Fresh RaCT object with the same arguments as the earlier run, followed by both pre-training phases.
# NOTE(review): this cell repeats the import and setup from the earlier section, and its execution
# counter (24) is lower than that of the preceding cells, so the notebook was not run top to bottom.
from RaCT import RaCT

ract = RaCT(train_data.shape[1],600,200,600,3,0.01)
ract.pretrain_actor(train_data,500,0.2,150,50,vad_data_tr)
ract.pretrain_critic(train_data,500,50)

NLLL :  tensor(0.0172)
Elbo :  tensor(0.0172)


  ndcg = torch.tensor(ndcg.reshape(-1,1),dtype=torch.float32)


NDCG mean : tensor(0.0069)
NLLL :  tensor(0.0172)
Elbo :  tensor(0.0185)
NLLL :  tensor(0.0163)
Elbo :  tensor(0.2104)
NLLL :  tensor(0.0203)
Elbo :  tensor(0.3037)
NLLL :  tensor(0.0160)
Elbo :  tensor(3.1468)
NLLL :  tensor(0.0186)
Elbo :  tensor(0.1416)
NLLL :  tensor(0.0217)
Elbo :  tensor(1.0518)
NLLL :  tensor(0.0181)
Elbo :  tensor(0.2699)
NLLL :  tensor(0.0201)
Elbo :  tensor(1.7373)
NLLL :  tensor(0.0182)
Elbo :  tensor(0.6741)
NLLL :  tensor(0.0184)
Elbo :  tensor(0.5111)
NLLL :  tensor(0.0184)
Elbo :  tensor(0.9967)
NLLL :  tensor(0.0169)
Elbo :  tensor(0.4408)
NLLL :  tensor(0.0171)
Elbo :  tensor(0.5412)
NLLL :  tensor(0.0194)
Elbo :  tensor(0.9130)
NLLL :  tensor(0.0171)
Elbo :  tensor(0.3875)
NLLL :  tensor(0.0171)
Elbo :  tensor(0.5539)
NLLL :  tensor(0.0159)
Elbo :  tensor(0.5763)
NLLL :  tensor(0.0170)
Elbo :  tensor(0.5997)
NLLL :  tensor(0.0170)
Elbo :  tensor(0.4496)
NLLL :  tensor(0.0158)
Elbo :  tensor(0.5529)
NDCG mean : tensor(0.0945)
NLLL :  tensor(0.0165)
Elb

  ndcg = torch.tensor(ndcg.reshape(-1,1),dtype=torch.float32)


NDCG mean : tensor(0.2324)
MSE :  tensor(0.1292)
NDCG mean : tensor(0.2239)
MSE :  tensor(0.1328)
NDCG mean : tensor(0.2242)
MSE :  tensor(0.1281)
NDCG mean : tensor(0.2235)
MSE :  tensor(0.1233)
NDCG mean : tensor(0.2262)
MSE :  tensor(0.1213)
NDCG mean : tensor(0.2335)
MSE :  tensor(0.1154)
NDCG mean : tensor(0.2311)
MSE :  tensor(0.1108)
NDCG mean : tensor(0.2234)
MSE :  tensor(0.1114)
NDCG mean : tensor(0.2303)
MSE :  tensor(0.0974)
NDCG mean : tensor(0.2168)
MSE :  tensor(0.1041)
NDCG mean : tensor(0.2324)
MSE :  tensor(0.0950)
NDCG mean : tensor(0.2243)
MSE :  tensor(0.0996)
NDCG mean : tensor(0.2238)
MSE :  tensor(0.0902)
NDCG mean : tensor(0.2293)
MSE :  tensor(0.0846)
NDCG mean : tensor(0.2317)
MSE :  tensor(0.0756)
NDCG mean : tensor(0.2222)
MSE :  tensor(0.0738)
NDCG mean : tensor(0.2264)
MSE :  tensor(0.0701)
NDCG mean : tensor(0.2125)
MSE :  tensor(0.0685)
NDCG mean : tensor(0.2288)
MSE :  tensor(0.0606)
NDCG mean : tensor(0.2213)
MSE :  tensor(0.0606)
NDCG mean : tensor(0

In [454]:
# Joint actor/critic phase for the "stable case" object created in the previous cell.
# NOTE(review): the execution counter of this cell (454) shows it was run in a long interactive
# session; the output below may not be reproducible with Restart & Run All — confirm.
ract.alternative_training(train_data,500,0.2,50)



Epoch  0  ,  tensor(-0.3131)
NLLL :  tensor(0.0257)




NDCG mean : tensor(0.2548)
tensor(0.1433)
Epoch  1  ,  tensor(-0.3335)
NLLL :  tensor(0.0297)
NDCG mean : tensor(0.2487)
tensor(0.1393)
Epoch  2  ,  tensor(-0.3503)
NLLL :  tensor(0.0320)
NDCG mean : tensor(0.2388)
tensor(0.1535)
Epoch  3  ,  tensor(-0.3211)
NLLL :  tensor(0.0287)
NDCG mean : tensor(0.2373)
tensor(0.1447)
Epoch  4  ,  tensor(-0.3057)
NLLL :  tensor(0.0308)
NDCG mean : tensor(0.2329)
tensor(0.1425)
Epoch  5  ,  tensor(-0.2846)
NLLL :  tensor(0.0302)
NDCG mean : tensor(0.2169)
tensor(0.1283)
Epoch  6  ,  tensor(-0.2937)
NLLL :  tensor(0.0279)
NDCG mean : tensor(0.2103)
tensor(0.1268)
Epoch  7  ,  tensor(-0.2965)
NLLL :  tensor(0.0298)
NDCG mean : tensor(0.2031)
tensor(0.1399)
Epoch  8  ,  tensor(-0.2809)
NLLL :  tensor(0.0306)
NDCG mean : tensor(0.1986)
tensor(0.1233)
Epoch  9  ,  tensor(-0.2773)
NLLL :  tensor(0.0283)
NDCG mean : tensor(0.2056)
tensor(0.1222)
Epoch  10  ,  tensor(-0.2553)
NLLL :  tensor(0.0295)
NDCG mean : tensor(0.1926)
tensor(0.1195)
Epoch  11  ,  ten