## 20Mn data MovieLens Experiment

An experiment on 20mn MovieLens Dataset.

The core idea is to use rnn to process sequence log data, to replace trained user embedding on crossfiltering nn model.

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam

In [2]:
from ray.lprint import lprint
l = lprint("experiment with RNN+CF on movielens 20m data")

[task:experiment with RNN+CF on movielens 20m data>>start]<2018-10-22_10:10:33|0s,0s>	


In [3]:
CUDA = torch.cuda.is_available()
SEQ_LEN = 19 # sequence length
DIM = 100 # hidden vector lenth, embedding length
l.p("has GPU cuda",CUDA)

[has GPU cuda]<2018-10-22_10:10:37|3s,3s>	True


In [4]:
# %ls /data/ml-20m

In [5]:
DATA = "/data/ml-20m/ratings.csv"

In [6]:
l.p("loading csv file", DATA)
rate_df = pd.read_csv(DATA)
l.p("csv file loaded")

[loading csv file]<2018-10-22_10:10:41|4s,8s>	/data/ml-20m/ratings.csv
[csv file loaded]<2018-10-22_10:10:49|7s,16s>	


In [7]:
len(rate_df)

20000263

In [8]:
rate_df.groupby("userId").count()["movieId"].min()
# The minimum number of movies a user have rated

20

In [9]:
userId = list(set(rate_df["userId"]))
movieId = list(set(rate_df["movieId"]))
print("total number of users and movies:\t",len(userId),"\t",len(movieId))

total number of users and movies:	 138493 	 26744


In [10]:
l.p("making dictionary")
u2i = dict((v,k) for k,v in enumerate(userId))
m2i = dict((v,k) for k,v in enumerate(movieId))
i2u = dict((k,v) for k,v in enumerate(userId))
i2m = dict((k,v) for k,v in enumerate(movieId))

[making dictionary]<2018-10-22_10:10:53|3s,19s>	


In [11]:
# Translating original index to the new index
rate_df["movieIdx"] = rate_df.movieId.apply(lambda x:m2i[x]).astype(int)
rate_df["userIdx"] = rate_df.userId.apply(lambda x:u2i[x]).astype(int)
rate_df["rating"] = rate_df["rating"]/5

### Train /Valid Split: K fold Validation 

In [12]:
l.p("generating groubby slice")
def get_user_trail(rate_df):
    return rate_df.sort_values(by=["userId","timestamp"]).groupby("userId")
    #gb.apply(lambda x:x.sample(n = 20, replace = False))
gb = get_user_trail(rate_df)

[generating groubby slice]<2018-10-22_10:10:04|791s,811s>	


In [13]:
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import math

In [14]:
KEEP_CONSEQ = True

if KEEP_CONSEQ:
    # keep the consequtivity among the items the user has rated
    def sample_split(x):
        sample_idx = math.floor(np.random.rand()*(len(x) - SEQ_LEN - 1))
        seq_and_y = x[sample_idx:sample_idx + SEQ_LEN+1]
        return seq_and_y
else:
    # randomly pick the right amount of sample from user's record
    pick_k = np.array([0]*SEQ_LEN +[1])==1

    def sample_split(x):
        sampled = x.sample(n = 20, replace = False)
        seq = sampled.head(19).sort_values(by="timestamp")
        y = sampled[pick_k]
        return pd.concat([seq,y])

class rnn_record(Dataset):
    def __init__(self, gb):
        """
        A pytorch dataset object designed to group logs into user behavior sequence
        """
        self.gb = gb
        self.make_seq()
    
    def make_seq(self):
        """
        Resample the data
        """
        self.all_seq = self.gb.apply(sample_split)
        
    def __len__(self):
        return len(self.gb)
        
    def __getitem__(self,idx):
        """
        next(generator) will spit out the 'return' of this function
        this is a single row in the batch
        """
        df = self.all_seq.loc[idx]
        seq = df.head(SEQ_LEN)[["movieIdx","rating"]].values
        targ = df.head(SEQ_LEN+1).tail(1)[["movieIdx","rating"]].values
        targ_v, targ_y =targ[:,0], targ[:,1]
        return idx,seq,targ_v,targ_y

In [15]:
# Testing data generator

# data_gb = get_user_trail(rate_df)
# rr = rnn_record(data_gb)
# rr.all_seq

# dl = DataLoader(rr,shuffle=True,batch_size=1)
# gen = iter(dl)
# next(gen)

In [16]:
### Model

class mLinkNet(nn.Module):
    def __init__(self, hidden_size,v_size):
        """
        mLinkNet, short for missing link net
        """
        super(mLinkNet,self).__init__()
        self.hidden_size = hidden_size
        self.v_size = v_size
        self.emb = nn.Embedding(v_size,hidden_size)
        
        self.rnn = nn.GRU(input_size = self.hidden_size+1,
                          hidden_size= hidden_size+1,
                          num_layers=1,
                          batch_first = True,
                          dropout=0)
        
        self.mlp = nn.Sequential(*[
            nn.Dropout(.3),
            nn.Linear(hidden_size*2 + 1, 256, bias=False),
            nn.BatchNorm1d(256),
            nn.Linear(256,1,bias=False),
            nn.Sigmoid(),
        ])
    
    def forward(self,seq,targ_v):
        seq_vec = torch.cat([self.emb(seq[:,0].long()),
                             seq[:,1].unsqueeze(-1).float()], dim=2)
        output, hn = self.rnn(seq_vec)
        x = torch.cat([hn.squeeze(0),self.emb(targ_v.long()).squeeze(1)],dim=1)
        return self.mlp(x)

In [17]:

def action(*args,**kwargs):
    # get data from data feeder
    idx,seq,targ_v,y = args[0]
    if CUDA:
        seq,targ_v,y = seq.cuda(),targ_v.cuda(),y.cuda()
    y = y.float()
    
    # Clear the Jacobian Matrix
    opt.zero_grad()
    
    # Predict y hat
    y_ = mln(seq, targ_v)
    # Calculate Loss
    loss = loss_func(y_,y)
    
    # Backward Propagation
    loss.backward()
    opt.step()
    # Mean Absolute Loss as print out metrics
    mae = torch.mean(torch.abs(y_-y))
    if kwargs["ite"] == train_len - 1: # resample the sequence
        trainer.train_data.dataset.make_seq()
    return {"loss":loss.item(),"mae":mae.item()}

def val_action(*args,**kwargs):
    """
    A validation step
    Exactly the same like train step, but no learning, only forward pass
    """
    idx,seq,targ_v,y = args[0]
    if CUDA:
        seq,targ_v,y = seq.cuda(),targ_v.cuda(),y.cuda()
    y = y.float()
    
    y_ = mln(seq, targ_v)
    
    loss = loss_func(y_,y)
    mae = torch.mean(torch.abs(y_-y))
    if kwargs["ite"] == valid_len - 1:
        trainer.val_data.dataset.make_seq()
    return {"loss":loss.item(),"mae":mae.item()}


In [18]:
l.p("making train/test split")
user_count = len(userId)
K = 2
valid_split = dict({})
random = np.random.rand(user_count)
from ray.matchbox import Trainer

l.p("start training")
for fold in range(K):
    valid_split = ((fold/K) < random)*(random <= ((fold+1)/K))
    train_idx = np.array(range(user_count))[~valid_split]
    valid_idx = np.array(range(user_count))[valid_split]

    train_df = rate_df[rate_df.userId.isin(train_idx)]
    valid_df = rate_df[rate_df.userId.isin(valid_idx)]
    
    # Since user id mapping doesn't matter any more.
    # It's easier to make a dataset with contineous user_id.
    train_u2i = dict((v,k) for k,v in enumerate(set(train_df.userId)))
    valid_u2i = dict((v,k) for k,v in enumerate(set(valid_df.userId)))
    train_df["userId"] = train_df.userId.apply(lambda x:train_u2i[x])
    valid_df["userId"] = valid_df.userId.apply(lambda x:valid_u2i[x])
    
    train_gb = get_user_trail(train_df)
    valid_gb = get_user_trail(valid_df)
    # ds = rnn_record(gb)
    l.p("generating dataset","train")
    train_ds = rnn_record(train_gb)
    l.p("generating dataset","valid")
    valid_ds = rnn_record(valid_gb)
    l.p("dataset generated")

    l.p("creating model")
    mln = mLinkNet(hidden_size = DIM, 
               v_size = len(movieId))
    if CUDA:
        l.p("loading model to GPU")
        torch.cuda.empty_cache()
        mln.cuda()

    opt = Adam(mln.parameters())
    loss_func = nn.MSELoss()
    trainer = Trainer(train_ds, val_dataset=valid_ds, batch_size=16, print_on=3)
    train_len = len(trainer.train_data)
    valid_len = len(trainer.val_data)
    l.p("train_len",train_len)
    l.p("valid_len",valid_len)
    trainer.action  = action
    trainer.val_action  = val_action
    
    l.p("fold training start", fold)
    trainer.train(12,name="rnn_cf_fold%s"%(fold))
    l.p("fold training finished",fold)
l.p("training finished")

[making train/test split]<2018-10-22_10:10:29|24s,835s>	
[start training]<2018-10-22_10:10:29|0s,835s>	


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[generating dataset]<2018-10-22_10:10:04|35s,870s>	train
[generating dataset]<2018-10-22_10:10:38|33s,904s>	valid
[dataset generated]<2018-10-22_10:10:11|32s,937s>	
[creating model]<2018-10-22_10:10:11|0s,937s>	
[loading model to GPU]<2018-10-22_10:10:11|0s,937s>	


  0%|          | 1/4327 [00:00<11:21,  6.34it/s]

[train_len]<2018-10-22_10:10:19|8s,945s>	4327
[valid_len]<2018-10-22_10:10:19|0s,945s>	4329
[fold training start]<2018-10-22_10:10:19|0s,945s>	0


⭐[ep_0_i_4325]	loss	0.034✨	mae	0.146: 100%|██████████| 4327/4327 [03:48<00:00,  4.82s/it]
😎[val_ep_0_i_4328]	loss	0.042😂	mae	0.160: 100%|██████████| 4329/4329 [05:39<00:00,  9.73s/it]
⭐[ep_1_i_4325]	loss	0.037✨	mae	0.152: 100%|██████████| 4327/4327 [03:49<00:00,  3.34s/it]
😎[val_ep_1_i_4328]	loss	0.041😂	mae	0.157: 100%|██████████| 4329/4329 [05:38<00:00,  9.85s/it]
⭐[ep_2_i_4325]	loss	0.038✨	mae	0.163: 100%|██████████| 4327/4327 [03:49<00:00,  3.34s/it]
😎[val_ep_2_i_4328]	loss	0.040😂	mae	0.155: 100%|██████████| 4329/4329 [05:30<00:00,  9.53s/it]
⭐[ep_3_i_4325]	loss	0.059✨	mae	0.172: 100%|██████████| 4327/4327 [03:50<00:00,  3.37s/it]
😎[val_ep_3_i_4328]	loss	0.039😂	mae	0.152: 100%|██████████| 4329/4329 [05:41<00:00, 10.19s/it]
⭐[ep_4_i_4325]	loss	0.046✨	mae	0.164: 100%|██████████| 4327/4327 [03:50<00:00,  3.33s/it]
😎[val_ep_4_i_4328]	loss	0.039😂	mae	0.153: 100%|██████████| 4329/4329 [05:40<00:00,  9.89s/it]
⭐[ep_5_i_4325]	loss	0.050✨	mae	0.168: 100%|██████████| 4327/4327 [03:49<00:00,  

[fold training finished]<2018-10-22_12:12:44|6805s,7750s>	0
[generating dataset]<2018-10-22_12:12:19|35s,7786s>	train
[generating dataset]<2018-10-22_12:12:53|33s,7819s>	valid
[dataset generated]<2018-10-22_12:12:27|34s,7853s>	
[creating model]<2018-10-22_12:12:27|0s,7853s>	
[loading model to GPU]<2018-10-22_12:12:27|0s,7854s>	


⭐[ep_0_i_2]	loss	0.109✨	mae	0.290:   0%|          | 2/4329 [00:00<04:45, 15.14it/s]

[train_len]<2018-10-22_12:12:31|4s,7858s>	4329
[valid_len]<2018-10-22_12:12:31|0s,7858s>	4327
[fold training start]<2018-10-22_12:12:31|0s,7858s>	1


⭐[ep_0_i_4328]	loss	0.036✨	mae	0.159: 100%|██████████| 4329/4329 [03:51<00:00,  5.05s/it]
😎[val_ep_0_i_4326]	loss	0.043😂	mae	0.166: 100%|██████████| 4327/4327 [05:28<00:00,  9.65s/it]
⭐[ep_1_i_4328]	loss	0.044✨	mae	0.172: 100%|██████████| 4329/4329 [03:49<00:00,  4.87s/it]
😎[val_ep_1_i_4326]	loss	0.041😂	mae	0.156: 100%|██████████| 4327/4327 [05:28<00:00,  9.78s/it]
⭐[ep_2_i_4328]	loss	0.031✨	mae	0.148: 100%|██████████| 4329/4329 [03:50<00:00,  5.00s/it]
😎[val_ep_2_i_4326]	loss	0.039😂	mae	0.154: 100%|██████████| 4327/4327 [05:34<00:00, 10.08s/it]
⭐[ep_3_i_4328]	loss	0.050✨	mae	0.170: 100%|██████████| 4329/4329 [03:49<00:00,  3.24s/it]
😎[val_ep_3_i_4326]	loss	0.039😂	mae	0.154: 100%|██████████| 4327/4327 [05:38<00:00,  9.91s/it]
⭐[ep_4_i_4328]	loss	0.044✨	mae	0.169: 100%|██████████| 4329/4329 [03:51<00:00,  5.04s/it]
😎[val_ep_4_i_4326]	loss	0.039😂	mae	0.154: 100%|██████████| 4327/4327 [05:42<00:00,  9.76s/it]
⭐[ep_5_i_4328]	loss	0.042✨	mae	0.159: 100%|██████████| 4329/4329 [03:51<00:00,  

[fold training finished]<2018-10-22_14:02:13|6761s,14620s>	1
[training finished]<2018-10-22_14:02:13|0s,14620s>	


In [19]:
torch.save(mln.state_dict(),"/data/rnn_cf_0.0.1.npy")