## MovieLens 20M Experiment

Baseline: the original approach, i.e. embedding-based collaborative filtering followed by a neural network (embedding + CF + NN).

In [1]:
import os

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam

In [2]:
from ray.lprint import lprint
# Task-scoped logger; the rest of the notebook calls `l.p(label, value)` to emit
# labelled progress lines (the captured outputs show each line carrying a timestamp).
l = lprint("experiment with CR on movielens 20m data to cross check the result against new technique")

[task:experiment with CR on movielens 20m data to cross check the result against new technique>>start]<2018-10-23_15:03:59|0s,0s>	


In [3]:
# True when torch can see a CUDA device; the step functions and the training
# loop branch on this flag to move tensors / the model onto the GPU.
CUDA = torch.cuda.is_available()
SEQ_LEN = 19  # NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed
DIM = 100  # passed to cf_nn as hidden_size, i.e. the width of each embedding vector
l.p("has GPU cuda",CUDA)

[has GPU cuda]<2018-10-23_15:03:00|0s,0s>	True


In [5]:
# Location of the MovieLens 20M ratings CSV. The default is the original
# hard-coded path; set the ML20M_RATINGS_CSV environment variable to run the
# notebook on another machine without editing this cell.
DATA = os.environ.get("ML20M_RATINGS_CSV", "/data/ml-20m/ratings.csv")

In [6]:
l.p("loading csv file", DATA)
# Read the whole ratings file into a single in-memory DataFrame; the columns
# used further down are userId, movieId and rating.
rate_df = pd.read_csv(DATA)
l.p("csv file loaded")

[loading csv file]<2018-10-23_15:03:01|1s,1s>	/data/ml-20m/ratings.csv
[csv file loaded]<2018-10-23_15:03:09|8s,9s>	


In [7]:
# Sanity check: number of rating rows loaded from the CSV.
len(rate_df)

20000263

In [8]:
# Unique raw ids in ascending order, as plain Python ints.
# These lists define the raw-id -> embedding-row mapping built in the next cell,
# so their order must be stable: sorting the unique values (instead of iterating
# a set, whose order is an implementation detail) guarantees the same mapping on
# every run, which keeps saved embedding rows attributable to the same id.
userId = np.sort(rate_df["userId"].unique()).tolist()
movieId = np.sort(rate_df["movieId"].unique()).tolist()
print("total number of users and movies:\t",len(userId),"\t",len(movieId))

total number of users and movies:	 138493 	 26744


In [9]:
l.p("making dictionary")
# Forward lookups: raw id -> contiguous index (the position in userId / movieId).
u2i = {raw_id: position for position, raw_id in enumerate(userId)}
m2i = {raw_id: position for position, raw_id in enumerate(movieId)}
# Reverse lookups: contiguous index -> raw id.
i2u = dict(enumerate(userId))
i2m = dict(enumerate(movieId))

[making dictionary]<2018-10-23_15:03:12|2s,12s>	


In [10]:
# Translate raw ids into the contiguous indices used to address the embedding
# tables. Series.map with a dict performs the lookup without invoking a Python
# lambda once per row, which matters on ~20M rows. Every id is present in the
# dictionaries by construction, so the result has no missing values to cast.
rate_df["movieIdx"] = rate_df["movieId"].map(m2i).astype(int)
rate_df["userIdx"] = rate_df["userId"].map(u2i).astype(int)

# Scale ratings by 1/5 so they are comparable with the model's sigmoid output.
# The guard makes the cell safe to re-run: without it a second execution would
# divide the already-scaled column by 5 again.
# Assumes raw ratings exceed 1 somewhere (MovieLens stars go up to 5) and scaled
# ones never do — TODO confirm if another dataset is plugged in.
if rate_df["rating"].max() > 1:
    rate_df["rating"] = rate_df["rating"]/5

In [11]:
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
from ray.matchbox import Trainer,Arr_Dataset

#### Testing Generator

In [12]:
# dl = DataLoader(train_ds,batch_size=1, shuffle = True)
# print(len(dl))
# gen = iter(dl)
# next(gen)[0].size()

### Model structure

In [13]:
class cf_nn(nn.Module):
    """
    Collaborative Filtering with Neural Network

    Looks up one embedding per user index and one per item index, concatenates
    the two vectors and passes them through a small MLP that ends in a sigmoid.
    """
    def __init__(self, hidden_size,user_size,item_size):
        """
        hidden_size: width of each embedding vector
        user_size:   number of rows in the user embedding table
        item_size:   number of rows in the item embedding table
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.user_size = user_size
        self.item_size = item_size

        # One embedding table per side of the interaction
        self.emb_u = nn.Embedding(user_size, hidden_size)
        self.emb_v = nn.Embedding(item_size, hidden_size)

        # Head over the concatenated [user ; item] vector; the layer order is
        # significant because nn.Sequential names parameters by position.
        self.mlp = nn.Sequential(
            nn.Dropout(.3),
            nn.Linear(2 * hidden_size, 256, bias=False),
            nn.BatchNorm1d(256),
            nn.Linear(256, 1, bias=False),
            nn.Sigmoid(),
        )

    def forward(self,u_idx,v_idx):
        """
        Score (user, item) pairs.

        Both index tensors are cast to long and have a leading axis of size 1
        squeezed away before the embedding lookup; the result of the MLP is
        returned as is.
        """
        user_vec = self.emb_u(u_idx.long().squeeze(0))
        item_vec = self.emb_v(v_idx.long().squeeze(0))
        joined = torch.cat((user_vec, item_vec), dim=1)
        return self.mlp(joined)

### Step Function

In [14]:

def action(*args,**kwargs):
    """
    One training step: forward pass, loss, backward pass, optimizer update.

    args[0] is one batch from the data feeder, unpacked as (v_idx, u_idx, y):
    item indices, user indices and targets. The model, optimizer, loss function
    and device flag are read from the module-level names cf_model, opt,
    loss_func and CUDA.

    Returns a dict with the batch "loss" and "mae" as Python floats.
    """
    # get data from data feeder
    v_idx,u_idx,y = args[0]
    if CUDA:
        v_idx,u_idx,y = v_idx.cuda(),u_idx.cuda(),y.cuda()
    # Swap the two axes of y so it lines up with the model output (presumably
    # (1, BS) -> (BS, 1) — TODO confirm against Arr_Dataset). This must happen on
    # every device: when it was done only on the CUDA path, a CPU run compared
    # misaligned tensors and the loss silently broadcast them against each other.
    y = y.permute(1,0).float()

    # Dropout / batch-norm must be in training mode for a training step
    cf_model.train()

    # Reset the gradients accumulated by the previous step
    opt.zero_grad()

    # Predict y hat
    y_ = cf_model(u_idx, v_idx)
    # Calculate Loss
    loss = loss_func(y_,y)

    # Backward Propagation
    loss.backward()
    opt.step()
    # Mean absolute error, reported as a metric only (kept out of the autograd graph)
    mae = torch.mean(torch.abs(y_.detach()-y))
    return {"loss":loss.item(),"mae":mae.item()}

def val_action(*args,**kwargs):
    """
    A validation step: forward pass and metrics only, no learning.

    args[0] is one batch unpacked as (v_idx, u_idx, y): item indices, user
    indices and targets. Reads the module-level cf_model, loss_func and CUDA.

    The model is switched to eval mode for the duration of the call, so dropout
    is disabled and batch-norm uses (and does not update) its running
    statistics, and autograd is turned off. The previous train/eval mode is
    restored before returning, so the training step that follows is unaffected.

    Returns a dict with the batch "loss" and "mae" as Python floats.
    """
    v_idx,u_idx,y = args[0]
    if CUDA:
        v_idx,u_idx,y = v_idx.cuda(),u_idx.cuda(),y.cuda()
    # Swap the two axes of y so it lines up with the model output (presumably
    # (1, BS) -> (BS, 1) — TODO confirm against Arr_Dataset). Done on every
    # device; doing it only on the CUDA path left CPU runs comparing misaligned
    # tensors.
    y = y.permute(1,0).float()

    was_training = cf_model.training
    cf_model.eval()
    try:
        with torch.no_grad():
            y_ = cf_model(u_idx, v_idx)

            loss = loss_func(y_,y)
            mae = torch.mean(torch.abs(y_-y))
    finally:
        # Hand the model back in whatever mode the caller had it in
        cf_model.train(was_training)
    return {"loss":loss.item(),"mae":mae.item()}


### K-fold validation training

In [16]:
K = 2 # two fold validation
BS = 4096
SPLIT_SEED = 0 # fixed seed so the user -> fold assignment is the same on every run
user_count = len(userId)
# One uniform draw in [0, 1) per user index; the bucket a draw falls into decides
# which fold uses that user for validation.
random = np.random.RandomState(SPLIT_SEED).rand(user_count)
all_user_idx = np.arange(user_count)

for fold in range(K):
    l.p("Training fold number",fold)
    l.p("making train/test split")
    # Half-open bucket [fold/K, (fold+1)/K). Because the draws lie in [0, 1),
    # the K buckets tile the whole range and every user lands in exactly one
    # fold's validation set (the former "<" / "<=" pair left out a draw of 0.0).
    valid_split = ((fold/K) <= random) & (random < ((fold+1)/K))
    train_idx = all_user_idx[~valid_split]
    valid_idx = all_user_idx[valid_split]

    # train_idx / valid_idx are positions 0..user_count-1, i.e. the same id space
    # as the userIdx column. Filtering on the raw userId column instead would
    # compare two different id spaces and silently misassign or drop users.
    # NOTE(review): the split is by user, so users in valid_df contribute no
    # training rows in this fold — confirm that this is the intended protocol.
    train_df = rate_df[rate_df.userIdx.isin(train_idx)]
    valid_df = rate_df[rate_df.userIdx.isin(valid_idx)]
    
    l.p("generating pytorch dataset")
    # Column order (item index, user index, rating) matches the order in which
    # action / val_action unpack a batch.
    train_ds = Arr_Dataset(
        train_df.movieIdx.values,
        train_df.userIdx.values,
        train_df.rating.values,
        bs = BS,
                      )
    valid_ds = Arr_Dataset(
        valid_df.movieIdx.values,
        valid_df.userIdx.values,
        valid_df.rating.values,
        bs = BS,
                      )
    
    # Model
    # A fresh model, optimizer and loss per fold; the step functions pick these
    # up through the module-level names cf_model, opt and loss_func.
    l.p("creating model")
    cf_model = cf_nn(hidden_size = DIM, user_size = len(userId),
               item_size = len(movieId))
    if CUDA:
        l.p("loading model to GPU")
        torch.cuda.empty_cache()
        cf_model.cuda()
    opt = Adam(cf_model.parameters())
    loss_func = nn.MSELoss()
    
    # batch_size=1 here because the datasets are already built with bs = BS
    trainer = Trainer(train_ds, val_dataset=valid_ds, batch_size=1, print_on=5)
    
    train_len = len(trainer.train_data)
    valid_len = len(trainer.val_data)
    
    l.p("train_len",train_len)
    l.p("valid_len",valid_len)
    trainer.action  = action # assigning step function
    trainer.val_action  = val_action # assigning validate step function
    l.p("start training")
    trainer.train(3) # Train for 3 epochs
    l.p("training finished")
    # One checkpoint (state_dict) per fold
    torch.save(cf_model.state_dict(),"/data/cf_triditional_fold%s_0.0.1.npy"%(fold))

[Training fold number]<2018-10-23_15:03:10|30s,70s>	0
[making train/test split]<2018-10-23_15:03:10|0s,70s>	
[generating pytorch dataset]<2018-10-23_15:03:14|4s,74s>	
[creating model]<2018-10-23_15:03:14|0s,74s>	
[loading model to GPU]<2018-10-23_15:03:15|0s,75s>	


⭐[ep_0_i_9]	loss	0.079✨	mae	0.239:   0%|          | 6/2446 [00:00<00:42, 56.97it/s]

[train_len]<2018-10-23_15:03:17|2s,77s>	2446
[valid_len]<2018-10-23_15:03:17|0s,77s>	2438
[start training]<2018-10-23_15:03:17|0s,77s>	


⭐[ep_0_i_2444]	loss	0.031✨	mae	0.136: 100%|██████████| 2446/2446 [00:38<00:00, 64.25it/s]
😎[val_ep_0_i_2437]	loss	0.037😂	mae	0.150: 100%|██████████| 2438/2438 [00:34<00:00, 70.97it/s]
⭐[ep_1_i_2444]	loss	0.032✨	mae	0.140: 100%|██████████| 2446/2446 [00:38<00:00, 64.31it/s]
😎[val_ep_1_i_2437]	loss	0.036😂	mae	0.149: 100%|██████████| 2438/2438 [00:34<00:00, 70.91it/s]
⭐[ep_2_i_2444]	loss	0.038✨	mae	0.155: 100%|██████████| 2446/2446 [00:38<00:00, 64.30it/s]
😎[val_ep_2_i_2437]	loss	0.037😂	mae	0.150: 100%|██████████| 2438/2438 [00:34<00:00, 70.83it/s]


[training finished]<2018-10-23_15:03:55|217s,295s>	
[Training fold number]<2018-10-23_15:03:55|0s,295s>	1
[making train/test split]<2018-10-23_15:03:55|0s,295s>	
[generating pytorch dataset]<2018-10-23_15:03:59|3s,299s>	
[creating model]<2018-10-23_15:03:59|0s,299s>	
[loading model to GPU]<2018-10-23_15:03:59|0s,299s>	


⭐[ep_0_i_9]	loss	0.089✨	mae	0.255:   0%|          | 7/2438 [00:00<00:38, 62.71it/s]

[train_len]<2018-10-23_15:03:59|0s,299s>	2438
[valid_len]<2018-10-23_15:03:59|0s,299s>	2446
[start training]<2018-10-23_15:03:59|0s,299s>	


⭐[ep_0_i_2434]	loss	0.034✨	mae	0.145: 100%|██████████| 2438/2438 [00:37<00:00, 64.47it/s]
😎[val_ep_0_i_2445]	loss	0.037😂	mae	0.149: 100%|██████████| 2446/2446 [00:34<00:00, 70.87it/s]
⭐[ep_1_i_2434]	loss	0.029✨	mae	0.133: 100%|██████████| 2438/2438 [00:37<00:00, 64.55it/s]
😎[val_ep_1_i_2445]	loss	0.036😂	mae	0.148: 100%|██████████| 2446/2446 [00:34<00:00, 70.31it/s]
⭐[ep_2_i_2434]	loss	0.032✨	mae	0.138: 100%|██████████| 2438/2438 [00:37<00:00, 64.47it/s]
😎[val_ep_2_i_2445]	loss	0.036😂	mae	0.148: 100%|██████████| 2446/2446 [00:35<00:00, 35.49it/s]


[training finished]<2018-10-23_15:03:37|218s,517s>	
