## 20Mn data MovieLens Experiment

The original embedding+collaborative filtering + nn

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam

In [2]:
from ray.lprint import lprint
l = lprint("experiment with CR on movielens 20m data to cross check the result against new technique")

[task:experiment with CR on movielens 20m data to cross check the result against new technique>>start]<2018-10-01_01:01:59|0s,0s>	


In [3]:
CUDA = torch.cuda.is_available()
SEQ_LEN = 19
DIM = 100
l.p("has GPU cuda",CUDA)

[has GPU cuda]<2018-10-01_01:01:00|0s,0s>	True


In [4]:
# %ls /data/ml-20m

In [5]:
DATA = "/data/ml-20m/ratings.csv"

In [6]:
l.p("loading csv file", DATA)
rate_df = pd.read_csv(DATA)
l.p("csv file loaded")

[loading csv file]<2018-10-01_01:01:02|1s,2s>	/data/ml-20m/ratings.csv
[csv file loaded]<2018-10-01_01:01:09|7s,9s>	


In [7]:
len(rate_df)

20000263

In [8]:
userId = list(set(rate_df["userId"]))
movieId = list(set(rate_df["movieId"]))
print("total number of users and movies:\t",len(userId),"\t",len(movieId))

total number of users and movies:	 138493 	 26744


In [9]:
l.p("making dictionary")
u2i = dict((v,k) for k,v in enumerate(userId))
m2i = dict((v,k) for k,v in enumerate(movieId))
i2u = dict((k,v) for k,v in enumerate(userId))
i2m = dict((k,v) for k,v in enumerate(movieId))

[making dictionary]<2018-10-01_01:01:12|2s,12s>	


In [10]:
rate_df["movieIdx"] = rate_df.movieId.apply(lambda x:m2i[x]).astype(int)
rate_df["userIdx"] = rate_df.userId.apply(lambda x:u2i[x]).astype(int)
rate_df["rating"] = rate_df["rating"]/5

In [11]:
l.p("making train/test split")
user_count = len(userId)
valid_split = np.random.rand(user_count)>.8
train_idx = np.array(range(user_count))[~valid_split]
valid_idx = np.array(range(user_count))[valid_split]

train_df = rate_df[rate_df.userId.isin(train_idx)]
valid_df = rate_df[rate_df.userId.isin(valid_idx)]

[making train/test split]<2018-10-01_01:01:28|16s,28s>	


In [12]:
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader

In [13]:
pick_k = np.array([0]*SEQ_LEN +[1])==1

In [14]:
from ray.matchbox import Arr_Dataset

In [15]:
train_ds = Arr_Dataset(
    train_df.movieIdx.values,
    train_df.userIdx.values,
    train_df.rating.values,
    bs = 4096,
                      )
valid_ds = Arr_Dataset(
    valid_df.movieIdx.values,
    valid_df.userIdx.values,
    valid_df.rating.values,
    bs = 4096,
                      )

#### Testing Generator

In [16]:
dl = DataLoader(train_ds,batch_size=1, shuffle = True)
print(len(dl))
gen = iter(dl)
next(gen)[0].size()

3915


torch.Size([1, 4096])

### Model

In [17]:
class cf_nn(nn.Module):
    def __init__(self, hidden_size,user_size,item_size):
        """
        Collaborative Filtering with Neural Network
        """
        super(cf_nn,self).__init__()
        self.hidden_size = hidden_size
        self.user_size = user_size
        self.item_size = item_size
        self.emb_u = nn.Embedding(self.user_size,self.hidden_size,)
        self.emb_v = nn.Embedding(self.item_size,self.hidden_size,)
        
        self.mlp = nn.Sequential(*[
            nn.Dropout(.3),
            nn.Linear(hidden_size*2, 256, bias=False),
            nn.BatchNorm1d(256),
            nn.Linear(256,1,bias=False),
            nn.Sigmoid(),
        ])
    
    def forward(self,u_idx,v_idx):
        cated = torch.cat([self.emb_u(u_idx.long().squeeze(0)),
                             self.emb_v(v_idx.long().squeeze(0))], dim=1)
        return self.mlp(cated)

In [18]:
l.p("creating model")
cf_model = cf_nn(hidden_size = DIM, user_size = len(userId),
               item_size = len(movieId))
if CUDA:
    l.p("loading model to GPU")
    torch.cuda.empty_cache()
    cf_model.cuda()
    
from ray.matchbox import Trainer
opt = Adam(cf_model.parameters())
loss_func = nn.MSELoss()
trainer = Trainer(train_ds, val_dataset=valid_ds, batch_size=1, print_on=5)

[creating model]<2018-10-01_01:01:32|3s,32s>	
[loading model to GPU]<2018-10-01_01:01:33|0s,33s>	


In [19]:
train_len = len(trainer.train_data)
valid_len = len(trainer.val_data)
l.p("train_len",train_len)
l.p("valid_len",valid_len)
def action(*args,**kwargs):
    # get data from data feeder
    v_idx,u_idx,y = args[0]
    if CUDA:
        v_idx,u_idx,y = v_idx.cuda(),u_idx.cuda(),y.cuda().permute(1,0)
    y = y.float()
    
    # Clear the Jacobian Matrix
    opt.zero_grad()
    
    # Predict y hat
    y_ = cf_model(u_idx, v_idx)
    # Calculate Loss
    loss = loss_func(y_,y)
    
    # Backward Propagation
    loss.backward()
    opt.step()
    # Mean Absolute Loss as print out metrics
    mae = torch.mean(torch.abs(y_-y))
    return {"loss":loss.item(),"mae":mae.item()}

def val_action(*args,**kwargs):
    """
    A validation step
    Exactly the same like train step, but no learning, only forward pass
    """
    v_idx,u_idx,y = args[0]
    if CUDA:
        v_idx,u_idx,y = v_idx.cuda(),u_idx.cuda(),y.cuda().permute(1,0)
    y = y.float()
    
    y_ = cf_model(u_idx, v_idx)
    
    loss = loss_func(y_,y)
    mae = torch.mean(torch.abs(y_-y))
    return {"loss":loss.item(),"mae":mae.item()}
trainer.action  = action
trainer.val_action  = val_action

[train_len]<2018-10-01_01:01:37|3s,37s>	3915
[valid_len]<2018-10-01_01:01:37|0s,37s>	969


In [20]:
l.p("start training")
trainer.train(2)
l.p("training finished")

⭐[ep_0_i_9]	loss	0.087✨	mae	0.254:   0%|          | 6/3915 [00:00<02:01, 32.09it/s]

[start training]<2018-10-01_01:01:37|0s,37s>	


⭐[ep_0_i_3914]	loss	0.034✨	mae	0.144: 100%|██████████| 3915/3915 [01:11<00:00, 54.63it/s]
😎[val_ep_0_i_968]	loss	0.036😂	mae	0.149: 100%|██████████| 969/969 [00:05<00:00, 162.22it/s]
⭐[ep_1_i_3914]	loss	0.042✨	mae	0.162: 100%|██████████| 3915/3915 [01:11<00:00, 54.73it/s]
😎[val_ep_1_i_968]	loss	0.036😂	mae	0.148: 100%|██████████| 969/969 [00:05<00:00, 164.80it/s]


[training finished]<2018-10-01_01:01:12|155s,192s>	


In [24]:
torch.save(cf_model.state_dict(),"/data/cf_0.0.1.npy")

In [23]:
trainer.train(1)

⭐[ep_0_i_3914]	loss	0.036✨	mae	0.146: 100%|██████████| 3915/3915 [01:11<00:00, 54.41it/s]
😎[val_ep_0_i_968]	loss	0.036😂	mae	0.148: 100%|██████████| 969/969 [00:07<00:00, 137.42it/s]
