# Replacing User Latent Vector With Behavioural Sequence RNN Output For CF Recommendation

##### For the situations that we * don't have fresh user embedding* 

#### Good side of user embedding
The latent space empowers the model to featurize the characters of user, entirely by learning.
#### Bad side of user embedding
Major problems of user embedding is always around the new user:

* We train a model with [latent cf + neural network](3.1.3_recommender_system.ipynb)
* Usually we won't retrain a model within the same day.
* If the new user some with his/her preference record, we have to train it in a way within the consistency of the old user's latent vector
* So we can't really apply the adavanced model to the fresh users.

In [1]:
import os
import pandas as pd
import numpy as np

Download the movielens data (yeah, I known, again)

In [2]:
# %cd /data
# !!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# !!unzip ml-latest-small.zip

In [3]:
%ls /data/ml-latest-small/

README.txt   links.csv    movies.csv   ratings.csv  tags.csv


In [4]:
DATA = "/data/ml-latest-small/"
DIM = 100 # dimension for embedding
SEQ_LEN = 19

In [5]:
files = os.listdir(DATA)
files

['links.csv', 'tags.csv', 'ratings.csv', 'README.txt', 'movies.csv']

In [6]:
data = dict()
for f in files:
    if f[-3:]=="csv":
        data[f.split(".")[0]] = pd.read_csv(DATA+f)

### Check the data

In [7]:
from IPython.display import display
list(display(k,v.sample(5)) for k,v in data.items())

'links'

Unnamed: 0,movieId,imdbId,tmdbId
4938,7020,102721,14904.0
4119,5410,67756,811.0
7533,77846,118528,12219.0
7468,74630,52427,56931.0
2704,3391,94321,26827.0


'tags'

Unnamed: 0,userId,movieId,tag,timestamp
466,364,64957,Brad Pitt,1444531041
1219,547,118880,toplist14,1449757002
1244,547,134859,toplist15,1449755731
954,547,6515,tcm,1190475823
1193,547,112070,tivo,1476113970


'ratings'

Unnamed: 0,userId,movieId,rating,timestamp
68468,475,45666,4.0,1447327976
96315,642,1601,4.0,881526098
91484,607,1089,4.5,1118247427
87313,580,51255,4.0,1199493081
70822,494,34405,4.0,1342746088


'movies'

Unnamed: 0,movieId,title,genres
1268,1596,Career Girls (1997),Drama
1367,1732,"Big Lebowski, The (1998)",Comedy|Crime
3055,3822,"Girl on the Bridge, The (Fille sur le pont, La...",Drama|Romance
3703,4723,"Deep End, The (2001)",Drama
5811,26501,Choose Me (1984),Comedy|Romance


[None, None, None, None]

In [8]:
rate_df = data["ratings"]
len(rate_df)

100004

In [31]:
userId = list(set(data["ratings"]["userId"]))
movieId = list(set(data["ratings"]["movieId"]))
print("total number of users and movies:\t",len(userId),"\t",len(movieId))

total number of users and movies:	 671 	 9066


In [10]:
u2i = dict((v,k) for k,v in enumerate(userId))
m2i = dict((v,k) for k,v in enumerate(movieId))
i2u = dict((k,v) for k,v in enumerate(userId))
i2m = dict((k,v) for k,v in enumerate(movieId))

In [11]:
rate_df["movieIdx"] = rate_df.movieId.apply(lambda x:m2i[x]).astype(int)
rate_df["userIdx"] = rate_df.userId.apply(lambda x:u2i[x]).astype(int)
rate_df["rating"] = rate_df["rating"]/5

In [12]:
user_count = len(userId)
print(user_count)
valid_split = np.random.rand(user_count)>.9
train_idx = np.array(range(user_count))[~valid_split]
valid_idx = np.array(range(user_count))[valid_split]

671


In [13]:
train_df = rate_df[rate_df.userId.isin(train_idx)]
valid_df = rate_df[rate_df.userId.isin(valid_idx)]

Since user id mapping doesn't matter any more. 

It's easier to make a dataset with contineous user_id.

In [14]:
train_u2i = dict((v,k) for k,v in enumerate(set(train_df.userId)))
valid_u2i = dict((v,k) for k,v in enumerate(set(valid_df.userId)))
train_df["userId"] = train_df.userId.apply(lambda x:train_u2i[x])
valid_df["userId"] = valid_df.userId.apply(lambda x:valid_u2i[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [15]:
def get_user_trail(rate_df):
    return rate_df.sort_values(by=["userId","timestamp"]).groupby("userId")
    #gb.apply(lambda x:x.sample(n = 20, replace = False))
gb = get_user_trail(rate_df)
train_gb = get_user_trail(train_df)
valid_gb = get_user_trail(valid_df)

In [16]:
a = gb.apply(lambda x:x.sample(n = 20, replace = False))

In [17]:
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader

In [18]:
pick_k = np.array([0]*19 +[1])==1

In [19]:
def sample_split(x):
    sampled = x.sample(n = 20, replace = False)
    seq = sampled.head(19).sort_values(by="timestamp")
    y = sampled[pick_k]
    return pd.concat([seq,y])

class rnn_record(Dataset):
    def __init__(self, gb):
        self.gb = gb
        self.make_seq()
    
    def make_seq(self):
        self.all_seq = self.gb.apply(sample_split)
        
    def __len__(self):
        return len(self.gb)
        
    def __getitem__(self,idx):
        df = self.all_seq.loc[idx]
        seq = df.head(19)[["movieIdx","rating"]].values
        targ = df[pick_k][["movieIdx","rating"]].values
        targ_v, targ_y =targ[:,0], targ[:,1]
        return idx,seq,targ_v,targ_y
ds = rnn_record(gb)
train_ds = rnn_record(train_gb)
valid_ds = rnn_record(valid_gb)

In [20]:
# dl = DataLoader(ds,batch_size=32,shuffle=True)
# gen = iter(dl)

# idx,seq,targ_v,targ_y = next(gen)

# idx.size(),seq.size(),targ_v.size(),targ_y.size()

### Model

In [21]:
import torch
from torch import nn
from torch.nn import functional as F

In [33]:
class mLinkNet(nn.Module):
    def __init__(self, hidden_size,v_size):
        """
        mLinkNet, short for missing link net
        """
        super(mLinkNet,self).__init__()
        self.hidden_size = hidden_size
        self.v_size = v_size
        self.emb = nn.Embedding(v_size,hidden_size)
        
        self.rnn = nn.GRU(input_size = self.hidden_size+1,
                          hidden_size= hidden_size+1,
                          num_layers=1,
                          batch_first = True,
                          dropout=0)
        
        self.mlp = nn.Sequential(*[
            nn.Dropout(.3),
            nn.Linear(hidden_size*2 + 1, 256, bias=False),
            nn.BatchNorm1d(256),
            nn.Linear(256,1,bias=False),
            nn.Sigmoid(),
        ])
    
    def forward(self,seq,targ_v):
        seq_vec = torch.cat([self.emb(seq[:,0].long()),
                             seq[:,1].unsqueeze(-1).float()], dim=2)
        output, hn = self.rnn(seq_vec)
        x = torch.cat([hn.squeeze(0),self.emb(targ_v.long()).squeeze(1)],dim=1)
        return self.mlp(x)

In [23]:
mln = mLinkNet(hidden_size = DIM, v_size = len(movieId))

# mln(seq, targ_v)

In [24]:
from torch.optim import Adam
from ray.matchbox import Trainer
opt = Adam(mln.parameters())
loss_func = nn.MSELoss()

In [25]:
trainer = Trainer(train_ds, val_dataset=valid_ds, batch_size=16, print_on=3)

In [26]:
train_len = len(trainer.train_data)
valid_len = len(trainer.val_data)
def action(*args,**kwargs):
    # get data from data feeder
    idx,seq,targ_v,y = args[0]
    y = y.float()
    
    # Clear the Jacobian Matrix
    opt.zero_grad()
    
    # Predict y hat
    y_ = mln(seq, targ_v)
    # Calculate Loss
    loss = loss_func(y_,y)
    
    # Backward Propagation
    loss.backward()
    opt.step()
    # Mean Absolute Loss as print out metrics
    mae = torch.mean(torch.abs(y_-y))
    if kwargs["ite"] == train_len - 1: # resample the sequence
        trainer.train_data.dataset.make_seq()
    return {"loss":loss.item(),"mae":mae.item()}

def val_action(*args,**kwargs):
    """
    A validation step
    Exactly the same like train step, but no learning, only forward pass
    """
    idx,seq,targ_v,y = args[0]
    y = y.float()
    
    y_ = mln(seq, targ_v)
    
    loss = loss_func(y_,y)
    mae = torch.mean(torch.abs(y_-y))
    if kwargs["ite"] == valid_len - 1:
        trainer.val_data.dataset.make_seq()
    return {"loss":loss.item(),"mae":mae.item()}
trainer.action  = action
trainer.val_action  = val_action

In [27]:
trainer.train(50)

⭐[ep_0_i_35]	loss	0.065✨	mae	0.221: 100%|██████████| 38/38 [00:03<00:00, 11.48it/s]
😎[val_ep_0_i_4]	loss	0.069😂	mae	0.216: 100%|██████████| 5/5 [00:00<00:00, 13.04it/s]
⭐[ep_1_i_35]	loss	0.076✨	mae	0.235: 100%|██████████| 38/38 [00:03<00:00, 11.72it/s]
😎[val_ep_1_i_4]	loss	0.048😂	mae	0.178: 100%|██████████| 5/5 [00:00<00:00, 15.16it/s]
⭐[ep_2_i_35]	loss	0.058✨	mae	0.193: 100%|██████████| 38/38 [00:02<00:00, 13.42it/s]
😎[val_ep_2_i_4]	loss	0.069😂	mae	0.208: 100%|██████████| 5/5 [00:00<00:00, 15.47it/s]
⭐[ep_3_i_35]	loss	0.048✨	mae	0.160: 100%|██████████| 38/38 [00:02<00:00, 12.90it/s]
😎[val_ep_3_i_4]	loss	0.037😂	mae	0.150: 100%|██████████| 5/5 [00:00<00:00, 14.24it/s]
⭐[ep_4_i_35]	loss	0.048✨	mae	0.168: 100%|██████████| 38/38 [00:02<00:00, 13.37it/s]
😎[val_ep_4_i_4]	loss	0.033😂	mae	0.146: 100%|██████████| 5/5 [00:00<00:00, 15.08it/s]
⭐[ep_5_i_35]	loss	0.036✨	mae	0.146: 100%|██████████| 38/38 [00:02<00:00, 13.11it/s]
😎[val_ep_5_i_4]	loss	0.058😂	mae	0.184: 100%|██████████| 5/5 [00:00<00:0