##Import libraries

In [7]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import time

In [39]:
#Get the data (session_id, item_id, date, Datetime, Timestamp)
url01 = 'https://raw.githubusercontent.com/anhphuongnguyenquynh/session-based-recsys-fashion/main/dataset_filtered/train_session01_seq.csv'

Train/Test split

In [40]:
dataset01 = pd.read_csv(url01, index_col = 0, parse_dates=["date"])
dataset01 = dataset01.dropna()
dataset01 = dataset01.reset_index()
#fraction
dataset = dataset01.sample(frac=0.6)

In [None]:
dataset.shape

In [41]:
dataset.head(5)

Unnamed: 0,session_id,item_id,date,timestamp,month,weekYear,season,duration
439084,3771095.0,9687.0,2020-09-25 21:47:21.686,1601070000.0,9.0,39.0,3.0,1.0
461152,3961232.0,14858.0,2021-01-23 20:30:51.014,1611434000.0,1.0,3.0,4.0,1.0
228847,1965170.0,23139.0,2021-01-21 10:18:20.990,1611224000.0,1.0,3.0,4.0,1.0
44808,385278.0,20952.0,2020-12-11 17:47:03.767,1607709000.0,12.0,50.0,4.0,1.0
82672,711366.0,6187.0,2021-04-26 20:25:04.436,1619469000.0,4.0,17.0,1.0,1.0


In [43]:
# Thống kê số lượt xuất hiện và lọc ra các ItemId có trên 5 lượt xuất hiện
df_item_count = dataset[['item_id', 'session_id']].groupby('item_id').count().sort_values(by = 'session_id', ascending = False)
df_item_count.columns = ['CountItemId']
df_item_count_5 = df_item_count[df_item_count['CountItemId'] < 5]
# Lọc khỏi dataset những ItemId có ít hơn 5 lượt xuất hiện
dataset = dataset[~dataset['item_id'].isin(list(df_item_count_5.index))]

In [44]:
#Train split data test
random_selection = np.random.rand(len(dataset.index)) <= 0.85
train_data = dataset[random_selection]
test_data = dataset[~random_selection]

In [45]:
# Lấy ra dictionary có dạng {SessionId:{ItemId1:Timestamp1, ItemId2:Timestamp2, ...}}
train_sess = train_data[['session_id', 'item_id', 'timestamp']].groupby('session_id').apply(lambda x: dict(zip(x['item_id'], x['timestamp'])))
test_sess = test_data[['session_id', 'item_id', 'timestamp']].groupby('session_id').apply(lambda x: dict(zip(x['item_id'], x['timestamp'])))

In [46]:
train_sess

session_id
18.0                                  {4026.0: 1598469347.232}
108.0                                 {4816.0: 1591462767.995}
154.0                                {21152.0: 1587713021.545}
170.0                                {20691.0: 1583672243.604}
181.0                                 {16417.0: 1585940777.84}
                                   ...                        
4439942.0                            {17609.0: 1600789639.946}
4439964.0                            {11397.0: 1585037643.053}
4439973.0                            {21328.0: 1604064770.528}
4439990.0    {17429.0: 1598096360.419, 22093.0: 1598099595....
4440001.0    {25129.0: 1604076172.451, 25273.0: 1604100295....
Length: 206781, dtype: object

#Preprocessing data

In [47]:
sessDict = {214834865: '1396808691.295000', 214706441: '1396808691.426000', 214820225: '1396808691.422000'}

def _preprocess_sess_dict(sessDict):
    sessDictTime = dict([(v, k) for (k, v) in sessDict.items()])
    sessSort = sorted(sessDictTime.items(), reverse = False)
    times = [item[0] for item in sessSort]
    itemIds = [item[1] for item in sessSort]
    inp_seq = []
    labels = []
    inp_time = []

    for i in range(len(sessSort)):
        if i >= 1:
            inp_seq += [itemIds[:i]]
            labels += [itemIds[i]]
            inp_time += [times[i]]
    return inp_seq, inp_time, labels, itemIds

inp_seq, inp_time, labels, itemIds = _preprocess_sess_dict(sessDict)
print('input sequences: ', inp_seq)
print('input times: ', inp_time)
print('targets: ', labels)
print('sequence: ', itemIds)

input sequences:  [[214834865], [214834865, 214820225]]
input times:  ['1396808691.422000', '1396808691.426000']
targets:  [214820225, 214706441]
sequence:  [214834865, 214820225, 214706441]


In [48]:
#Khởi tạo chuỗi input và output cho toàn bộ các session
def _preprocess_data(data_sess):
    inp_seqs = []
    inp_times = []
    labels = []
    sequences = []
    sessIds = list(data_sess.index)
    for sessId in sessIds:
        sessDict = data_sess.loc[sessId]
        inp_seq, inp_time, label, sequence = _preprocess_sess_dict(sessDict)
        inp_seqs += inp_seq
        inp_times += inp_time
        labels += label
        sequences += sequence
    return inp_seqs, inp_times, labels, sequences

train_inp_seqs, train_inp_dates, train_labs, train_sequences = _preprocess_data(train_sess)
test_inp_seqs, test_inp_dates, test_labs, test_sequences = _preprocess_data(test_sess)

train = (train_inp_seqs, train_labs)
test = (test_inp_seqs, test_labs)

print('Done.')

Done.


Lưu dữ liệu train/test

In [49]:
import pickle
import os

def _save_file(filename, obj):
  with open(filename, 'wb') as fn:
    pickle.dump(obj, fn)

# Tạo folder yoochoose-data-4 để lưu dữ liệu train/test nếu chưa tồn tại
if not os.path.exists('dressipi_data_train0.2'):
  os.mkdir('dressipi_data_train0.2')

# Lưu train/test
_save_file('dressipi_data_train0.2/train02.pkl', train)
_save_file('dressipi_data_train0.2/test02.pkl', test)

In [50]:
import pickle

def _load_file(filename):
  with open(filename, 'rb') as fn:
    data = pickle.load(fn)
  return data

# Load dữ liệu train/test từ folder
train = _load_file('dressipi_data_train0.2/train02.pkl')
test = _load_file('dressipi_data_train0.2/test02.pkl')

# Build vocabulary

In [51]:
# Các token default
PAD_token = 0  # token padding cho câu ngắn

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.item2index = {}
        self.item2count = {}
        self.index2item = {PAD_token: "PAD"}
        self.num_items = 1  # số lượng mặc định ban đầu là 1 ứng với PAD_token

    def addSenquence(self, data):
        for sequence in data:
          for item in sequence:
              self.addItem(item)

    # Thêm một item vào hệ thống
    def addItem(self, item):
        if item not in self.item2index:
            self.item2index[item] = self.num_items
            self.item2count[item] = 1
            self.index2item[self.num_items] = item
            self.num_items += 1
        else:
            self.item2count[item] += 1

    # Loại các item dưới ngưỡng xuất hiện min_count
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_items = []

        for k, v in self.item2count.items():
            if v >= min_count:
                keep_items.append(k)

        print('keep_items {} / {} = {:.4f}'.format(
            len(keep_items), len(self.item2index), len(keep_items) / len(self.item2index)
        ))

        # Khởi tạo lại từ điển
        self.item2index = {}
        self.item2count = {}
        self.index2item = {PAD_token: "PAD"}
        self.num_items = 1

        # Thêm các items vào từ điển
        for item in keep_items:
            self.addItem(item)

    # Hàm convert sequence về chuỗi các indices
    def _seqItem2seqIndex(self, x):
        return [voc.item2index[item] if item in voc.item2index else 0 for item in x]

In [52]:
#Lấy toàn bộ list các itemIds trong các session
from itertools import chain
seq_targets = [train[1]] + [test[1]]
sessionIds = list(chain.from_iterable(seq_targets))
sessionIds = set(sessionIds)
print('Number of sessionIds: ', len(sessionIds))

Number of sessionIds:  11352


In [53]:
#Khởi tạo vocabulary cho bộ dữ liệu
voc = Voc('DictItemId')
voc.addSenquence(seq_targets)

# Convert thử nghiệm một sequence itemIds
print('sequence of itemIds: ', train[0][7])
print('converted indices: ', voc._seqItem2seqIndex(train[0][7]))

sequence of itemIds:  [2961.0]
converted indices:  [6801]


In [54]:
#Chuyển dữ liệu train, test từ item sang indices của item
train_x_index = [voc._seqItem2seqIndex(seq) for seq in train[0]]
test_x_index = [voc._seqItem2seqIndex(seq) for seq in test[0]]
train_y_index = voc._seqItem2seqIndex(train[1])
test_y_index = voc._seqItem2seqIndex(test[1])
train_index = (train_x_index, train_y_index)
test_index = (test_x_index, test_y_index)

Phân chia tập Train/Test/Validation

In [55]:
def load_data(root='', valid_portion=0.1, maxlen=19, sort_by_len=False, train_set=None, test_set=None):
    """Load dataset từ root
    root: folder dữ liệu train, trong trường hợp train_set, test_set tồn tại thì không sử dụng train_set và test_set
    valid_portion: tỷ lệ phân chia dữ liệu validation/train
    maxlen: độ dài lớn nhất của sequence
    sort_by_len: có sort theo chiều dài các session trước khi chia hay không?
    train_set: training dataset
    test_set:  test dataset
    """

    # Load the dataset
    if train_set is None and test_set is None:
        path_train_data = os.path.join(root, 'train.pkl')
        path_test_data = os.path.join(root, 'test.pkl')
        with open(path_train_data, 'rb') as f1:
            train_set = pickle.load(f1)

        with open(path_test_data, 'rb') as f2:
            test_set = pickle.load(f2)

    if maxlen:
        new_train_set_x = []
        new_train_set_y = []
        # Lọc dữ liệu sequence đến maxlen
        for x, y in zip(train_set[0], train_set[1]):
            if len(x) < maxlen:
                new_train_set_x.append(x)
                new_train_set_y.append(y)
            else:
                new_train_set_x.append(x[:maxlen])
                new_train_set_y.append(y)
        train_set = (new_train_set_x, new_train_set_y)
        del new_train_set_x, new_train_set_y

        new_test_set_x = []
        new_test_set_y = []
        for xx, yy in zip(test_set[0], test_set[1]):
            if len(xx) < maxlen:
                new_test_set_x.append(xx)
                new_test_set_y.append(yy)
            else:
                new_test_set_x.append(xx[:maxlen])
                new_test_set_y.append(yy)
        test_set = (new_test_set_x, new_test_set_y)
        del new_test_set_x, new_test_set_y

    # phân chia tập train thành train và validation
    train_set_x, train_set_y = train_set
    n_samples = len(train_set_x)
    sidx = np.arange(n_samples, dtype='int32')
    np.random.shuffle(sidx)
    n_train = int(np.round(n_samples * (1. - valid_portion)))
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]

    (test_set_x, test_set_y) = test_set

    # Trả về indices thứ tự độ dài của mỗi phần tử trong seq
    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    # Sắp xếp session theo độ dài tăng dần
    if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x = [test_set_x[i] for i in sorted_index]
        test_set_y = [test_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]

    train = (train_set_x, train_set_y)
    valid = (valid_set_x, valid_set_y)
    test = (test_set_x, test_set_y)
    return train, valid, test

Data Loader

In [56]:
from torch.utils.data import Dataset

class RecSysDataset(Dataset):
    """define the pytorch Dataset class for yoochoose and diginetica datasets.
    """
    def __init__(self, data):
        self.data = data
        print('-'*50)
        print('Dataset info:')
        print('Number of sessions: {}'.format(len(data[0])))
        print('-'*50)

    def __getitem__(self, index):
        session_items = self.data[0][index]
        target_item = self.data[1][index]
        return session_items, target_item

    def __len__(self):
        return len(self.data[0])

In [57]:
#Hàm phụ trợ
import torch

def collate_fn(data):
    """
    Hàm số này sẽ được sử dụng để pad session về max length
    Args:
      data: batch truyền vào
    return:
      batch data đã được pad length có shape maxlen x batch_size
    """
    # Sort batch theo độ dài của input_sequence từ cao xuống thấp
    data.sort(key=lambda x: len(x[0]), reverse=True)
    lens = [len(sess) for sess, label in data]
    labels = []
    # Padding batch size
    padded_sesss = torch.zeros(len(data), max(lens)).long()
    for i, (sess, label) in enumerate(data):
        padded_sesss[i,:lens[i]] = torch.LongTensor(sess)
        labels.append(label)

    # Transpose dữ liệu từ batch_size x maxlen --> maxlen x batch_size
    padded_sesss = padded_sesss.transpose(0,1)
    return padded_sesss, torch.tensor(labels).long(), lens

#Metric

In [58]:
import torch

def get_recall(indices, targets):
    """
    Tính toán chỉ số recall cho một tập hợp predictions và targets
    Args:
        indices (Bxk): torch.LongTensor. top-k indices được dự báo từ mô hình model.
        targets (B): torch.LongTensor. actual target indices.
    Returns:
        recall (float): the recall score
    """
    # copy targets k lần để trở thành kích thước Bxk
    targets = targets.view(-1, 1).expand_as(indices)
    # so sánh targets với indices để tìm ra vị trí mà khách hàng sẽ hit.
    hits = (targets == indices).to(device)
    hits = hits.double()
    if targets.size(0) == 0:
        return 0
    # Đếm số hit
    n_hits = torch.sum(hits)
    recall = n_hits / targets.size(0)
    return recall


def get_mrr(indices, targets):
    """
    Tính toán chỉ số MRR cho một tập hợp predictions và targets
    Args:
        indices (Bxk): torch.LongTensor. top-k indices được dự báo từ mô hình model.
        targets (B): torch.LongTensor. actual target indices.
    Returns:
        recall (float): the MRR score
    """
    tmp = targets.view(-1, 1)
    targets = tmp.expand_as(indices)
    hits = (targets == indices).to(device)
    hits = hits.double()
    if hits.sum() == 0:
      return 0
    argsort = []
    for i in np.arange(hits.shape[0]):
      index_col = torch.where(hits[i, :] == 1)[0]+1
      if index_col.shape[0] != 0:
        argsort.append(index_col.double())
    inv_argsort = [1/item for item in argsort]
    mrr = sum(inv_argsort)/hits.shape[0]
    return mrr


def evaluate(logits, targets, k=20):
    """
    Đánh giá model sử dụng Recall@K, MRR@K scores.
    Args:
        logits (B,C): torch.LongTensor. giá trị predicted logit cho itemId tiếp theo.
        targets (B): torch.LongTensor. actual target indices.
    Returns:
        recall (float): the recall score
        mrr (float): the mrr score
    """
    # Tìm ra indices của topk lớn nhất các giá trị dự báo.
    _, indices = torch.topk(logits, k, -1)
    recall = get_recall(indices, targets)
    mrr = get_mrr(indices, targets)
    return recall, mrr

In [59]:
import torch
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

logits = torch.tensor([[0.1, 0.2, 0.7],
                       [0.4, 0.1, 0.5],
                       [0.1, 0.2, 0.7]]).to(device)

targets = torch.tensor([1, 2, 2]).to(device)

evaluate(logits = logits, targets = targets, k = 2)

(tensor(1., dtype=torch.float64), tensor([0.8333], dtype=torch.float64))

Model NARM

In [60]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class NARM(nn.Module):
    def __init__(self, hidden_size, n_items, embedding_dim, n_layers=1, dropout=0.25):
        super(NARM, self).__init__()
        self.hidden_size = hidden_size
        self.n_items = n_items
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(self.n_items, self.embedding_dim, padding_idx = 0)
        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        # set bidirectional = True for bidirectional
        # https://pytorch.org/docs/stable/nn.html?highlight=gru#torch.nn.GRU to get more information
        self.gru = nn.GRU(input_size = hidden_size, # number of expected feature of input x
                          hidden_size = hidden_size, # number of expected feature of hidden state
                          num_layers = n_layers, # number of GRU layers
                          dropout=(0 if n_layers == 1 else dropout), # dropout probability apply in encoder network
                          bidirectional=True # one or two directions.
                         )
        self.emb_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(self.embedding_dim, self.hidden_size, self.n_layers)
        self.a_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.a_2 = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.v_t = nn.Linear(self.hidden_size, 1, bias=False)
        self.ct_dropout = nn.Dropout(0.5)
        self.b = nn.Linear(self.embedding_dim, 2 * self.hidden_size, bias=False)
        self.sf = nn.Softmax()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def forward(self, input_seq, input_lengths, hidden=None):
        """
        input_seq: Batch input_sequence. Shape: max_len x batch_size
        input_lengths: Batch input lengths. Shape: batch_size
        """
        # Step 1: Convert sequence indexes to embeddings
        # shape: (max_length , batch_size , hidden_size)
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module. Padding zero when length less than max_length of input_lengths.
        # shape: (max_length , batch_size , hidden_size)
        packed = pack_padded_sequence(embedded, input_lengths)

        # Step 2: Forward packed through GRU
        # outputs is output of final GRU layer
        # hidden is concatenate of all hidden states corresponding with each time step.
        # outputs shape: (max_length , batch_size , hidden_size x num_directions)
        # hidden shape: (n_layers x num_directions , batch_size , hidden_size)
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding. Revert of pack_padded_sequence
        # outputs shape: (max_length , batch_size , hidden_size x num_directions)
        outputs, length = pad_packed_sequence(outputs)

        # Step 3: Global Encoder & Local Encoder
        # num_directions = 1 -->
        # outputs shape:(max_length , batch_size , hidden_size)
        # hidden shape: (n_layers , batch_size , hidden_size)
        # lấy hidden state tại time step cuối cùng
        ht = hidden[-1]
        # reshape outputs
        outputs = outputs.permute(1, 0, 2) # [batch_size, max_length, hidden_size]
        c_global = ht
        # Flatten outputs thành shape: [batch_size, max_length, hidden_size]
        gru_output_flatten = outputs.contiguous().view(-1, self.hidden_size)
        # Thực hiện một phép chiếu linear projection để tạo các latent variable có shape [batch_size, max_length, hidden_size]
        q1 = self.a_1(gru_output_flatten).view(outputs.size())
        # Thực hiện một phép chiếu linear projection để tạo các latent variable có shape [batch_size, max_length, hidden_size]
        q2 = self.a_2(ht)
        # Ma trận mask đánh dấu vị trí khác 0 trên padding sequence.
        mask = torch.where(input_seq.permute(1, 0) > 0, torch.tensor([1.], device = self.device), torch.tensor([0.], device = self.device)) # batch_size x max_len
        # Điều chỉnh shape
        q2_expand = q2.unsqueeze(1).expand_as(q1) # shape [batch_size, max_len, hidden_size]
        q2_masked = mask.unsqueeze(2).expand_as(q1) * q2_expand # batch_size x max_len x hidden_size
        # Tính trọng số alpha đo lường similarity giữa các hidden state
        alpha = self.v_t(torch.sigmoid(q1 + q2_masked).view(-1, self.hidden_size)).view(mask.size()) # batch_size x max_len
        alpha_exp = alpha.unsqueeze(2).expand_as(outputs) # batch_size x max_len x hidden_size
        # Tính linear combinition của các hidden state
        c_local = torch.sum(alpha_exp * outputs, 1) # (batch_size x hidden_size)

        # Véc tơ combinition tổng hợp
        c_t = torch.cat([c_local, c_global], 1) # batch_size x (2*hidden_size)
        c_t = self.ct_dropout(c_t)
        # Tính scores

        # Step 4: Decoder
        # embedding cho toàn bộ các item
        item_indices = torch.arange(self.n_items).to(device) # 1 x n_items
        item_embs = self.embedding(item_indices) # n_items x embedding_dim
        # reduce dimension by bi-linear projection
        B = self.b(item_embs).permute(1, 0) # (2*hidden_size) x n_items
        scores = torch.matmul(c_t, B) # batch_size x n_items
        # scores = self.sf(scores)
        return scores

In [61]:
#Kiểm tra model NARM
# Thử nghiệm model bằng cách giả lập 1 input và thực hiện quá trình feed forward
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
hidden_size = 3
n_layers = 7
# embedding = nn.Embedding(11000, hidden_size)
input_variable = torch.tensor([[  66,  369,   66, 1272],
                                [ 567,  183,   28,  616],
                                [ 392, 1558, 1143,  175],
                                [ 394,   31,   31, 5558],
                                [   0,    0,    0,    0]]).to(device)

lengths =  torch.tensor([5, 5, 5, 5]).to(device)
print('input_seq: \n', input_variable)
print('input_lengths: \n', lengths)
model_test = NARM(hidden_size = hidden_size, n_items  = 100000, embedding_dim = 100, n_layers=1, dropout=0.25).to(device)
print('model phrase: \n', model_test)
scores = model_test.forward(input_seq = input_variable, input_lengths = lengths)
print('probability distribution: ', scores.shape)

input_seq: 
 tensor([[  66,  369,   66, 1272],
        [ 567,  183,   28,  616],
        [ 392, 1558, 1143,  175],
        [ 394,   31,   31, 5558],
        [   0,    0,    0,    0]])
input_lengths: 
 tensor([5, 5, 5, 5])
model phrase: 
 NARM(
  (embedding): Embedding(100000, 100, padding_idx=0)
  (gru): GRU(100, 3)
  (emb_dropout): Dropout(p=0.25, inplace=False)
  (a_1): Linear(in_features=3, out_features=3, bias=False)
  (a_2): Linear(in_features=3, out_features=3, bias=False)
  (v_t): Linear(in_features=3, out_features=1, bias=False)
  (ct_dropout): Dropout(p=0.5, inplace=False)
  (b): Linear(in_features=100, out_features=6, bias=False)
  (sf): Softmax(dim=None)
)
probability distribution:  torch.Size([4, 100000])


#Validation

In [62]:
def validate(valid_loader, model):
    model.eval()
    recalls = []
    mrrs = []
    with torch.no_grad():
        for seq, target, lens in valid_loader:
            seq = seq.to(device)
            target = target.to(device)
            outputs = model(seq, lens)
            logits = F.softmax(outputs, dim = 1)
            recall, mrr = evaluate(logits, target, k = args['topk'])
            recalls.append(recall)
            mrrs.append(mrr)

    mean_recall = torch.mean(torch.stack(recalls))
    # mean_mrr = torch.mean(torch.stack(mrrs))
    return mean_recall #, mean_mrr

#Training model

In [63]:
import os
import time
import random
import argparse
import pickle
import numpy as np
from tqdm import tqdm
from os.path import join

import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.autograd import Variable
from torch.backends import cudnn

args = {
    'dataset_path':'../input/dressipi_recsys2022_dataset/train_sessions.csv',
    'batch_size': 256,
    'hidden_size': 100,
    'embed_dim': 50,
    'epoch': 5,
    'lr':0.01,
    'lr_dc':0.1,
    'lr_dc_step':80,
    'test':None,
    'topk':100,
    'valid_portion':0.1
}

here = os.path.dirname(os.getcwd())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def main():
    print('Loading data...')
    train_data, valid_data, test_data = load_data(train_set=train_index, test_set=test_index)
    train_data = RecSysDataset(train_data)
    valid_data = RecSysDataset(valid_data)
    test_data = RecSysDataset(test_data)
    train_loader = DataLoader(train_data, batch_size = args['batch_size'], shuffle = True, collate_fn = collate_fn)
    valid_loader = DataLoader(valid_data, batch_size = args['batch_size'], shuffle = False, collate_fn = collate_fn)
    test_loader = DataLoader(test_data, batch_size = args['batch_size'], shuffle = False, collate_fn = collate_fn)
    print('Complete load data!')
    n_items = voc.num_items
    model = NARM(hidden_size = args['hidden_size'], n_items = n_items, embedding_dim = args['embed_dim'], n_layers=2, dropout=0.25).to(device)
    print('complete load model!')

    if args['test'] == 'store_true':
        ckpt = torch.load('latest_checkpoint.pth.tar')
        model.load_state_dict(ckpt['state_dict'])
        recall, mrr = validate(test_loader, model)
        print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args['topk'], recall, args['topk'], mrr))
        return model

    optimizer = optim.Adam(model.parameters(), args['lr'])
    criterion = nn.CrossEntropyLoss()
    scheduler = StepLR(optimizer, step_size = args['lr_dc_step'], gamma = args['lr_dc'])

    print('start training!')
    for epoch in tqdm(range(args['epoch'])):
        # train for one epoch
        trainForEpoch(train_loader, model, optimizer, epoch, args['epoch'], criterion, log_aggr = 1000)
        scheduler.step(epoch = epoch)
        recall = validate(valid_loader, model)
        print('Epoch {} validation: Recall@{}: {:.4f}\n'.format(epoch, args['topk'], recall, args['topk']))

        # store best loss and save a model checkpoint
        ckpt_dict = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        torch.save(ckpt_dict, 'latest_checkpoint.pth.tar')
    return model


def trainForEpoch(train_loader, model, optimizer, epoch, num_epochs, criterion, log_aggr=1000):
    model.train()

    sum_epoch_loss = 0

    start = time.time()
    for i, (seq, target, lens) in enumerate(train_loader):
        seq = seq.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        outputs = model(seq, lens)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()

        loss_val = loss.item()
        sum_epoch_loss += loss_val

        iter_num = epoch * len(train_loader) + i + 1

        if i % log_aggr == 0:
            print('[TRAIN] epoch %d/%d  observation %d/%d batch loss: %.4f (avg %.4f) (%.2f im/s)'
                % (epoch + 1, num_epochs, i, len(train_loader), loss_val, sum_epoch_loss / (i + 1),
                  len(seq) / (time.time() - start)))

        start = time.time()

model = main()

Loading data...
--------------------------------------------------
Dataset info:
Number of sessions: 40723
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 4525
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 1779
--------------------------------------------------
Complete load data!
complete load model!
start training!


  0%|          | 0/5 [00:00<?, ?it/s]

[TRAIN] epoch 1/5  observation 0/160 batch loss: 9.6058 (avg 9.6058) (25.00 im/s)


 20%|██        | 1/5 [00:20<01:22, 20.63s/it]

Epoch 0 validation: Recall@100: 0.0277

[TRAIN] epoch 2/5  observation 0/160 batch loss: 9.1972 (avg 9.1972) (28.62 im/s)


 40%|████      | 2/5 [00:40<01:00, 20.21s/it]

Epoch 1 validation: Recall@100: 0.0568

[TRAIN] epoch 3/5  observation 0/160 batch loss: 8.9948 (avg 8.9948) (44.70 im/s)


 60%|██████    | 3/5 [01:01<00:40, 20.35s/it]

Epoch 2 validation: Recall@100: 0.0587

[TRAIN] epoch 4/5  observation 0/160 batch loss: 8.7970 (avg 8.7970) (53.42 im/s)


 80%|████████  | 4/5 [01:21<00:20, 20.35s/it]

Epoch 3 validation: Recall@100: 0.0576

[TRAIN] epoch 5/5  observation 0/160 batch loss: 8.5046 (avg 8.5046) (44.31 im/s)


100%|██████████| 5/5 [01:43<00:00, 20.72s/it]

Epoch 4 validation: Recall@100: 0.0593






In [64]:
os.listdir()
# chứa 'latest_checkpoint.pth.tar'

['.config',
 'dressipi_data_train0.2',
 'latest_checkpoint.pth.tar',
 'sample_data']

In [66]:
import torch
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
PATH = 'latest_checkpoint.pth.tar'
model = NARM(hidden_size = args['hidden_size'], n_items = 13353, embedding_dim = args['embed_dim'], n_layers=2, dropout=0.25).to(device)
optimizer = optim.Adam(params = model.parameters(), lr=0.001)

checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
epoch = checkpoint['epoch']

model.eval()

RuntimeError: Error(s) in loading state_dict for NARM:
	size mismatch for embedding.weight: copying a param with shape torch.Size([11353, 50]) from checkpoint, the shape in current model is torch.Size([13353, 50]).

In [67]:
# Lựa chọn ngẫu nhiên một session trên test
import numpy as np
i = np.random.randint(0, len(test_index[0]))
x = [test_index[0][i]]
y = [test_index[1][i]]
print('item indexes sequence input: ', x)
print('item index next output: ', y)

item indexes sequence input:  [[834]]
item index next output:  [1408]


In [68]:
# Step 1: Khởi tạo test_loader để biến đổi dữ liệu session đưa vào mô hình
test_data = RecSysDataset([x, y])
test_loader = DataLoader(test_data, batch_size = args['batch_size'], shuffle = False, collate_fn = collate_fn)

# Step 2: Dự báo các indice tiếp theo mà khách hàng có khả năng click
def _preddict(loader, model):
    model.eval()
    recalls = []
    mrrs = []
    j = 1
    with torch.no_grad():
      for seq, target, lens in loader:
        seq = seq.to(device)
        target = target.to(device)
        outputs = model(seq, lens)
        logits = F.softmax(outputs, dim = 1)
        _, indices = torch.topk(logits, 20, -1)
        print('Is next clicked item in top 20 suggestions: ', (target in indices))
        print('Top 20 next item indices suggested: ')
    return indices

_preddict(test_loader, model)

--------------------------------------------------
Dataset info:
Number of sessions: 1
--------------------------------------------------
Is next clicked item in top 20 suggestions:  False
Top 20 next item indices suggested: 


tensor([[ 6377,   372,  1074,   691,  3560,  9063,  6265, 11140, 12450, 12327,
           949,   241,  8819, 11417, 12382,   263,  5045,  3197, 10068,  2561]])