In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pickle
import torch
from torch.utils.data import Dataset
import numpy as np


def load_data(root, valid_portion=0.1, maxlen=19, sort_by_len=False):
    '''Loads the dataset

    :type path: String
    :param path: The path to the dataset (here RSC2015)
    :type n_items: int
    :param n_items: The number of items.
    :type valid_portion: float
    :param valid_portion: The proportion of the full train set used for
        the validation set.
    :type maxlen: None or positive int
    :param maxlen: the max sequence length we use in the train/valid set.
    :type sort_by_len: bool
    :name sort_by_len: Sort by the sequence lenght for the train,
        valid and test set. This allow faster execution as it cause
        less padding per minibatch. Another mechanism must be used to
        shuffle the train set at each epoch.

    '''

    # Load the dataset
    path_train_data = root + 'train.txt'
    path_test_data = root + 'test.txt'
    with open(path_train_data, 'rb') as f1:
        train_set = pickle.load(f1)

    with open(path_test_data, 'rb') as f2:
        test_set = pickle.load(f2)

    if maxlen:
        new_train_set_x = []
        new_train_set_y = []
        for x, y in zip(train_set[0], train_set[1]):
            if len(x) < maxlen:
                new_train_set_x.append(x)
                new_train_set_y.append(y)
            else:
                new_train_set_x.append(x[:maxlen])
                new_train_set_y.append(y)
        train_set = (new_train_set_x, new_train_set_y)
        del new_train_set_x, new_train_set_y

        new_test_set_x = []
        new_test_set_y = []
        for xx, yy in zip(test_set[0], test_set[1]):
            if len(xx) < maxlen:
                new_test_set_x.append(xx)
                new_test_set_y.append(yy)
            else:
                new_test_set_x.append(xx[:maxlen])
                new_test_set_y.append(yy)
        test_set = (new_test_set_x, new_test_set_y)
        del new_test_set_x, new_test_set_y

    # split training set into validation set
    train_set_x, train_set_y = train_set
    n_samples = len(train_set_x)
    sidx = np.arange(n_samples, dtype='int32')
    np.random.shuffle(sidx)
    n_train = int(np.round(n_samples * (1. - valid_portion)))
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]

    (test_set_x, test_set_y) = test_set

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x = [test_set_x[i] for i in sorted_index]
        test_set_y = [test_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]

    train = (train_set_x, train_set_y)
    valid = (valid_set_x, valid_set_y)
    test = (test_set_x, test_set_y)

    return train, valid, test


class RecSysDataset(Dataset):
    """define the pytorch Dataset class for yoochoose and diginetica datasets.
    """
    def __init__(self, data):
        self.data = data
        print('-'*50)
        print('Dataset info:')
        print('Number of sessions: {}'.format(len(data[0])))
        print('-'*50)
        
    def __getitem__(self, index):
        session_items = self.data[0][index]
        target_item = self.data[1][index]
        return session_items, target_item

    def __len__(self):

        return len(self.data[0])

In [3]:
def collate_fn(data):
    """This function will be used to pad the sessions to max length
       in the batch and transpose the batch from 
       batch_size x max_seq_len to max_seq_len x batch_size.
       It will return padded vectors, labels and lengths of each session (before padding)
       It will be used in the Dataloader
    """
    data.sort(key=lambda x: len(x[0]), reverse=True)
    lens = [len(sess) for sess, label in data]
    labels = []
    padded_sesss = torch.zeros(len(data), max(lens)).long()
    for i, (sess, label) in enumerate(data):
        padded_sesss[i,:lens[i]] = torch.LongTensor(sess)
        labels.append(label)
    
    padded_sesss = padded_sesss.transpose(0,1)
    return padded_sesss, torch.tensor(labels).long(), lens

In [4]:

import os
import time
import random
import argparse
import pickle
import numpy as np
from tqdm import tqdm
from os.path import join

import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.autograd import Variable
from torch.backends import cudnn


class Parameters():
    def __init__(self):
      self.dataset_path = "/content/drive/MyDrive/RecSys/FinalProject/Neural-Attentive-Session-Based-Recommendation-PyTorch-master/datasets/diginetica/"
      self.batch_size = 512
      self.hidden_size = 100
      self.embed_dim = 50
      self.valid_portion = 0.1
      self.topk = 20


# here = os.path.dirname(os.path.abspath(__file__))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class NARM(nn.Module):
    """Neural Attentive Session Based Recommendation Model Class

    Args:
        n_items(int): the number of items
        hidden_size(int): the hidden size of gru
        embedding_dim(int): the dimension of item embedding
        batch_size(int): 
        n_layers(int): the number of gru layers

    """
    def __init__(self, n_items, hidden_size, embedding_dim, batch_size, n_layers = 1):
        super(NARM, self).__init__()
        self.n_items = n_items
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.n_layers = n_layers
        self.embedding_dim = embedding_dim
        self.emb = nn.Embedding(self.n_items, self.embedding_dim, padding_idx = 0)
        self.emb_dropout = nn.Dropout(0.25)
        self.gru = nn.GRU(self.embedding_dim, self.hidden_size, self.n_layers)
        self.a_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.a_2 = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.v_t = nn.Linear(self.hidden_size, 1, bias=False)
        self.ct_dropout = nn.Dropout(0.5)
        self.b = nn.Linear(self.embedding_dim, 2 * self.hidden_size, bias=False)
        #self.sf = nn.Softmax()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def forward(self, seq, lengths):
        hidden = self.init_hidden(seq.size(1))
        embs = self.emb_dropout(self.emb(seq))
        embs = pack_padded_sequence(embs, lengths)
        gru_out, hidden = self.gru(embs, hidden)
        gru_out, lengths = pad_packed_sequence(gru_out)

        # fetch the last hidden state of last timestamp
        ht = hidden[-1]
        gru_out = gru_out.permute(1, 0, 2)

        c_global = ht
        q1 = self.a_1(gru_out.contiguous().view(-1, self.hidden_size)).view(gru_out.size())  
        q2 = self.a_2(ht)

        mask = torch.where(seq.permute(1, 0) > 0, torch.tensor([1.], device = self.device), torch.tensor([0.], device = self.device))
        q2_expand = q2.unsqueeze(1).expand_as(q1)
        q2_masked = mask.unsqueeze(2).expand_as(q1) * q2_expand

        alpha = self.v_t(torch.sigmoid(q1 + q2_masked).view(-1, self.hidden_size)).view(mask.size())
        c_local = torch.sum(alpha.unsqueeze(2).expand_as(gru_out) * gru_out, 1)

        c_t = torch.cat([c_local, c_global], 1)
        c_t = self.ct_dropout(c_t)
        
        item_embs = self.emb(torch.arange(self.n_items).to(self.device))
        scores = torch.matmul(c_t, self.b(item_embs).permute(1, 0))
        # scores = self.sf(scores)

        return scores

    def init_hidden(self, batch_size):
        return torch.zeros((self.n_layers, batch_size, self.hidden_size), requires_grad=True).to(self.device)
        


In [6]:
print('Loading data...')
args = Parameters()
train, valid, test = load_data(args.dataset_path, valid_portion=args.valid_portion)

train_data = RecSysDataset(train)
valid_data = RecSysDataset(valid)
test_data = RecSysDataset(test)
train_loader = DataLoader(train_data, batch_size = args.batch_size, shuffle = True, collate_fn = collate_fn)
valid_loader = DataLoader(valid_data, batch_size = args.batch_size, shuffle = False, collate_fn = collate_fn)
test_loader = DataLoader(test_data, batch_size = args.batch_size, shuffle = False, collate_fn = collate_fn)

if args.dataset_path.split('/')[-2] == 'diginetica':
    n_items = 43098
elif args.dataset_path.split('/')[-2] in ['yoochoose1_64', 'yoochoose1_4']:
    n_items = 37484
else:
    raise Exception('Unknown Dataset!')

model = NARM(n_items, args.hidden_size, args.embed_dim, args.batch_size).to(device)

ckpt = torch.load("/content/latest_checkpoint.pth.tar")
model.load_state_dict(ckpt['state_dict'])
recall, mrr = validate(test_loader, model)
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall, args.topk, mrr))


Loading data...
--------------------------------------------------
Dataset info:
Number of sessions: 647523
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 71947
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 60858
--------------------------------------------------


FileNotFoundError: ignored

In [None]:
train_data.data[0]

In [None]:

# dataBefore = "/content/drive/MyDrive/RecSys/FinalProject/Neural-Attentive-Session-Based-Recommendation-PyTorch-master/datasets/train-item-views.csv" #Path to Original Training Dataset "Clicks" File
# dataTestBefore = '/content/drive/MyDrive/RecSys/FinalProject/Neural-Attentive-Session-Based-Recommendation-PyTorch-master/datasets/train-item-views.csv' #Path to Original Testing Dataset "Clicks" File
# dataAfter = '/content/drive/MyDrive/RecSys/FinalProject/Neural-Attentive-Session-Based-Recommendation-PyTorch-master/datasets/Preprocessed_data/' #Path to Processed Dataset Folder
# dayTime = 86400 #Validation Only one day = 86400 seconds


In [8]:
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 10 09:50:45 2019
@author: s-moh
"""
import numpy as np
import pandas as pd
import datetime

# dataBefore = "C:/Users/agotl/Downloads/yoochoose-clicks.dat/yoochoose-clicks.dat" #Path to Original Training Dataset "Clicks" File
# dataTestBefore = 'C:/Users/agotl/Downloads/yoochoose-test.dat/yoochoose-test.dat' #Path to Original Testing Dataset "Clicks" File
# dataAfter = 'C:/Users/agotl/PycharmProjects/Rec_Sys/Preprocessed_data/' #Path to Processed Dataset Folder
# dayTime = 86400 #Validation Only one day = 86400 seconds


dataBefore = "/content/drive/MyDrive/RecSys/FinalProject/Neural-Attentive-Session-Based-Recommendation-PyTorch-master/datasets/train-item-views.csv" #Path to Original Training Dataset "Clicks" File
# dataTestBefore = '/content/drive/MyDrive/RecSys/FinalProject/Neural-Attentive-Session-Based-Recommendation-PyTorch-master/datasets/train-item-views.csv' #Path to Original Testing Dataset "Clicks" File
dataAfter = '/content/drive/MyDrive/RecSys/FinalProject/Neural-Attentive-Session-Based-Recommendation-PyTorch-master/datasets/Preprocessed_data/' #Path to Processed Dataset Folder
dayTime = 86400 #Validation Only one day = 86400 seconds

def removeShortSessions(data):
    #delete sessions of length < 1
    sessionLen = data.groupby('SessionID').size() #group by sessionID and get size of each session
    data = data[np.in1d(data.SessionID, sessionLen[sessionLen > 1].index)]
    return data

#Read Dataset in pandas Dataframe (Ignore Category Column)
train = pd.read_csv(dataBefore, sep=';', header=None, usecols=[0,1,2,3,4]) #, dtype={0:np.int64,1:np.int64, 2:np.int64, 3:str, 4:str})
# test = pd.read_csv(dataTestBefore, sep=';', header=None, usecols=[0,1,2, 3, 4], dtype={0:np.int32, 1:str, 2:np.int64})
train.columns = ['SessionID', 'UserID', 'ItemID', 'Time', 'EventDate'] #Headers of dataframe
# test.columns = ['SessionID', 'UserID', 'ItemID', 'Time', 'EventDate'] #Headers of dataframe
train = train[1:]
# train['Time'] = train.Time.apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp()) #Convert time objects to timestamp
# test['Time'] = test.Time.apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp()) #Convert time objects to timestamp

#remove sessions of less than 2 interactions
# train = removeShortSessions(train)
#delete records of items which appeared less than 5 times
itemLen = train.groupby('ItemID').size() #groupby itemID and get size of each item
train = train[np.in1d(train.ItemID, itemLen[itemLen > 4].index)]
#remove sessions of less than 2 interactions again
train = removeShortSessions(train)

######################################################################################################3
'''
#Separate Data into Train and Test Splits
timeMax = data.Time.max() #maximum time in all records
sessionMaxTime = data.groupby('SessionID').Time.max() #group by sessionID and get the maximum time of each session
sessionTrain = sessionMaxTime[sessionMaxTime < (timeMax - dayTime)].index #training split is all sessions that ended before the last day
sessionTest  = sessionMaxTime[sessionMaxTime >= (timeMax - dayTime)].index #testing split is all sessions has records in the last day
train = data[np.in1d(data.SessionID, sessionTrain)]
test = data[np.in1d(data.SessionID, sessionTest)]
'''
#Delete records in testing split where items are not in training split
# test = test[np.in1d(test.ItemID, train.ItemID)]
#Delete Sessions in testing split which are less than 2
# test = removeShortSessions(test)

#Convert To CSV
#print('Full Training Set has', len(train), 'Events, ', train.SessionID.nunique(), 'Sessions, and', train.ItemID.nunique(), 'Items\n\n')
#train.to_csv(dataAfter + 'recSys15TrainFull.txt', sep='\t', index=False)
print('Testing Set has', len(test), 'Events, ', test.SessionID.nunique(), 'Sessions, and', test.ItemID.nunique(), 'Items\n\n')
# test.to_csv(dataAfter + 'recSys15Test.txt', sep=',', index=False)

######################################################################################################3


  exec(code_obj, self.user_global_ns, self.user_ns)


KeyboardInterrupt: ignored

In [None]:
!pip install swifter 
import swifter 

Collecting swifter
  Downloading swifter-1.1.2.tar.gz (633 kB)
[?25l[K     |▌                               | 10 kB 30.4 MB/s eta 0:00:01[K     |█                               | 20 kB 29.9 MB/s eta 0:00:01[K     |█▌                              | 30 kB 14.4 MB/s eta 0:00:01[K     |██                              | 40 kB 11.0 MB/s eta 0:00:01[K     |██▋                             | 51 kB 9.9 MB/s eta 0:00:01[K     |███                             | 61 kB 10.6 MB/s eta 0:00:01[K     |███▋                            | 71 kB 9.5 MB/s eta 0:00:01[K     |████▏                           | 81 kB 10.5 MB/s eta 0:00:01[K     |████▋                           | 92 kB 9.4 MB/s eta 0:00:01[K     |█████▏                          | 102 kB 9.4 MB/s eta 0:00:01[K     |█████▊                          | 112 kB 9.4 MB/s eta 0:00:01[K     |██████▏                         | 122 kB 9.4 MB/s eta 0:00:01[K     |██████▊                         | 133 kB 9.4 MB/s eta 0:00:01[K     |█

In [None]:
train['Time'] = train['Time'].astype(int)
train['Time'] = train.swifter.apply(lambda r: (datetime.datetime.strptime(r['EventDate'], "%Y-%m-%d") + datetime.timedelta(milliseconds=r['Time'])).timestamp(), axis=1)
print('Testing Set has', len(test), 'Events, ', test.SessionID.nunique(), 'Sessions, and', test.ItemID.nunique(), 'Items\n\n')
# test.to_csv(dataAfter + 'recSys15Test.txt', sep=',', index=False)

######################################################################################################3
#Separate Training set into Train and Validation Splits
timeMax = train.Time.max()
sessionMaxTime = train.groupby('SessionID').Time.max()
sessionTrain = sessionMaxTime[sessionMaxTime < (timeMax - dayTime)].index #training split is all sessions that ended before the last 2nd day
sessionValid = sessionMaxTime[sessionMaxTime >= (timeMax - dayTime)].index #validation split is all sessions that ended during the last 2nd day
trainTR = train[np.in1d(train.SessionID, sessionTrain)]
trainVD = train[np.in1d(train.SessionID, sessionValid)]
#Delete records in validation split where items are not in training split
trainVD = trainVD[np.in1d(trainVD.ItemID, trainTR.ItemID)]
#Delete Sessions in testing split which are less than 2
trainVD = removeShortSessions(trainVD)
#Convert To CSV
print('Training Set has', len(trainTR), 'Events, ', trainTR.SessionID.nunique(), 'Sessions, and', trainTR.ItemID.nunique(), 'Items\n\n')
trainTR.to_csv(dataAfter + 'recSys15TrainOnly.txt', sep=',', index=False)
print('Validation Set has', len(trainVD), 'Events, ', trainVD.SessionID.nunique(), 'Sessions, and', trainVD.ItemID.nunique(), 'Items\n\n')
trainVD.to_csv(dataAfter + 'recSys15Valid.txt', sep=',', index=False)

Pandas Apply:   0%|          | 0/949540 [00:00<?, ?it/s]

Testing Set has 1235381 Events,  310325 Sessions, and 156813 Items


Training Set has 939090 Events,  197448 Sessions, and 48305 Items


Validation Set has 10443 Events,  2175 Sessions, and 6300 Items


