## Settings

In [1]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append('..')

# Main

In [2]:
import json
from collections import defaultdict
import torch

## Load Data

In [None]:
import pandas as pd

In [None]:
# Load the raw click log. The file carries no header row, so the four column
# names are supplied here and only the first three columns are kept.
clicks = pd.read_csv(
    '../data/raw/yoochoose-clicks.dat',
    header=None,
    names=['sess', 'ts', 'item', 'cat'],
    usecols=['sess', 'ts', 'item'],
    dtype={'cat': str},
)
clicks.head()

## Preprocess

In [None]:
from datetime import datetime as dt, timedelta as td

In [None]:
# Parse the timestamps in one vectorised call instead of a Python-level
# strptime per row. Only the first 19 characters (up to whole seconds) are kept,
# exactly as before; whatever follows them in the raw string is discarded.
clicks['ts'] = pd.to_datetime(clicks['ts'].str[:19], format='%Y-%m-%dT%H:%M:%S')
clicks.head()

In [None]:
# Cut-off one day before the latest click; sessions are later compared against it
# to decide between the train and test dictionaries.
# Series.max() is used rather than the builtin max(), which would iterate the
# column element by element in Python.
splitdate = clicks['ts'].max() - td(days=1)
# Number of clicks per item over the whole log (used to drop rarely seen items).
item_count = clicks['item'].value_counts()

In [None]:
# Keep a session only if it has more than one click and every item in it
# occurs at least five times in the whole log.
remain_sess = []
remain_item = set()
for _, session in clicks.groupby('sess', sort=False):
    sess_id = session.iat[0, 0]
    print(sess_id, end='\r')  # progress indicator, overwritten in place
    session_items = session['item'].tolist()
    keep = len(session_items) > 1 and all(
        item_count[clicked] >= 5 for clicked in session_items
    )
    if keep:
        # (session id as string, timestamp of the first row, item sequence)
        remain_sess.append((str(sess_id), session.iat[0, 1], session_items))
        remain_item.update(session_items)
# Persist the number of distinct items that survived the filter.
with open('../data/interim/n_items.json', 'w') as f:
    json.dump(len(remain_item), f)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the set of kept item ids into a list and fit an encoder on it; the
# encoder is used further down to transform each session's item sequence
# before the samples are written to JSON.
remain_item = list(remain_item)
item_enc = LabelEncoder()
item_enc.fit(remain_item)

In [None]:
# Expand every kept session into (prefix, next item) samples and route the
# session to the train or test dictionary depending on its timestamp.
train_d = defaultdict(list)
test_d = defaultdict(list)
for sess, ts, items in remain_sess:
    print(sess, end='\r')  # progress indicator, overwritten in place
    encoded = item_enc.transform(items).tolist()
    bucket = train_d if ts < splitdate else test_d
    for cut in range(1, len(encoded)):
        # input: the first `cut` items, target: the item right after them
        bucket[sess].append((encoded[:cut], encoded[cut]))
with open('../data/interim/train.json', 'w') as f:
    json.dump(train_d, f)
with open('../data/interim/test.json', 'w') as f:
    json.dump(test_d, f)

## Prepare Input Data

In [3]:
from torch_geometric.data import Data, Dataset, DataLoader

In [21]:
class YooChooseDataset(Dataset):
    """Serves (session prefix, next item) samples as small session graphs.

    ``d`` maps a session id to a list of ``(x_ids, y)`` pairs, where ``x_ids``
    is a non-empty list of item ids and ``y`` is the item that follows them.
    """

    def __init__(self, d):
        super(YooChooseDataset, self).__init__()
        self.samples = self.add_from_dict(d)

    def add_from_dict(self, d):
        """Flatten the per-session sample lists of ``d`` into a single list."""
        return [sample for session_samples in d.values() for sample in session_samples]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Build the graph for sample ``idx``.

        Nodes are the distinct items of the prefix, with the last clicked item
        always stored as the final node. An edge is added for every pair of
        consecutive clicks; its weight is the transition count divided by the
        total number of transitions leaving the source node.

        Returns a ``Data`` object with ``x`` (one item id per node),
        ``edge_index`` of shape ``(2, n_edges)``, ``edge_weights`` and ``y``.
        """
        x_ids, y = self.samples[idx]
        last_item = x_ids[-1]
        node_ids = set(x_ids)
        node_ids.remove(last_item)
        node_ids = list(node_ids) + [last_item]  # last click is always the last node
        node_pos = {item_id: pos for pos, item_id in enumerate(node_ids)}

        # TODO: probably should precompute and store all of these; first check
        # whether this takes long compared to the forward-backward propagation step.
        transitions = defaultdict(lambda: defaultdict(int))
        for src_item, dst_item in zip(x_ids, x_ids[1:]):
            transitions[node_pos[src_item]][node_pos[dst_item]] += 1

        edge_index, edge_weights = [], []
        for src, targets in transitions.items():
            out_total = sum(targets.values())
            for dst, count in targets.items():
                edge_index.append([src, dst])
                edge_weights.append(count / out_total)

        x = torch.tensor([[item_id] for item_id in node_ids], dtype=torch.long)
        if edge_index:
            edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        else:
            # A one-item prefix has no transitions. Keep the (2, 0) layout so the
            # edge tensor has the same rank as for every other sample.
            edge_index = torch.empty((2, 0), dtype=torch.long)
        edge_weights = torch.tensor(edge_weights)
        return Data(x, edge_index=edge_index, edge_weights=edge_weights, y=y)

In [5]:
def _read_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Reload what the preprocessing cells wrote to ../data/interim.
n_items = _read_json('../data/interim/n_items.json')
train_d = _read_json('../data/interim/train.json')
test_d = _read_json('../data/interim/test.json')
summary = '# items: {}\n# train sessions: {}\n# test sessions: {}'
print(summary.format(n_items, len(train_d), len(test_d)))

# items: 37821
# train sessions: 7957313
# test sessions: 15270


In [None]:
# train, validation split

In [22]:
# TODO: do the split and build all of train_loader, val_loader and test_loader.
test_dataset = YooChooseDataset(test_d)
test_loader = DataLoader(
    test_dataset,
    batch_size=2 ** 14,
    shuffle=True,
)

## Construct Model

In [23]:
from torch import nn
from torch_geometric.nn import GatedGraphConv

class Attention(nn.Module):
    """Attention readout that turns node embeddings into one vector per graph.

    For every graph the last node acts as the query. Each node receives a gate
    ``q(sigmoid(wq(last) + wk(node)))``, the gated nodes are summed per graph,
    and the sum is concatenated with the last node and projected by ``w``.

    NOTE(review): ``q`` maps to ``embed_dim`` outputs, so the gate is per
    feature, whereas ``Embedding2Score`` in this notebook uses a single scalar
    per node (``nn.Linear(hidden_size, 1)``) — confirm which one is intended.
    """

    def __init__(self, embed_dim):
        super(Attention, self).__init__()
        self.wq = nn.Linear(embed_dim, embed_dim)
        self.wk = nn.Linear(embed_dim, embed_dim)
        self.sigmoid = nn.Sigmoid()
        self.q = nn.Linear(embed_dim, embed_dim)
        self.w = nn.Linear(2 * embed_dim, embed_dim)

    def forward(self, x, batch):
        """Compute one embedding per graph.

        Args:
            x: node embeddings, one row per node, ``embed_dim`` columns.
            batch: graph id of every row of ``x``. Rows of the same graph must
                be contiguous and ordered by graph id, because the split below
                cuts ``x`` into consecutive chunks of ``bincount(batch)`` rows.

        Returns:
            Tensor with one row per graph and ``embed_dim`` columns.
        """
        sections = torch.bincount(batch).tolist()
        x_split = torch.split(x, sections)
        # Last node of every graph, once as a (n_graphs, dim) matrix ...
        last_nodes = torch.cat([x_[-1].view(1, -1) for x_ in x_split])
        # ... and once repeated so that it lines up row by row with x.
        q = torch.cat([x_[-1].view(1, -1).repeat(x_.shape[0], 1) for x_ in x_split])
        alpha = self.q(self.sigmoid(self.wq(q) + self.wk(x)))
        ax_split = torch.split(alpha * x, sections)
        # Sum over the nodes of each graph only (dim=0); a full reduction would
        # collapse the feature axis as well and break the concat below.
        sg = torch.cat([torch.sum(ax_, dim=0).view(1, -1) for ax_ in ax_split])
        return self.w(torch.cat((last_nodes, sg), 1))
    
class PredProb(nn.Module):
    """Unfinished placeholder module.

    The notebook only contained the class header and an incomplete ``__init__``
    signature, which was a syntax error that prevented the whole cell from
    running. No layers or forward pass are defined yet.
    """

    def __init__(self):
        super(PredProb, self).__init__()

class SRGNN(nn.Module):
    """Item embedding, one gated graph convolution and the attention readout.

    TODO: probably need to get the index of the last clicked item from
    YooChooseDataset.
    """

    def __init__(self, n_items, embed_dim):
        super(SRGNN, self).__init__()
        self.embedding = nn.Embedding(n_items, embed_dim)
        self.gatedgconv = GatedGraphConv(embed_dim, 1)
        self.relu = nn.ReLU()
        self.attention = Attention(embed_dim)

    def _initialize_weights(self, ):
        # Not implemented yet; modules keep their default initialisation.
        pass

    def forward(self, data):
        """Return what ``Attention`` produces for the graphs batched in ``data``.

        ``data`` must expose ``x``, ``edge_index``, ``edge_weights`` and
        ``batch``; ``x`` holds one item id per node in a trailing axis of
        length one, as built by ``YooChooseDataset.__getitem__``.
        """
        x, edge_index, edge_weights, batch = (
            data.x, data.edge_index, data.edge_weights, data.batch)
        # Drop only the length-one id axis; a bare squeeze() would also remove
        # the node axis when the batch holds a single node.
        x = self.embedding(x).squeeze(1)
        x = self.gatedgconv(x, edge_index, edge_weights)
        x = self.relu(x)
        return self.attention(x, batch)

In [7]:
from torch import nn
from torch_geometric.nn import GatedGraphConv

class Embedding2Score(nn.Module):
    """Scores every item for every session graph from its node embeddings."""

    def __init__(self, hidden_size):
        super(Embedding2Score, self).__init__()
        self.hidden_size = hidden_size
        self.W_1 = nn.Linear(hidden_size, hidden_size)
        self.W_2 = nn.Linear(hidden_size, hidden_size)
        self.q = nn.Linear(hidden_size, 1)
        self.W_3 = nn.Linear(2 * hidden_size, hidden_size)

    def forward(self, session_embedding, all_item_embedding, batch):
        """Return one row of item scores per graph.

        Args:
            session_embedding: node embeddings of all graphs stacked row-wise.
            all_item_embedding: embedding module whose ``weight`` holds one row
                per item.
            batch: graph id of every row of ``session_embedding``.
        """
        sizes = torch.bincount(batch).tolist()
        graphs = torch.split(session_embedding, sizes)  # node rows of each graph G_i
        last_nodes = [nodes[-1].view(1, -1) for nodes in graphs]
        # Last node embedding repeated once per node of its graph.
        last_repeated = torch.cat(
            [last.repeat(nodes.shape[0], 1) for last, nodes in zip(last_nodes, graphs)],
            dim=0,
        )

        # Eq(6): one scalar weight per node, then a weighted sum per graph.
        alpha = self.q(torch.sigmoid(self.W_1(last_repeated) + self.W_2(session_embedding)))
        weighted = alpha * session_embedding
        s_g = torch.cat(
            [chunk.sum(dim=0).view(1, -1) for chunk in torch.split(weighted, sizes)],
            dim=0,
        )

        # Eq(7): combine the last node with the pooled graph embedding.
        v_n = torch.cat(last_nodes, dim=0)
        s_h = self.W_3(torch.cat((v_n, s_g), dim=1))

        # Eq(8): dot product with every item embedding.
        return torch.mm(s_h, all_item_embedding.weight.transpose(1, 0))

In [11]:
# Scratch check: push test sample 46 through a fresh embedding and one gated
# graph convolution (edge weights are not passed here).
eeee = nn.Embedding(n_items, 8)
gggg = GatedGraphConv(8, 1)
ee2ss = Embedding2Score(8)
sample_46 = test_dataset[46]
node_vectors = eeee(sample_46.x).squeeze()
xxxx = gggg(node_vectors, sample_46.edge_index)
xxxx

tensor([[ 0.2807, -0.1433,  0.1594, -0.5333,  0.3918,  0.5824, -0.0843,  0.2851],
        [-0.2402,  0.1186, -0.4569,  0.7095, -0.5262, -0.7287, -0.0643,  0.1135],
        [-0.6767, -1.4324, -0.0564,  0.1371,  0.1189, -0.7609, -0.2928,  0.9821],
        [ 0.4706, -0.9954,  0.1051,  0.0157,  0.2814,  0.2988,  0.2134, -0.9702],
        [ 0.0987,  0.6412, -1.1929,  0.1105,  0.3293,  1.0259,  0.3699,  0.6783],
        [ 0.0304,  0.3570, -0.1339,  1.2353,  0.3167,  0.3608, -0.3312, -1.0508]],
       grad_fn=<AddBackward0>)

In [17]:
# All rows of xxxx belong to one graph, so every node gets graph id 0. The
# length is taken from xxxx rather than hard-coded to six nodes.
ee2ss(xxxx, eeee, batch=torch.zeros(xxxx.shape[0], dtype=torch.long))

tensor([[ 1.5191, -1.6180, -2.1014,  ..., -0.5146,  0.8934,  0.5470]],
       grad_fn=<MmBackward>)

In [None]:
# Show every field of test sample 46, building the sample only once.
sample = test_dataset[46]
for field in (sample.x, sample.edge_index, sample.edge_weights, sample.y):
    print(field)

In [39]:
# Scratch check: torch.split with a tuple of sizes cuts the tensor into
# consecutive chunks of 3, 2 and 4 elements.
torch.split(torch.tensor([1, 2, 3, 1, 2, 1, 2, 3, 4]), (3, 2, 4))

(tensor([1, 2, 3]), tensor([1, 2]), tensor([1, 2, 3, 4]))

In [40]:
# Scratch check: a parenthesised comprehension is a generator object, not a tuple.
(i for i in range(3))

<generator object <genexpr> at 0x7f4ceda780f8>

In [None]:
"""
from sklearn.preprocessing import OneHotEncoder

remain_item = item_enc.transform(remain_item).tolist()
item_enc = OneHotEncoder(sparse=False)
item_enc.fit([[item] for item in remain_item])
with open('../data/interim/onehotencoder.pkl', 'wb') as f:
    pickle.dump(item_enc, f)
"""

In [None]:
import csv

# Peek at the first five records of the raw click log. The file has no header
# row (it is read with header=None above), so the column names are passed
# explicitly; otherwise DictReader would consume the first record as field names.
with open('../data/raw/yoochoose-clicks.dat', 'r', newline='') as f:
    reader = csv.DictReader(f, fieldnames=['sess', 'ts', 'item', 'cat'], delimiter=',')
    for i, data in enumerate(reader):
        print(data)
        if i == 4:
            break

In [None]:
# NOTE(review): `buys` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — presumably left over from a deleted
# cell; confirm and either restore the loading code or drop this cell.
buys.info()