## Settings

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys

# Put the parent directory on the module search path.
# NOTE(review): presumably so project-local modules one level up can be
# imported from this notebook — confirm.
sys.path.append('..')

# Main

In [None]:
import json
from collections import defaultdict

import torch
from torch import nn
from ignite.metrics import TopKCategoricalAccuracy, Loss

In [None]:
# --- Run configuration -------------------------------------------------------
batch_size = 1 << 13  # 8192 samples per mini-batch

# The optimizer class is stored un-instantiated; it is constructed later, once
# the model parameters exist.
loss_fn = nn.CrossEntropyLoss()
opt_ = torch.optim.Adam
lr = 3e-5

# Metrics handed to the evaluator; the loss metric reuses the training criterion.
val_metrics = dict([
    ('top-10 acc', TopKCategoricalAccuracy(10)),
    ('loss', Loss(loss_fn)),
])

device = 'cuda:1'
max_epochs = 1000

## Load Data

In [None]:
import pandas as pd

In [None]:
# Read the raw click log. No header row is expected (header=None), so the
# column names are supplied here; only the session id, timestamp and item id
# columns are loaded.
clicks = pd.read_csv(
    '../data/raw/yoochoose-clicks.dat',
    header=None,
    names=['sess', 'ts', 'item', 'cat'],
    usecols=['sess', 'ts', 'item'],
    dtype={'cat': str},
)
clicks.head()

## Preprocess

In [None]:
from datetime import datetime as dt, timedelta as td

In [None]:
# Parse the timestamp strings into datetimes. Only the first 19 characters
# ('YYYY-MM-DDTHH:MM:SS') are used; anything after the seconds is dropped.
# The conversion is vectorised with ``pd.to_datetime`` instead of calling
# ``strptime`` once per row, and it is skipped when the column already has a
# datetime dtype (kind 'M') so that re-running this cell is harmless.
if clicks['ts'].dtype.kind != 'M':
    clicks['ts'] = pd.to_datetime(clicks['ts'].str[:19], format='%Y-%m-%dT%H:%M:%S')
clicks.head()

In [None]:
# Cut-off one day before the latest click in the log. ``Series.max`` is used
# rather than the builtin ``max`` so the reduction does not iterate over every
# row in Python.
splitdate = clicks['ts'].max() - td(days=1)
# Number of clicks recorded for each item id across the whole log.
item_count = clicks['item'].value_counts()

In [None]:
# Keep only sessions that contain more than one click and whose every item was
# clicked at least 5 times overall; collect the ids of the surviving items.
remain_sess = []
remain_item = set()
for _, sess_df in clicks.groupby('sess', sort=False):
    print(sess_df.iat[0, 0], end='\r')
    item_ids = sess_df['item'].tolist()
    too_short = len(item_ids) <= 1
    if too_short or any(item_count[item] < 5 for item in item_ids):
        continue
    # Session record: (session id as text, value of column 1 in its first row,
    # list of item ids in row order).
    remain_sess.append((str(sess_df.iat[0, 0]), sess_df.iat[0, 1], item_ids))
    remain_item.update(item_ids)
# Persist the number of distinct remaining items.
with open('../data/interim/n_items.json', 'w') as f:
    json.dump(len(remain_item), f)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Freeze the surviving item ids into a list and fit a label encoder on them.
remain_item = [*remain_item]
item_enc = LabelEncoder().fit(remain_item)
item_enc

In [None]:
# Expand every kept session into (prefix, next item) pairs and route them to
# the train or test dictionary depending on whether the session timestamp is
# before ``splitdate``. Both dictionaries are then written out as JSON.
train_d = defaultdict(list)
test_d = defaultdict(list)
for sess, ts, items in remain_sess:
    print(sess, end='\r')
    encoded = item_enc.transform(items).tolist()
    target = train_d if ts < splitdate else test_d
    for cut in range(1, len(encoded)):
        # Input is the first `cut` items; the label is the item that follows.
        target[sess].append((encoded[:cut], encoded[cut]))
for path, split in (('../data/interim/train.json', train_d),
                    ('../data/interim/test.json', test_d)):
    with open(path, 'w') as f:
        json.dump(split, f)

## Prepare Input Data

In [None]:
from torch_geometric.data import Data, Dataset, DataLoader

In [None]:
class YooChooseDataset(Dataset):
    """Session-graph dataset built from ``{session id: [(prefix, label), ...]}``.

    Every ``(prefix, label)`` pair is converted once, at construction time,
    into a ``Data`` graph plus its label, and stored in ``self.samples``.

    NOTE(review): building all graphs up front trades memory for per-item
    speed. Whether graph construction is slow relative to the forward/backward
    step has not been measured yet.
    """

    def __init__(self, d):
        """Build all samples from ``d``.

        Args:
            d: mapping whose values are lists of ``(x_ids, y)`` pairs, where
                ``x_ids`` is a non-empty sequence of item ids and ``y`` is the
                id of the item to predict.
        """
        super(YooChooseDataset, self).__init__()
        self.samples = self.add_from_dict(d)

    @staticmethod
    def _build_graph(x_ids):
        """Turn one click sequence into a weighted, directed session graph.

        Nodes are the distinct items of ``x_ids``; the most recent click is
        always placed last. Each distinct transition between consecutive
        clicks becomes one edge whose weight is its share of the source
        node's outgoing transitions.
        """
        last = x_ids[-1]
        node_set = set(x_ids)
        node_set.remove(last)
        nodes = list(node_set) + [last]
        position = {item: i for i, item in enumerate(nodes)}

        # Count transitions between consecutive clicks, keyed by node position.
        transitions = defaultdict(lambda: defaultdict(int))
        for src, dst in zip(x_ids[:-1], x_ids[1:]):
            transitions[position[src]][position[dst]] += 1

        edge_pairs, edge_weights = [], []
        for src, outgoing in transitions.items():
            total = sum(outgoing.values())
            for dst, count in outgoing.items():
                edge_pairs.append([src, dst])
                edge_weights.append(count / total)

        x = torch.tensor([[item] for item in nodes], dtype=torch.long)
        if edge_pairs:
            edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
        else:
            # A single-click prefix has no transitions. Keep the (2, 0) layout
            # rather than the 1-D empty tensor an empty list would produce.
            edge_index = torch.empty((2, 0), dtype=torch.long)
        edge_weights = torch.tensor(edge_weights)
        return Data(x, edge_index=edge_index, edge_weights=edge_weights)

    def add_from_dict(self, d):
        """Convert every ``(x_ids, y)`` pair in ``d`` into a ``(graph, y)`` tuple.

        Returns:
            list of ``(Data, y)`` tuples, in the iteration order of ``d``.
        """
        samples = []
        for pairs in d.values():
            for x_ids, y in pairs:
                samples.append((self._build_graph(x_ids), y))
        return samples

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{'graph': Data, 'label': y}``."""
        graph, label = self.samples[idx]
        return {
                'graph': graph,
                'label': label
                }

In [None]:
def _load_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path, 'r') as f:
        return json.load(f)


# Read back the item count and the train/test session dictionaries stored
# under ../data/interim.
n_items = _load_json('../data/interim/n_items.json')
train_d = _load_json('../data/interim/train.json')
test_d = _load_json('../data/interim/test.json')
summary = '# items: {}\n# train sessions: {}\n# test sessions: {}'
print(summary.format(n_items, len(train_d), len(test_d)))

In [None]:
from math import floor
from torch.utils.data import random_split

# Split the training samples 90/10 at random into train and validation sets,
# build the test set, and wrap all three in loaders with identical settings.
full_train = YooChooseDataset(train_d)
n_train = floor(0.9 * len(full_train))
train_dataset, val_dataset = random_split(
        full_train, [n_train, len(full_train) - n_train])
test_dataset = YooChooseDataset(test_d)
sizes = (len(train_dataset), len(val_dataset), len(test_dataset))
print('# train samples: {}\n# val samples: {}\n# test samples: {}'
        .format(*sizes))

loader_kwargs = dict(batch_size=batch_size, shuffle=True, num_workers=24)
train_loader = DataLoader(train_dataset, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

## Construct Model

In [None]:
from torch_geometric.nn import GatedGraphConv

class Attention(nn.Module):
    """Soft-attention readout that turns node states into one vector per graph.

    For each graph the last node is used as the query. Every node ``i`` gets a
    scalar score ``q(sigmoid(wq(last) + wk(x_i)))``; the score-weighted sum of
    node states is concatenated with the last node's state and projected back
    to ``embed_dim`` by ``w``.
    """

    def __init__(self, embed_dim):
        """Create the projection layers for ``embed_dim``-sized node states."""
        super(Attention, self).__init__()
        self.wq = nn.Linear(embed_dim, embed_dim)
        self.wk = nn.Linear(embed_dim, embed_dim)
        self.sigmoid = nn.Sigmoid()
        self.q = nn.Linear(embed_dim, 1)
        self.w = nn.Linear(2 * embed_dim, embed_dim)

    def forward(self, x, batch):
        """Compute one representation per graph.

        Args:
            x: node states of shape ``(num_nodes, embed_dim)``.
            batch: graph index of every node, shape ``(num_nodes,)``.
                Assumes nodes of the same graph are contiguous and every graph
                has at least one node — TODO confirm against the data loader.

        Returns:
            Tensor of shape ``(num_graphs, embed_dim)``.
        """
        # Number of nodes in each graph, as plain ints for ``torch.split``.
        sections = torch.bincount(batch).tolist()
        x_split = torch.split(x, sections)
        # State of the last node of every graph, one row per graph.
        last = torch.cat([x_[-1].view(1, -1) for x_ in x_split])
        # The same state repeated once per node of its graph, aligned with ``x``.
        q = torch.cat([x_[-1].view(1, -1).repeat(x_.shape[0], 1) for x_ in x_split])
        q = self.wq(q)
        k = self.wk(x)
        # The sigmoid layer was created in __init__ but never applied before;
        # without it the score is a purely linear function of the inputs.
        a = self.q(self.sigmoid(q + k))
        ax = a * x
        ax_split = torch.split(ax, sections)
        pooled = torch.cat([torch.sum(ax_, 0).view(1, -1) for ax_ in ax_split])
        return self.w(torch.cat((last, pooled), 1))
    
class PredProb(nn.Module):
    """Score session representations against every row of an embedding table."""

    def __init__(self):
        super().__init__()

    def forward(self, sh, embedding):
        """Return the matrix product of ``sh`` with the transposed embedding weights.

        Args:
            sh: 2-D tensor with one row per session.
            embedding: module exposing a 2-D ``weight`` tensor.

        Returns:
            2-D tensor with one row per session and one column per embedding row.
        """
        item_vectors = embedding.weight
        return torch.mm(sh, item_vectors.t())

class SRGNN(nn.Module):
    """Session-graph network: embedding -> gated graph conv -> attention -> item scores."""

    def __init__(self, n_items, embed_dim):
        """Create the layers.

        Args:
            n_items: number of rows in the item embedding table.
            embed_dim: size of every item embedding / node state.
        """
        super(SRGNN, self).__init__()
        self.embedding = nn.Embedding(n_items, embed_dim)
        self.gatedgconv = GatedGraphConv(embed_dim, 1)
        self.relu = nn.ReLU()
        self.attention = Attention(embed_dim)
        self.predprob = PredProb()

    def _initialize_weights(self, ):
        """Placeholder; currently does nothing and is not called by this class."""
        pass

    def forward(self, data):
        """Return one score per item for every graph in ``data``.

        Args:
            data: batched graph object exposing ``x``, ``edge_index``,
                ``edge_weights`` and ``batch``. Assumes ``x`` has shape
                ``(num_nodes, 1)`` as produced by the dataset in this
                notebook — TODO confirm for other callers.
        """
        x, edge_index, edge_weights, batch =\
                data.x, data.edge_index, data.edge_weights, data.batch
        # Drop only the singleton id column. A bare ``squeeze()`` would also
        # collapse the node dimension when the batch holds a single node.
        x = self.embedding(x).squeeze(1)
        x = self.gatedgconv(x, edge_index, edge_weights)
        x = self.relu(x)
        x = self.attention(x, batch)
        x = self.predprob(x, self.embedding)
        return x

In [None]:
# Instantiate the network with 128-dimensional embeddings and print each
# direct sub-module as a (name, module) pair.
model = SRGNN(n_items, embed_dim=128)
for name_and_module in model.named_children():
    print(name_and_module)

## Train

In [None]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator

def train_net(net, opt, loss_fn, val_metrics, train_loader, val_loader, device):
    """Build an ignite trainer that logs train and validation metrics every epoch.

    Args:
        net: model to train; moved to ``device`` in place.
        opt: optimizer already bound to ``net``'s parameters.
        loss_fn: criterion passed to the supervised trainer.
        val_metrics: mapping of metric name -> ignite metric, evaluated on both
            loaders after every epoch.
        train_loader: loader yielding dict batches with 'graph' and 'label' entries.
        val_loader: loader of the same form used for validation.
        device: device identifier the model and batches are moved to.

    Returns:
        The configured trainer engine; the caller starts it with ``trainer.run``.
    """
    net.to(device)

    def prepare_batch(batch, device, non_blocking=False):
        # Read the entries by key instead of relying on dict value ordering.
        return batch['graph'].to(device), batch['label'].to(device)

    def output_transform(x, y, y_pred, loss):
        # Trainer output: (index of the highest-scoring class, target).
        return (y_pred.max(1)[1], y)

    trainer = create_supervised_trainer(net, opt, loss_fn, device,
            prepare_batch=prepare_batch, output_transform=output_transform)
    evaluator = create_supervised_evaluator(net, val_metrics, device,
            prepare_batch=prepare_batch)
    s = '{}: {:.2f} '

    def evaluate(loader, prefix):
        """Run the evaluator on ``loader`` and format all metrics on one line."""
        evaluator.run(loader)
        message = prefix
        for m in val_metrics.keys():
            message += s.format(m, evaluator.state.metrics[m])
        return message

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        # The loop here previously unpacked each metric name into two
        # variables and formatted an undefined one, failing on the first epoch.
        message = evaluate(train_loader, 'Train - ')
        print('Epoch {}'.format(trainer.state.epoch))
        print(message)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        print(evaluate(val_loader, 'Val   - '))

    return trainer

In [None]:
# Build the optimizer over the model parameters, assemble the trainer and
# start training for the configured number of epochs.
opt = opt_(model.parameters(), lr)

trainer = train_net(
        net=model, opt=opt, loss_fn=loss_fn, val_metrics=val_metrics,
        train_loader=train_loader, val_loader=val_loader, device=device)
trainer.run(train_loader, max_epochs=max_epochs)

In [None]:
# Scratch check: an 8-dimensional embedding followed by a GatedGraphConv layer
# applied to one test sample.
# NOTE(review): `Embedding2Score` is not defined or imported in the visible
# part of this notebook, and indexing the dataset classes shown here returns a
# dict (so `.x` / `.edge_index` would not resolve) — this cell looks stale;
# confirm before running.
eeee = nn.Embedding(n_items, 8)
gggg = GatedGraphConv(8, 1)
ee2ss = Embedding2Score(8)
xxxx = gggg(eeee(test_dataset[46].x).squeeze(), test_dataset[46].edge_index)
xxxx

In [None]:
# Scratch check: apply `ee2ss` to the node states `xxxx`, passing a batch
# vector that assigns six nodes to graph 0.
# NOTE(review): relies on `ee2ss`, `xxxx` and `eeee` already existing in the
# kernel; presumably sample 46 has six nodes — confirm.
ee2ss(xxxx, eeee, batch=torch.tensor([0, 0, 0, 0, 0, 0]))

In [None]:
# Inspect one test sample. Indexing the dataset yields a dict with 'graph' and
# 'label' entries, so the tensors are read from the graph and the target from
# the label rather than as attributes of the sample itself.
idx = 50
sample = test_dataset[idx]
print(sample['graph'].x)
print(sample['graph'].edge_index)
print(sample['graph'].edge_weights)
print(sample['label'])

In [None]:
class YooChooseDataset(Dataset):
    """Session-graph dataset that stores raw pairs and builds each graph on access.

    NOTE(review): a class with this name is also defined earlier in the
    notebook; this later definition replaces it for any code run afterwards.
    """

    def __init__(self, d):
        """Flatten ``d`` (``{session id: [(x_ids, y), ...]}``) into ``self.samples``."""
        super(YooChooseDataset, self).__init__()
        self.samples = self.add_from_dict(d)

    def add_from_dict(self, d):
        """Return every ``(x_ids, y)`` pair of ``d`` as one flat list."""
        return [pair for pairs in d.values() for pair in pairs]

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Build and return sample ``idx`` as ``{'graph': Data, 'label': y}``.

        Nodes are the distinct items of the click sequence, with the most
        recent click placed last. Each distinct transition between consecutive
        clicks becomes one edge, weighted by its share of the source node's
        outgoing transitions.
        """
        x_ids, y = self.samples[idx]
        last = x_ids[-1]
        node_set = set(x_ids)
        node_set.remove(last)
        nodes = list(node_set) + [last]
        position = {item: i for i, item in enumerate(nodes)}

        # Should these all be stored in advance? First check whether building
        # them takes long compared with the forward/backward propagation step.
        transitions = defaultdict(lambda: defaultdict(int))
        for src, dst in zip(x_ids[:-1], x_ids[1:]):
            transitions[position[src]][position[dst]] += 1

        edge_pairs, edge_weights = [], []
        for src, outgoing in transitions.items():
            total = sum(outgoing.values())
            for dst, count in outgoing.items():
                edge_pairs.append([src, dst])
                edge_weights.append(count / total)

        x = torch.tensor([[item] for item in nodes], dtype=torch.long)
        if edge_pairs:
            edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
        else:
            # A single-click prefix has no transitions. Keep the (2, 0) layout
            # rather than the 1-D empty tensor an empty list would produce.
            edge_index = torch.empty((2, 0), dtype=torch.long)
        edge_weights = torch.tensor(edge_weights)
        return {
                'graph': Data(x, edge_index=edge_index, edge_weights=edge_weights),
                'label': y
                }

In [None]:
"""
from sklearn.preprocessing import OneHotEncoder

remain_item = item_enc.transform(remain_item).tolist()
item_enc = OneHotEncoder(sparse=False)
item_enc.fit([[item] for item in remain_item])
with open('../data/interim/onehotencoder.pkl', 'wb') as f:
    pickle.dump(item_enc, f)
"""

In [None]:
import csv
# Print at most five records from the raw click file.
# No field names are passed, so DictReader takes them from the file's first line.
with open('../data/raw/yoochoose-clicks.dat', 'r') as f:
    reader = csv.DictReader(f, delimiter=',')
    for i, data in zip(range(5), reader):
        print(data)

In [None]:
# NOTE(review): `buys` is not defined anywhere in the visible part of this
# notebook, so this cell cannot run as-is — presumably a DataFrame loaded in a
# since-removed cell; confirm or drop.
buys.info()