## Settings

In [1]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append('..')

# Main

In [41]:
import torch

## Load Data

In [2]:
import pandas as pd
import json
import pickle

In [3]:
clicks = pd.read_csv('../data/raw/yoochoose-clicks.dat',
        names=['sess', 'ts', 'item', 'cat'],  dtype={'cat': str},
        usecols=['sess', 'ts', 'item'], header=None)
clicks.head()

Unnamed: 0,sess,ts,item
0,1,2014-04-07T10:51:09.277Z,214536502
1,1,2014-04-07T10:54:09.868Z,214536500
2,1,2014-04-07T10:54:46.998Z,214536506
3,1,2014-04-07T10:57:00.306Z,214577561
4,2,2014-04-07T13:56:37.614Z,214662742


## Preprocess

In [4]:
from datetime import datetime as dt, timedelta as td
from collections import defaultdict

In [5]:
clicks['ts'] = clicks['ts'].apply(lambda s: dt.strptime(s[:19], '%Y-%m-%dT%H:%M:%S'))
clicks.head()

Unnamed: 0,sess,ts,item
0,1,2014-04-07 10:51:09,214536502
1,1,2014-04-07 10:54:09,214536500
2,1,2014-04-07 10:54:46,214536506
3,1,2014-04-07 10:57:00,214577561
4,2,2014-04-07 13:56:37,214662742


In [6]:
splitdate = max(clicks['ts']) - td(1)
item_count = clicks['item'].value_counts()

In [7]:
remain_sess = []
remain_item = set()
for _, group in clicks.groupby('sess', sort=False):
    print(group.iat[0, 0], end='\r')
    gi = group['item'].tolist()
    n = len(gi)
    stop = False
    if n > 1:
        for item in gi:
            if item_count[item] < 5:
                stop = True
                break
    else:
        stop = True
    if not stop:
        remain_sess.append((str(group.iat[0, 0]), group.iat[0, 1], gi))
        for item in gi:
            remain_item.add(item)

11299811

In [8]:
from sklearn.preprocessing import LabelEncoder

remain_item = list(remain_item)
item_enc = LabelEncoder()
item_enc.fit(remain_item)

LabelEncoder()

In [9]:
train_d = defaultdict(list)
test_d = defaultdict(list)
for sess, ts, items in remain_sess:
    print(sess, end='\r')
    items = item_enc.transform(items).tolist()
    if ts < splitdate:
        for i in range(1, len(items)):
            train_d[sess].append((items[: i], items[i]))
    else:
        for i in range(1, len(items)):
            test_d[sess].append((items[: i], items[i]))
with open('../data/interim/train.json', 'w') as f:
    json.dump(train_d, f)
with open('../data/interim/test.json', 'w') as f:
    json.dump(test_d, f)

11299811

In [23]:
from sklearn.preprocessing import OneHotEncoder

remain_item = item_enc.transform(remain_item).tolist()
item_enc = OneHotEncoder(sparse=False)
item_enc.fit([[item] for item in remain_item])
with open('../data/interim/onehotencoder.pkl', 'wb') as f:
    pickle.dump(item_enc, f)

## Prepare Input Data

In [None]:
# train, validation split

In [25]:
with open('../data/interim/train.json', 'r') as f:
    train_d = json.load(f)
with open('../data/interim/test.json', 'r') as f:
    test_d = json.load(f)
print(len(train_d), len(test_d))
with open('../data/interim/onehotencoder.pkl', 'rb') as f:
    item_enc = pickle.load(f)

In [28]:
from torch_geometric.data import Data, Dataset, DataLoader

In [141]:
class YooChooseDataset(Dataset):
    def __init__(self, d):
        super(YooChooseDataset, self).__init__()
        self.samples = self.add_from_dict(d)
        
    def add_from_dict(self, d):
        samples = []
        for dd in d.values():
            for data in dd:
                samples.append(data)
        return samples
                
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx): # edge_weight(1-dim; 각 edge당 weight 한개)를 GatedGraphConv에 kwarg로 넣어야됨 여기에도 있어야 할 듯
        x_ids, y = self.samples[idx]
        x_ids_ = list(set(x_ids))
        x_dict = {x_id: i for i, x_id in enumerate(x_ids_)}
        x = item_enc.transform([[x_id] for x_id in x_ids_])
        edge_dict = defaultdict(lambda: defaultdict(int)) # 얘네를 다 미리 저장해야 할 듯? processing이 한세월이겄네
        for i in range(len(x_ids) - 1):
            edge_dict[x_dict[x_ids[i]]][x_dict[x_ids[i + 1]]] += 1
        edge_index, edge_weights = [], []
        for o in edge_dict.keys():
            s = sum(edge_dict[o].values())
            for d in edge_dict[o].keys():
                edge_index.append([o, d])
                edge_weights.append(edge_dict[o][d] / s)
        x = torch.from_numpy(x)
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_weights = torch.tensor(edge_weights)
        y = torch.tensor([[y]])
        return Data(x, edge_index=edge_index, edge_weights=edge_weights, y=y)

In [127]:
print(type(test_dataset[46].x))
print(test_dataset[46].edge_index)
print(test_dataset[46].edge_weights)
print(test_dataset[46].y)

<class 'numpy.ndarray'>
tensor([[0, 1, 1, 1, 4, 5, 3, 3, 2],
        [1, 1, 4, 3, 5, 3, 1, 2, 1]])
tensor([1.0000, 0.5000, 0.2500, 0.2500, 1.0000, 1.0000, 0.5000, 0.5000, 1.0000])
tensor([[37290]])


In [140]:
test_dataset = YooChooseDataset(test_d)
test_loader = DataLoader(test_dataset, batch_size=1)
batch = next(iter(test_loader))

In [136]:
batch

Batch(batch=[1], edge_index=[0], edge_weights=[0], x=[1, 37821], y=[1])

In [None]:
# train_loader, val_loader, test_loader
train_loader = DataLoader(train_dataset, batch_size=2048)

## Construct Model

In [None]:
class SRGNN(nn.Module):
    def __init__(self, h_dim, n_otp):
        

In [None]:
import csv
with open('../data/raw/yoochoose-clicks.dat', 'r') as f:
    reader = csv.DictReader(f, delimiter=',')
    for i, data in enumerate(reader):
        print(data)
        if i == 4:
            break

In [None]:
buys.info()