In [1]:
import time
import math
import os
import random

import torch
import torch.nn as nn
import pandas as pd
import pickle
import numpy as np
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torch.nn.functional as F
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, StratifiedKFold

from brc_pytorch.layers import BistableRecurrentCell, NeuromodulatedBistableRecurrentCell
from brc_pytorch.layers import MultiLayerBase

from pytorch_lamb import Lamb
from adahessian import *
import gc

In [11]:
with open('./dataset/all_transaction_v4.pkl', 'rb') as f:
    all_transaction = pickle.load(f)

In [12]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [13]:
def clear_gpu(num):
    with torch.cuda.device(f'cuda:{num}'):
        torch.cuda.empty_cache()

In [14]:
def ndcg_k(batch_proba, batch_label, batch_isnan, topk=3):
    '''
    batch_proba = (b, m, 16)
    batch_label = (b, m, 16)
    batch_isnan = (b, m)
    '''
    batch_proba = batch_proba.cpu()
    batch_label = batch_label.cpu()
    batch_isnan = batch_isnan.cpu()
    
    # get topk indexes
    batch_pred_idx = batch_proba.argsort(dim=-1, descending=True)[:, :, :topk]
    batch_real_idx = batch_label.argsort(dim=-1, descending=True)[:, :, :topk]
    
    # get num of dcg and idcg with indexes
    bs = batch_proba.shape[0]
    ms = batch_proba.shape[1]
    batch_indexes = torch.repeat_interleave(torch.repeat_interleave(torch.arange(bs).unsqueeze(-1).unsqueeze(-1), topk, dim=-1), ms, dim=1)
    month_indexes = torch.repeat_interleave(torch.repeat_interleave(torch.arange(ms).unsqueeze(-1).unsqueeze(0), topk, dim=-1), bs, dim=0)
    
    batch_pred_amt = batch_label[batch_indexes, month_indexes, batch_pred_idx]
    batch_real_amt = batch_label[batch_indexes, month_indexes, batch_real_idx]

    # compute ndcg in shape (batch(user), month)
    denum = torch.log2((torch.arange(1, topk+1) + 1.0)).unsqueeze(0).unsqueeze(0)
    denum = torch.repeat_interleave(torch.repeat_interleave(denum, ms, dim=1), bs, dim=0)
    bm_dcg = (batch_pred_amt / denum).sum(dim=-1)    
    bm_idcg = (batch_real_amt / denum).sum(dim=-1)
    bm_ndcg = bm_dcg / bm_idcg
    
    # there will be nan months, because user have transactions not in 16 categories
    nan_month = torch.isnan(bm_ndcg)
    
    # replace ndcg of months with no transaction with 0
    not_avail_months = batch_isnan + nan_month
    bm_ndcg[not_avail_months] = 0.0
    
    #bm_ndcg[batch_isnan] = 0.0
    #bm_ndcg[nan_month] = 0.0
    
    # compute each user ndcg average by month, result in (b, )
    #avail_months = (batch_isnan != True).sum(dim=-1) - nan_month.sum(dim=-1)
    avail_months = (~not_avail_months).sum(dim=-1)
    avail_months_mask = (avail_months == 0.0)
    avail_months[avail_months_mask] = 1.0
    b_ndcg = bm_ndcg.sum(dim=-1) / avail_months

    return b_ndcg[~avail_months_mask]

In [15]:
def evaluation(model, loader, topk=3):
    model.eval()
    
    total_loss = []
    ndcgs = []
    total_batches = len(loader)
    
    with torch.no_grad():
        for batch_iter, (ids, xs, cs, ys) in enumerate(loader):
            batch_proba, batch_label, batch_isnan, loss = model(xs, cs, ys)            
            b_ndcg = ndcg_k(batch_proba, batch_label, batch_isnan, topk=topk)
            total_loss.append(loss.item())
            ndcgs.extend(b_ndcg.tolist())
                
            #with torch.cuda.device(f'{model.device.type}:{model.device.index}'):
            #    torch.cuda.empty_cache()
        
    avg_loss = np.mean(total_loss)
    avg_ndcg = np.mean(ndcgs)
    
    print('********* evaluation *********')
    print(f'avg loss: {avg_loss}')
    print(f'avg ndcg: {avg_ndcg}')
    print('******************************')
    
    return avg_loss, avg_ndcg


def train(model, opts, ids, xs, cs, ys):
    for opt in opts:
        opt.zero_grad()
    _, _, _, loss = model(xs, cs, ys)
    loss.backward()
    
    for opt in opts:
        opt.step()
    return loss.item()


def train_iter(model, train_dl, valid_dl, opts, print_ratio=0.1):
    start = time.time()
    train_dl.dataset.change_state('all_train', shuffle=True)
    #train_dl.dataset.change_state('train', shuffle=True)
    #train_dl.dataset.change_state('valid', shuffle=True)
    n_iters = len(train_dl)

    print_every = int(n_iters*print_ratio)
    if print_every == 0:
        print_every = 1
        
    print_loss_total = 0
    model.train()
    
    for batch_iter, (ids, xs, cs, ys) in enumerate(train_dl, 1):
        loss = train(model, opts, ids, xs, cs, ys)
        print_loss_total += loss
        
        if (batch_iter+1) % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(f'{timeSince(start, batch_iter/n_iters)} ({int(batch_iter)}, {int(batch_iter/n_iters * 100)}%) loss:{print_loss_avg:.4f}')
        
        #with torch.cuda.device(f'{model.device.type}:{model.device.index}'):
        #    torch.cuda.empty_cache()
    
    print('> train')
    valid_dl.dataset.change_state('all_train', shuffle=False)
    #valid_dl.dataset.change_state('train', shuffle=False)
    #valid_dl.dataset.change_state('valid', shuffle=False)
    _ = evaluation(model, valid_dl)
   
    #print('> valid')
    #valid_dl.dataset.change_state('all_valid', shuffle=False)
    #valid_dl.dataset.change_state('valid', shuffle=False)
    #valid_dl.dataset.change_state('train', shuffle=False)
    #_ = evaluation(model, valid_dl)
    
    if scheduler is not None:
        scheduler.step(valid_loss)    

In [16]:
def get_dataloader(dataset, batch_size=256, shuffle=False, drop_last=False):
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, 
                                       drop_last=drop_last, collate_fn=None)

In [17]:
class TransactionDataset():
    def __init__(self, fill_last_transaction=False):
        self.train_valid_split_index = 19
        self.max_seq_len = 24
        self.all_data = self.load_data()
        self.fold_indexes = self.split_cross_validation()
        self.fill_last_transaction = fill_last_transaction
        self.curr_fold = 0
        self.state = 'train'
    
    def change_fold(self, k):
        assert 0 <= k < 5, 'fold out of range'
        self.curr_fold = k
    
    def change_state(self, state, shuffle=False):
        self.state = state
        if shuffle:
            random.shuffle(self.fold_indexes[self.curr_fold][state])
        
    def load_data(self, ):
        global all_transaction
        
        all_data = []
        for i, (user_id, user_transaction) in enumerate(all_transaction.items()):
            if i % 10000 == 0:
                print('training_data: ', i)
            
            user_x = np.zeros((24, 49, 2))
            user_c = np.zeros((24, 15))
            #user_c = np.zeros((24, 49, 15))
            for m in user_transaction:
                for t in m:                    
                    user_id, month, shop_tag, count, amount, \
                    card_1, card_2, card_3, card_4, card_5, \
                    card_6, card_7, card_8, card_9, card_10, \
                    card_11, card_12, card_13, card_14, other_card = t
                    
                    count = abs(count)
                    month = int(month)-1
                    shop_tag_idx = (49 if shop_tag == 'other' else int(shop_tag))-1
                    user_x[month][shop_tag_idx][0] += amount
                    user_x[month][shop_tag_idx][1] += count
                    #user_c[month][shop_tag_idx] = [card_1, card_2, card_3, card_4, card_5, card_6, card_7, card_8, card_9, card_10,
                    #                               card_11, card_12, card_13, card_14, other_card]
                    
            user_y = user_x[1:, :, 0]
            all_data.append((user_id, user_x, user_c, user_y))
            
        # shuffle only once, and rerandomize
        random.seed(0)
        random.shuffle(all_data)
        random.seed()
        
        return all_data 
    
    def split_cross_validation(self, ):
        kf = KFold(n_splits=5, shuffle=False)
        fold_indexes = {}
        for curr_fold, (tr_idx, va_idx) in enumerate(kf.split(np.arange(len(self.all_data)))):
            fold_indexes[curr_fold] = {
                'all_train': np.arange(len(self.all_data)).tolist(),
                'all_valid': np.arange(len(self.all_data)).tolist(),
                'train': tr_idx.tolist(),
                'valid': va_idx.tolist(),
                'test': np.arange(len(self.all_data)).tolist()
            }
        return fold_indexes
        
    def __len__(self, ):    
        return len(self.fold_indexes[self.curr_fold][self.state])
        
    def __getitem__(self, idx):
        curr_index = self.fold_indexes[self.curr_fold][self.state][idx]
        curr_id, curr_x, curr_c, curr_y = self.all_data[curr_index]
        if self.state != 'test':
            curr_x = curr_x[:-1, :, :]
            curr_c = curr_c[:-1, :]
            
            '''
            if 'train' in self.state:
                curr_x = curr_x[:self.train_valid_split_index, :, :]
                curr_c = curr_c[:self.train_valid_split_index, :]
                curr_y = curr_y[:self.train_valid_split_index, :]
                
            elif 'valid' in self.state:
                curr_y = curr_y[self.train_valid_split_index:, :]
            '''
            
        return curr_id, curr_x, curr_c, curr_y

In [3]:
class RNNRecommend(nn.Module):
    def __init__(self, hidden_dim=256, num_layers=1, fillna=False, amount_only=False, topk_loss=3, device=torch.device('cpu')):
        super(RNNRecommend, self).__init__()
        self.device = device        
        self.full_item_size = 49
        self.input_size = (49 if amount_only else 49*2)# + 15
        self.hidden_dim = hidden_dim
        self.real_item = torch.tensor([2, 6, 10, 12, 13, 15, 18, 19, 21, 22, 25, 26, 36, 37, 39, 48]) - 1
        self.amount_only = amount_only
        self.topk = topk_loss
        
        self.rnn_input_size = self.input_size
        self.rnn_layer = nn.GRU(input_size=self.rnn_input_size, hidden_size=hidden_dim, 
                                num_layers=num_layers, batch_first=True)       
        
        self.linear_layer = nn.Sequential(
                                nn.Linear(hidden_dim, hidden_dim//4),
                                nn.ReLU(),
                                nn.Linear(hidden_dim//4, self.full_item_size),
                                nn.Softmax(dim=-1)
        )
        
        self.loss_fn = nn.BCELoss()
    
    def calc_cos_loss(self, x, y):
        cossim = 1.0-torch.cosine_similarity(x.softmax(dim=-1), y, dim=-1)
        return cossim.mean()
    
    def calc_bce_loss(self, x, y):
        if self.topk > 0:
            bs = y.shape[0]
            ms = y.shape[1]
            bi = torch.repeat_interleave(torch.repeat_interleave(torch.arange(bs).unsqueeze(-1).unsqueeze(-1), self.topk, dim=-1), ms, dim=1)
            mi = torch.repeat_interleave(torch.repeat_interleave(torch.arange(ms).unsqueeze(-1).unsqueeze(0), self.topk, dim=-1), bs, dim=0)
            topki = y.topk(self.topk, dim=-1)[1]
            y = y[bi, mi, topki]
            x = x[bi, mi, topki]
        
        loss = self.loss_fn(x, y)
#         y /= (y.sum(-1).unsqueeze(-1))
#         y[torch.isnan(y)] = 1/16
#         loss = (-(y * x.log_softmax(dim=-1)).sum(-1)).mean()        
        return loss
        
    def calc_rank_loss(self, x, y, margin=1.0):
        paired_x = (x.unsqueeze(3) - x.unsqueeze(2))
        paired_y = (y.unsqueeze(3) - y.unsqueeze(2))
        paired_y = (paired_y > 0)*1.0
        paired_y[paired_y == 0] = -1.0
        return F.relu(-paired_y*paired_x+margin).mean()
    
    def data_preprocess(self, x, y=None, c=None):
        px = (x / x.sum(dim=2).unsqueeze(2)).float().to(self.device)
        px[torch.isnan(px)] = 1/49

        if self.amount_only:
            px = px[:, :, :, 0]
        else:
            px = px.reshape(px.shape[0], px.shape[1], -1)
        result = (px, )
        
        if c is not None:
            pc = (c / c.sum(dim=-1).unsqueeze(-1)).float().to(self.device)
            pc[torch.isnan(pc)] = 1/15
            result += (pc, )
        
        if y is not None:
            py = (y / y.sum(dim=2).unsqueeze(2)).float().to(self.device) # (b, m, 49)
            py_nan = torch.isnan(py.sum(dim=-2).unsqueeze(-2))
            py_isnan = torch.isnan(py.sum(dim=-1))
            py[torch.isnan(py)] = 1/49
            result += (py, py_isnan, )
        return result
    
    def forward(self, x, c, y):
        px, py, py_isnan = self.data_preprocess(x, y=y)
        #px = torch.cat((px, c.float().to(self.device)), dim=-1)
        
        # model
        rnn_output, _ = self.rnn_layer(px)
        linear_output = self.linear_layer(rnn_output)
        
        # calc loss
        bce_loss = self.calc_bce_loss(linear_output[:, -py.shape[1]:, self.real_item], py[:, :, self.real_item])
        #rank_loss = self.calc_rank_loss(linear_output[:, :, self.real_item], py[:, :, self.real_item], margin=1.0)
        #loss = bce_loss# + rank_loss
        #loss = self.calc_cos_loss(linear_output[:, -py.shape[1]:, self.real_item], py[:, :, self.real_item])
        
        return linear_output[:, -py.shape[1]:, self.real_item], py[:, :, self.real_item], py_isnan, bce_loss

## Main

In [19]:
full_dataset = TransactionDataset()

training_data:  0
training_data:  10000
training_data:  20000
training_data:  30000
training_data:  40000
training_data:  50000
training_data:  60000
training_data:  70000
training_data:  80000
training_data:  90000
training_data:  100000
training_data:  110000
training_data:  120000
training_data:  130000
training_data:  140000
training_data:  150000
training_data:  160000
training_data:  170000
training_data:  180000
training_data:  190000
training_data:  200000
training_data:  210000
training_data:  220000
training_data:  230000
training_data:  240000
training_data:  250000
training_data:  260000
training_data:  270000
training_data:  280000
training_data:  290000
training_data:  300000
training_data:  310000
training_data:  320000
training_data:  330000
training_data:  340000
training_data:  350000
training_data:  360000
training_data:  370000
training_data:  380000
training_data:  390000
training_data:  400000
training_data:  410000
training_data:  420000
training_data:  430000
tr

In [11]:
#full_dataset.train_valid_split_index = 22

In [43]:
train_dl = get_dataloader(full_dataset, batch_size=256, shuffle=False, drop_last=True)
valid_dl = get_dataloader(full_dataset, batch_size=1024, shuffle=False, drop_last=False)
test_dl = get_dataloader(full_dataset, batch_size=1024, shuffle=False, drop_last=False)

In [6]:
#clear_gpu(1)
avail_device = torch.device('cpu')#torch.device('cuda:1')#
model = RNNRecommend(hidden_dim=512, num_layers=1, fillna=True, amount_only=False, 
                     topk_loss=-1, device=avail_device).to(avail_device)

model

RNNRecommend(
  (rnn_layer): GRU(98, 512, batch_first=True)
  (linear_layer): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=49, bias=True)
    (3): Softmax(dim=-1)
  )
  (loss_fn): BCELoss()
)

In [128]:
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = None

In [129]:
curr_fold = 0
train_dl.dataset.change_fold(curr_fold)
valid_dl.dataset.change_fold(curr_fold)
test_dl.dataset.change_fold(curr_fold)

## Training

In [130]:
total_epoch = 10
for i in range(total_epoch):
    print('===== epoch {} ====='.format(i+1))
    train_iter(model, train_dl, valid_dl, [opt], print_ratio=0.1)
    torch.save(model.state_dict(), f'./model_ckpt/abs_neg_cnt_celoss_yrenorm_all_data_e{i}.ckpt')

===== epoch 1 =====
0m 3s (- 0m 35s) (194, 9%) loss:2.2974
0m 7s (- 0m 30s) (389, 19%) loss:2.1969
0m 11s (- 0m 27s) (584, 29%) loss:2.1793
0m 15s (- 0m 23s) (779, 39%) loss:2.1784
0m 19s (- 0m 19s) (974, 49%) loss:2.1760
0m 23s (- 0m 15s) (1169, 59%) loss:2.1682
0m 27s (- 0m 11s) (1364, 69%) loss:2.1713
0m 31s (- 0m 8s) (1559, 79%) loss:2.1706
0m 35s (- 0m 4s) (1754, 89%) loss:2.1676
0m 39s (- 0m 0s) (1949, 99%) loss:2.1668
> train
********* evaluation *********
avg loss: 2.166196395039315
avg ndcg: 0.6907492405973927
******************************
===== epoch 2 =====
0m 4s (- 0m 40s) (194, 9%) loss:2.1522
0m 8s (- 0m 33s) (389, 19%) loss:2.1673
0m 12s (- 0m 28s) (584, 29%) loss:2.1664
0m 16s (- 0m 24s) (779, 39%) loss:2.1617
0m 19s (- 0m 20s) (974, 49%) loss:2.1609
0m 23s (- 0m 15s) (1169, 59%) loss:2.1635
0m 27s (- 0m 11s) (1364, 69%) loss:2.1664
0m 31s (- 0m 7s) (1559, 79%) loss:2.1618
0m 35s (- 0m 3s) (1754, 89%) loss:2.1623
0m 38s (- 0m 0s) (1949, 99%) loss:2.1606
> train
0m 8s (

In [None]:
#valid_dl.dataset.change_state('valid', shuffle=False)
#evaluation(model, valid_dl)

In [123]:
#torch.save(model.state_dict(), f'./model_ckpt/second_try_fold{curr_fold}_e{total_epoch}.ckpt')
#torch.save(model.state_dict(), f'./model_ckpt/gru2_abs_neg_cnt_all_data_e{total_epoch}.ckpt')
#torch.save(model.state_dict(), f'./model_ckpt/abs_neg_cnt_celoss_yrenorm_all_data_e{total_epoch}.ckpt')

## Predict Test data

In [132]:
test_dl.dataset.change_state('test', shuffle=False)
load_epoch = 5 - 1
model.load_state_dict(torch.load(f'./model_ckpt/abs_neg_cnt_celoss_yrenorm_all_data_e{load_epoch}.ckpt', map_location=avail_device))
model.eval()
topk = 3

all_predict = []
with torch.no_grad():
    for batch_iter, (ids, xs, cs, ys) in enumerate(test_dl):
        px, = model.data_preprocess(xs)
        #px, pc = model.data_preprocess(xs, c=cs)
        #px_amt_c = (px[:, :, :49].unsqueeze(-1) * pc).reshape(-1, 49, pc.shape[-1]).permute(2, 0, 1)
        #px_cnt_c = (px[:, :, 49:].unsqueeze(-1) * pc).reshape(-1, 49, pc.shape[-1]).permute(2, 0, 1)
        
        #fake_batch = px.shape[0]*px.shape[1]
        #_, px_amt_mix = model.card_amt_attn(model.amt_cls_token(torch.zeros(fake_batch, ).long().to(avail_device)), px_amt_c, px_amt_c)
        #_, px_cnt_mix = model.card_cnt_attn(model.cnt_cls_token(torch.zeros(fake_batch, ).long().to(avail_device)), px_cnt_c, px_cnt_c)
        
        #px_mix = torch.cat((px_amt_mix.reshape(px.shape[0], px.shape[1], 49), 
        #                    px_cnt_mix.reshape(px.shape[0], px.shape[1], 49)), dim=-1)
        
        rnn_output, _ = model.rnn_layer(px)
        #rnn_output, _ = model.rnn_layer(px_mix)
        linear_output = model.linear_layer(rnn_output[:, -1, :])[:, model.real_item]
        topk_index = linear_output.argsort(dim=-1, descending=True)[:, :topk]
        topk_shop_tag = model.real_item[topk_index.cpu()] + 1
        batch_predict = torch.cat((ids.long().unsqueeze(-1), topk_shop_tag), dim=-1).tolist()
        all_predict.extend(batch_predict)

In [133]:
df = pd.DataFrame(all_predict, columns=['chid', 'top1', 'top2', 'top3'])

In [134]:
#df.to_csv(f"./submits/basic_brc_fold{curr_fold}_e{total_epoch}.csv", index=False)
#df.to_csv(f"./submits/gru2_abs_neg_cnt_all_data_e{total_epoch}.csv", index=False)
df.to_csv(f"./submits/abs_neg_cnt_celoss_yrenorm_all_data_e{load_epoch}.csv", index=False)

## K-Fold Ensemble

In [None]:
model_names = [f'./model_ckpt/transaction_with_user_emb_fold{i}_e10.ckpt' for i in range(5)]
ensemble_models = []

for name in model_names:
    curr_model = RNNRecommend(hidden_dim=512, num_layers=1, device=avail_device).to(avail_device)
    curr_model.load_state_dict(torch.load(name, map_location=avail_device))
    curr_model.eval()
    ensemble_models.append(curr_model)
    
test_dl.dataset.test()
topk = 3

ensemble_predict = []
with torch.no_grad():
    for batch_iter, (ids, xs, es, ys) in enumerate(test_dl):
        
        model_outs = []
        for curr_model in ensemble_models:
            px = curr_model.data_preprocess(xs)[0]
            pe = curr_model.user_embedding(curr_model.user_encoding(es).reshape(es.shape[0], es.shape[1], -1))
            px = torch.cat((px, pe), dim=-1)
            rnn_output, _ = curr_model.rnn_layer(px)
            linear_output = model.linear_layer(rnn_output[:, -1, :])[:, model.real_item].detach()
            model_outs.append(linear_output)
            
        model_outs = sum(model_outs)/len(ensemble_models)
        topk_index = model_outs.argsort(dim=-1, descending=True)[:, :topk]
        topk_shop_tag = curr_model.real_item[topk_index.cpu()] + 1
        batch_predict = torch.cat((ids.long().unsqueeze(-1), topk_shop_tag), dim=-1).tolist()
        ensemble_predict.extend(batch_predict)

In [None]:
df.to_csv(f"./submits/transaction_with_user_emb_fold_kfe.csv", index=False)