In [27]:
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import logging
from sklearn.model_selection import KFold

from copy import deepcopy

#from gensim.models import Word2Vec

import warnings

# Silence every Python warning for the rest of the session
# (note: this also hides deprecation notices from the libraries above).
warnings.filterwarnings(action='ignore')
# Print tensors in scientific notation.
torch.set_printoptions(sci_mode=True)

In [5]:
import gc

# Run a garbage-collection pass first, then ask PyTorch to empty its CUDA
# memory cache.
gc.collect()
torch.cuda.empty_cache()

In [6]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU,
    the current CUDA device and all CUDA devices), then switches cuDNN to
    deterministic mode with auto-tuning (benchmark) turned off.

    Args:
        seed: Integer seed passed unchanged to every generator.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

## Graph

In [7]:
from datasets import *


In [10]:
# Fix all RNG seeds before any data or model is created.
seed_everything(22)
# Set to False to force CPU execution even when a GPU is present.
use_cuda_if_available = True
    
# CUDA is used only when it is both available and allowed by the flag above.
use_cuda: bool = torch.cuda.is_available() and use_cuda_if_available
device = torch.device("cuda" if use_cuda else "cpu")

In [12]:
# Dataset location. The LIGHTSGCN_DATA_DIR environment variable overrides the
# original author-specific absolute path, which is kept as the fallback so the
# cell behaves exactly as before when the variable is not set.
data_dir = os.environ.get("LIGHTSGCN_DATA_DIR", "/home/minseo/Naver_Ai/data")
# `prepare_dataset` is not defined in this notebook (presumably it comes from
# `from datasets import *`). Later cells index the returned splits with the
# keys "edge" and "label", and use `n_node` as the number of graph nodes.
train_data, valid_data, test_data, n_node = prepare_dataset(device=device, data_dir=data_dir, return_origin_train=False)

2024-01-20 18:39:16,567 - root - INFO - Train Dataset Info
2024-01-20 18:39:16,568 - root - INFO -  * Num. Users    : 7442
2024-01-20 18:39:16,568 - root - INFO -  * Max. UserID   : 7441
2024-01-20 18:39:16,569 - root - INFO -  * Num. Items    : 9454
2024-01-20 18:39:16,570 - root - INFO -  * Num. Records  : 2187529
2024-01-20 18:39:16,605 - root - INFO - val Dataset Info
2024-01-20 18:39:16,606 - root - INFO -  * Num. Users    : 7441
2024-01-20 18:39:16,606 - root - INFO -  * Max. UserID   : 7440
2024-01-20 18:39:16,607 - root - INFO -  * Num. Items    : 9454
2024-01-20 18:39:16,607 - root - INFO -  * Num. Records  : 288433
2024-01-20 18:39:16,608 - root - INFO - Test Dataset Info
2024-01-20 18:39:16,608 - root - INFO -  * Num. Users    : 744
2024-01-20 18:39:16,609 - root - INFO -  * Max. UserID   : 7439
2024-01-20 18:39:16,609 - root - INFO -  * Num. Items    : 444
2024-01-20 18:39:16,610 - root - INFO -  * Num. Records  : 744


## Model LightSGCN

In [14]:
from typing import Optional, Tuple
import scipy.sparse
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from lightgcn_sgcn.BPRloss import BPRLoss
from sklearn.metrics import accuracy_score, roc_auc_score
from lightgcn_sgcn.SignedConv import SignedConv
from torch_geometric.utils import (
    coalesce,
    negative_sampling,
    structured_negative_sampling,
)


class SignedGCN(torch.nn.Module):
    r"""Signed graph convolutional network with a learnable node-embedding table.

    Adapted from the `"Signed Graph Convolutional Network"
    <https://arxiv.org/abs/1808.06354>`_ model. Message passing is performed
    by the ``SignedConv`` operator imported from ``lightgcn_sgcn.SignedConv``.
    Two things differ from the reference model:

    * Input features are not supplied by the caller: the module owns a
      Xavier-initialised parameter ``x`` of shape ``(num_nodes, in_channels)``.
    * Every layer output is scaled by a learnable scalar taken from ``alpha``
      (each initialised to ``1 / (num_layers + 1)``) before it is added to the
      running embedding.

    Args:
        in_channels (int): Width of the learnable node-embedding table.
        hidden_channels (int): Size of each hidden sample. Every convolution
            is built with ``hidden_channels // 2`` output channels.
        num_layers (int): Number of layers (one first-aggregation layer plus
            ``num_layers - 1`` deeper layers).
        lamb (float, optional): Stored on the instance as ``lamb``; none of
            the losses defined in this class read it. (default: :obj:`5`)
        bias (bool, optional): Accepted for signature compatibility; it is not
            forwarded to any layer in this implementation.
            (default: :obj:`True`)
        num_nodes (int, optional): Number of rows of the embedding table,
            i.e. the number of graph nodes. The default reproduces the value
            that used to be hard-coded. (default: :obj:`16896`)

    Raises:
        ValueError: If ``num_nodes`` is not positive.
    """
    def __init__(
        self,
        in_channels: int,
        hidden_channels: int,
        num_layers: int,
        lamb: float = 5,
        bias: bool = True,
        num_nodes: int = 16896,
    ):
        super().__init__()

        if num_nodes <= 0:
            raise ValueError(
                f"num_nodes must be a positive integer, but got {num_nodes}")

        self.in_channels = in_channels
        self.num_nodes = num_nodes
        # Learnable node embeddings; filled in by reset_parameters().
        self.x = nn.Parameter(torch.empty(num_nodes, in_channels),
                              requires_grad=True)

        # LightGCN-style learnable per-layer weights. num_layers + 1 scalars
        # are created, but forward() only reads indices 0 .. num_layers - 1.
        alpha = 1. / (num_layers + 1)
        self.alpha = nn.ParameterList()
        for _ in range(num_layers + 1):
            self.alpha.append(nn.Parameter(torch.tensor([alpha])))
        self.hidden_channels = hidden_channels
        self.num_layers = num_layers
        self.lamb = lamb

        self.conv1 = SignedConv(in_channels, hidden_channels // 2,
                                first_aggr=True)
        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers - 1):
            self.convs.append(
                SignedConv(hidden_channels // 2, hidden_channels // 2,
                           first_aggr=False))

        # The edge classifier sees the two endpoint embeddings concatenated.
        # NOTE(review): this assumes each SignedConv returns `hidden_channels`
        # features per node -- confirm against lightgcn_sgcn.SignedConv.
        self.lin = torch.nn.Linear(2 * hidden_channels, 1)
        self.dropout = torch.nn.Dropout(0.2)
        self.reset_parameters()

    def reset_parameters(self):
        r"""Resets all learnable parameters of the module.

        The embedding table is re-drawn with Xavier-uniform initialisation,
        the per-layer weights ``alpha`` are restored to
        ``1 / (num_layers + 1)``, and every convolution and the final linear
        layer are asked to reset themselves.
        """
        nn.init.xavier_uniform_(self.x)
        with torch.no_grad():
            for weight in self.alpha:
                weight.fill_(1. / (self.num_layers + 1))
        self.conv1.reset_parameters()
        for conv in self.convs:
            conv.reset_parameters()
        self.lin.reset_parameters()

    def create_spectral_features(
        self,
        pos_edge_index: Tensor,
        neg_edge_index: Tensor,
        num_nodes: Optional[int] = None,
    ) -> Tensor:
        r"""Creates :obj:`in_channels` spectral node features based on
        positive and negative edges.

        A symmetric sparse matrix is built from both edge sets and factorised
        with a truncated SVD; the transposed components are returned on the
        device of :obj:`pos_edge_index`. This method is not called by
        :meth:`forward`.

        Args:
            pos_edge_index (LongTensor): The positive edge indices.
            neg_edge_index (LongTensor): The negative edge indices.
            num_nodes (int, optional): The number of nodes, *i.e.*
                :obj:`max_val + 1` of :attr:`pos_edge_index` and
                :attr:`neg_edge_index`. (default: :obj:`None`)
        """
        from sklearn.decomposition import TruncatedSVD

        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=1)
        N = edge_index.max().item() + 1 if num_nodes is None else num_nodes
        edge_index = edge_index.to(torch.device('cpu'))

        # Positive edges carry 2 and negative edges 0 so that, after the
        # "- 1" shift below, they become +1 and -1 respectively.
        pos_val = torch.full((pos_edge_index.size(1), ), 2, dtype=torch.float)
        neg_val = torch.full((neg_edge_index.size(1), ), 0, dtype=torch.float)
        val = torch.cat([pos_val, neg_val], dim=0)
        # Add the reversed edges so the matrix is symmetric.
        row, col = edge_index
        edge_index = torch.cat([edge_index, torch.stack([col, row])], dim=1)
        val = torch.cat([val, val], dim=0)

        # Merge duplicate entries before shifting the values.
        edge_index, val = coalesce(edge_index, val, num_nodes=N)
        val = val - 1

        # Borrowed from:
        # https://github.com/benedekrozemberczki/SGCN/blob/master/src/utils.py
        edge_index = edge_index.detach().numpy()
        val = val.detach().numpy()
        A = scipy.sparse.coo_matrix((val, edge_index), shape=(N, N))
        svd = TruncatedSVD(n_components=self.in_channels, n_iter=128)
        svd.fit(A)
        x = svd.components_.T
        return torch.from_numpy(x).to(torch.float).to(pos_edge_index.device)

    def forward(
        self,
        x: Tensor,
        pos_edge_index: Tensor,
        neg_edge_index: Tensor,
    ) -> Tensor:
        """Computes node embeddings :obj:`z` based on positive edges
        :obj:`pos_edge_index` and negative edges :obj:`neg_edge_index`.

        Args:
            x (torch.Tensor): Ignored. It is kept only so existing callers
                keep working; the module always uses its own learnable
                embedding table ``self.x`` as input features.
            pos_edge_index (torch.Tensor): The positive edge indices.
            neg_edge_index (torch.Tensor): The negative edge indices.

        Returns:
            torch.Tensor: One embedding row per row of ``self.x``.
        """
        x = self.x
        # First layer output, scaled by its learnable weight.
        z = self.alpha[0] * self.conv1(x, pos_edge_index, neg_edge_index)
        z = self.dropout(z)
        # Deeper layers are added residually, each with its own weight.
        for i, conv in enumerate(self.convs):
            z = z + conv(z, pos_edge_index, neg_edge_index) * self.alpha[i + 1]
            z = self.dropout(z)
        return z

    def discriminate(self, z: Tensor, edge_index: Tensor) -> Tensor:
        """Scores node pairs with the linear edge classifier.

        The embeddings of both endpoints are concatenated, passed through
        ``self.lin`` and squashed with a sigmoid.

        Args:
            z (torch.Tensor): The node embeddings.
            edge_index (torch.Tensor): The edge indices; row 0 holds one
                endpoint of every pair and row 1 the other.

        Returns:
            torch.Tensor: One probability per pair, shape ``(num_pairs, 1)``.
        """
        value = torch.cat([z[edge_index[0]], z[edge_index[1]]], dim=1)
        value = self.lin(value)
        return torch.sigmoid(value)

    def loss_bce(
        self,
        z: Tensor,
        edge: dict
    ) -> Tensor:
        """Computes the mean binary cross-entropy of the edge predictions.

        Args:
            z (torch.Tensor): The node embeddings.
            edge (dict): Mapping with the keys ``'edge'`` (edge indices passed
                to :meth:`discriminate`) and ``'label'`` (one target per edge).

        Returns:
            torch.Tensor: Scalar loss.
        """
        logit = self.discriminate(z, edge['edge'])
        # Match the (num_pairs, 1) layout produced by discriminate().
        label = edge['label'].view(-1, 1).float()
        bceloss = torch.nn.BCELoss(reduction="mean")

        return bceloss(logit, label)

    def loss_bpr(
        self,
        z: Tensor,
        edge: dict
    ) -> Tensor:
        """Computes the project's ``BPRLoss`` on the edge predictions.

        Args:
            z (torch.Tensor): The node embeddings.
            edge (dict): Mapping with the keys ``'edge'`` (edge indices passed
                to :meth:`discriminate`) and ``'label'`` (one target per edge).

        Returns:
            torch.Tensor: Whatever ``BPRLoss`` returns for the predictions
            and the ``(num_pairs, 1)`` float labels.
        """
        logit = self.discriminate(z, edge['edge'])
        label = edge['label'].view(-1, 1).float()
        BprLoss = BPRLoss()

        return BprLoss(logit, label)

    def test(
        self,
        z: Tensor,
        edge: dict,
    ) -> Tuple[float, float]:
        """Evaluates node embeddings :obj:`z` on labelled edges.

        Args:
            z (torch.Tensor): The node embeddings.
            edge (dict): Mapping with the keys ``'edge'`` (edge indices) and
                ``'label'`` (one binary target per edge).

        Returns:
            Tuple[float, float]: ``(auc, acc)`` -- the ROC-AUC of the
            predicted probabilities and the accuracy at a 0.5 threshold.
        """
        from sklearn.metrics import accuracy_score, roc_auc_score

        with torch.no_grad():
            logit = self.discriminate(z, edge['edge'])

        # Hand scikit-learn flat NumPy arrays rather than a column-shaped
        # torch tensor, so predictions and labels have the same 1-D layout.
        score = logit.detach().cpu().numpy().reshape(-1)
        label = edge['label'].cpu().numpy().reshape(-1)
        acc = accuracy_score(y_true=label, y_pred=score > 0.5)
        auc = roc_auc_score(y_true=label, y_score=score)
        return auc, acc

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}({self.in_channels}, '
                f'{self.hidden_channels}, num_layers={self.num_layers})')


# 모델

## Model hyperparameters

In [30]:
# Hyperparameters for the SignedGCN run below.
in_channels = 1          # width of SignedGCN's learnable node-embedding table
hidden_channels=2        # each SignedConv is built with hidden_channels // 2 output channels
num_layer = 3            # passed to SignedGCN as `num_layers`
lamb = 5                 # forwarded to SignedGCN, which only stores it
bias = True              # NOTE(review): the model-creation cell passes the literal True, not this variable
learning_rate = 1e-5     # Adam learning rate used by the training cell
epochs = 10              # upper bound on training epochs (early stopping may end sooner)
num_nodes = n_node       # node count returned by prepare_dataset

## 모델 생성

In [31]:
# Build the model from the hyperparameter cell (the configured `bias` value is
# now forwarded instead of a literal) and move it to the selected device so
# its parameters live on the same device that was handed to prepare_dataset.
model = SignedGCN(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    num_layers=num_layer,
    lamb=lamb,
    bias=bias,
).to(device)

In [32]:
def edge_split_by_sign(edges):
    """Split a labelled edge set into positive and negative edge indices.

    Args:
        edges: Mapping with ``'edge'`` (tensor whose rows 0 and 1 hold the two
            endpoints of every edge) and ``'label'`` (one label per edge).

    Returns:
        ``(pos_edges, neg_edges)``: two tensors with two rows each, holding
        the edges labelled 1 and the edges labelled 0 respectively.
    """
    endpoints = edges['edge']

    def _pick(sign):
        # Filter sources and destinations separately, then re-stack them.
        sources = endpoints[0, :][edges['label'] == sign]
        targets = endpoints[1, :][edges['label'] == sign]
        return torch.stack((sources, targets))

    return _pick(1), _pick(0)

In [33]:
def train(model: nn.Module, train_data: dict, optimizer: torch.optim.Optimizer, embedding:torch.Tensor):
    """Run one full-graph optimisation step and return evaluation embeddings.

    The optimisation step uses a train-mode forward pass. Afterwards the
    embeddings are recomputed in eval mode with the updated weights, so the
    logged train AUC and the returned embeddings are free of dropout noise.
    (Previously the train-mode, pre-update embeddings were reused even though
    the model had already been switched to eval mode.)

    Args:
        model: Model exposing ``__call__(embedding, pos_edge, neg_edge)``,
            ``loss_bce(z, data)`` and ``test(z, data)``.
        train_data: Mapping with the keys ``'edge'`` and ``'label'``.
        optimizer: Optimizer over the model's parameters.
        embedding: Input features forwarded to the model.

    Returns:
        ``(next_embedding, loss)``: the eval-mode node embeddings computed
        after the update (no gradient attached) and the training loss tensor.
    """
    model.train()
    optimizer.zero_grad()
    pos_edge, neg_edge = edge_split_by_sign(train_data)
    train_embedding = model(embedding, pos_edge, neg_edge)
    loss = model.loss_bce(train_embedding, train_data)
    # backward
    loss.backward()
    optimizer.step()

    # Re-embed deterministically with the freshly updated parameters.
    model.eval()
    with torch.no_grad():
        next_embedding = model(embedding, pos_edge, neg_edge)
        auc, acc = model.test(next_embedding, train_data)

    logger.info("TRAIN LOSS : %.4f, Train AUC : %.4f", loss.item(), auc)
    return next_embedding, loss


def validate(valid_data: dict, model: nn.Module, embedding:torch.Tensor):
    """Score precomputed node embeddings on the validation edges.

    Args:
        valid_data: Labelled validation edges, passed straight to
            ``model.test``.
        model: Model exposing ``eval()`` and ``test(z, data)``; ``test`` must
            return ``(auc, acc)``.
        embedding: Node embeddings to evaluate.

    Returns:
        The validation AUC reported by ``model.test``.
    """
    model.eval()
    with torch.no_grad():
        # Only the AUC is reported. The positive/negative edge split that was
        # computed here before was never used, so it has been dropped.
        auc, _ = model.test(embedding, valid_data)

    logger.info("VALID AUC : %.4f", auc)
    return auc

In [23]:
# NOTE(review): `logger`, `wandb`, `model_dir` and `patience` are used below but
# are not defined in any cell visible in this notebook -- presumably they come
# from `from datasets import *` or a removed cell; confirm on a fresh kernel.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
best_auc, best_epoch = 0, -1
early_stopping_counter = 0
# Follow the configured device instead of assuming CUDA is present.
# (SignedGCN.forward ignores this tensor and uses its own embedding table.)
input_feature = torch.rand((num_nodes, 32), device=device)
# Make sure the checkpoint directory exists before the first torch.save.
os.makedirs(model_dir, exist_ok=True)
for e in range(epochs):
    logger.info("Epoch: %s", e)
    # TRAIN
    node_embedding, loss = train(model, train_data, optimizer, input_feature)

    # VALID
    auc = validate(valid_data, model, node_embedding)

    wandb.log(dict(valid_auc_epoch=auc))

    if auc > best_auc:
        logger.info("Best model updated AUC from %.4f to %.4f", best_auc, auc)
        best_auc, best_epoch = auc, e
        torch.save(obj={"model": model.state_dict(), "epoch": e + 1},
                   f=os.path.join(model_dir, f"best_model.pt"))

        # Refresh the submission file whenever the validation AUC improves.
        with torch.no_grad():
            pred = model.discriminate(node_embedding, edge_index=test_data["edge"])
            pred = pred.flatten().detach().cpu().numpy()
            os.makedirs(name="./submit/", exist_ok=True)
            write_path = os.path.join("./submit/", "submission_t.csv")
            pd.DataFrame({"prediction": pred}).to_csv(path_or_buf=write_path, index_label="id")

        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        # Stop once `patience` consecutive epochs failed to improve the AUC.
        if early_stopping_counter >= patience:
            break
# Always keep the final weights as well, whether or not they are the best.
torch.save(obj={"model": model.state_dict(), "epoch": e + 1},
           f=os.path.join(model_dir, f"last_model.pt"))
logger.info(f"Best Weight Confirmed : {best_epoch+1}'th epoch")

SignedGCN(1, 2, num_layers=3)

# 학습 함수

# 학습

In [9]:
import math
from torch.optim.lr_scheduler import _LRScheduler

class CosineAnnealingWarmUpRestarts(_LRScheduler):
    """Cosine annealing with linear warm-up and warm restarts.

    Inside a cycle of ``T_i`` steps the learning rate of every parameter
    group rises linearly from its base lr to ``eta_max`` during the first
    ``T_up`` steps and then follows a half cosine from ``eta_max`` back
    towards the base lr. When a cycle ends the schedule restarts, the peak
    is scaled by ``gamma`` and the annealing part of the cycle is stretched
    by ``T_mult``.

    Args:
        optimizer: Wrapped optimizer.
        T_0: Length of the first cycle (positive int).
        T_mult: Growth factor of the cycle length (int >= 1).
        eta_max: Peak learning rate of the first cycle.
        T_up: Number of linear warm-up steps per cycle (non-negative int).
        gamma: Factor applied to the peak learning rate after each restart.
        last_epoch: Index of the last epoch; ``-1`` starts from scratch.

    Raises:
        ValueError: If ``T_0``, ``T_mult`` or ``T_up`` fail the checks above.
    """

    def __init__(self, optimizer, T_0, T_mult=1, eta_max=0.1, T_up=0, gamma=1., last_epoch=-1):
        if T_0 <= 0 or not isinstance(T_0, int):
            raise ValueError("Expected positive integer T_0, but got {}".format(T_0))
        if T_mult < 1 or not isinstance(T_mult, int):
            raise ValueError("Expected integer T_mult >= 1, but got {}".format(T_mult))
        if T_up < 0 or not isinstance(T_up, int):
            raise ValueError("Expected positive integer T_up, but got {}".format(T_up))

        # Fixed configuration.
        self.T_0, self.T_mult, self.T_up = T_0, T_mult, T_up
        self.base_eta_max = eta_max
        self.gamma = gamma

        # Mutable per-cycle state.
        self.eta_max = eta_max
        self.T_i = T_0
        self.cycle = 0
        self.T_cur = last_epoch

        # The base-class constructor performs the initial step(), so all of
        # the state above has to exist before it runs.
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        position = self.T_cur
        if position == -1:
            return self.base_lrs

        if position < self.T_up:
            # Linear warm-up from the base lr towards the current peak.
            return [
                (self.eta_max - floor_lr)*position / self.T_up + floor_lr
                for floor_lr in self.base_lrs
            ]

        # Cosine decay from the peak back towards the base lr.
        angle = math.pi * (position-self.T_up) / (self.T_i - self.T_up)
        one_plus_cos = 1 + math.cos(angle)
        return [
            floor_lr + (self.eta_max - floor_lr) * one_plus_cos / 2
            for floor_lr in self.base_lrs
        ]

    def step(self, epoch=None):
        if epoch is not None:
            # Explicit epoch: derive cycle index and position from scratch.
            if epoch >= self.T_0:
                if self.T_mult == 1:
                    self.T_cur = epoch % self.T_0
                    self.cycle = epoch // self.T_0
                else:
                    n = int(math.log((epoch / self.T_0 * (self.T_mult - 1) + 1), self.T_mult))
                    self.cycle = n
                    self.T_cur = epoch - self.T_0 * (self.T_mult ** n - 1) / (self.T_mult - 1)
                    self.T_i = self.T_0 * self.T_mult ** (n)
            else:
                self.T_i = self.T_0
                self.T_cur = epoch
        else:
            # Implicit epoch: advance one step and restart when the cycle ends.
            epoch = self.last_epoch + 1
            self.T_cur = self.T_cur + 1
            if self.T_cur >= self.T_i:
                self.cycle += 1
                self.T_cur = self.T_cur - self.T_i
                self.T_i = (self.T_i - self.T_up) * self.T_mult + self.T_up

        self.eta_max = self.base_eta_max * (self.gamma**self.cycle)
        self.last_epoch = math.floor(epoch)
        new_lrs = self.get_lr()
        for group, new_lr in zip(self.optimizer.param_groups, new_lrs):
            group['lr'] = new_lr

In [10]:
# Configuration for the SASRec out-of-fold (OOF) ensemble cells below.
# NOTE: `epochs` and `device` rebind names that the SignedGCN cells above also
# define, so the result depends on the order in which cells are executed.
batch_size = 64
epochs = 10
lr = 1e-05
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model size (all forwarded to the SASRec constructor).
emb_size = 64
hidden_units = 128
num_heads = 2 # candidate values: 2, 4, 8, 16, 32
num_layers = 1
dropout_rate = 0.5
num_workers = 8  # DataLoader worker processes

# NOTE(review): the next three settings are not referenced by any visible
# cell -- presumably consumed by helpers defined elsewhere; confirm.
max_len = 50
window = 10
data_augmentation = False

# Input / output locations.
DATA_PATH = './data'
MODEL_PATH = './model'
SUBMISSION_PATH = './submission'

# Checkpoint file name (prefixed with `oof_<fold>_` when saved) and the
# submission file name (prefixed with `OOF-Ensemble-` when written).
model_name = 'Transformer-and-LSTM-Encoder-Decoder-each-Embedding-num_heads-2-Scheduler.pt'
submission_name = 'hungry_mental.csv'

In [11]:
# Create the checkpoint directory, including any missing parents. This is a
# no-op when the directory already exists and avoids the check-then-create race.
os.makedirs(MODEL_PATH, exist_ok=True)

In [12]:
# Create the submission directory, including any missing parents. This is a
# no-op when the directory already exists and avoids the check-then-create race.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [13]:
# Dataset helper built from DATA_PATH. Later cells read its fold splits
# (`oof_user_set`, `get_oof_data`), its test split (`get_test_data`) and the
# vocabulary sizes passed to SASRec. `MakeDataset` is not defined in this
# notebook -- presumably it comes from `from datasets import *`.
make_dataset = MakeDataset(DATA_PATH = DATA_PATH)

# OOF Ensemble

In [14]:
# Out-of-fold training: for every fold key a fresh SASRec model is trained,
# the weights with the best validation ROC-AUC are checkpointed, and the mean
# of the per-fold best scores is printed at the end.
#
# NOTE(review): `train(...)` is called below with `data_loader=` / `criterion=`
# keywords, which does not match the `train(model, train_data, optimizer,
# embedding)` function defined earlier in this notebook. `evaluate`, `SASRec`,
# `CustomDataset` and `train_make_batch` are not defined in any visible cell
# either -- presumably they come from another module; confirm.
oof_roc_auc = 0

for oof in make_dataset.oof_user_set.keys():
    train_df, valid_df = make_dataset.get_oof_data(oof)
    
    # Use a different but reproducible seed for every fold.
    seed_everything(22 + oof)
    
    train_dataset = CustomDataset(df = train_df,)
    train_data_loader = DataLoader(
        train_dataset, 
        batch_size = batch_size, 
        shuffle = True, 
        drop_last = False,
        collate_fn = train_make_batch,
        num_workers = num_workers)

    # Validation runs one sample per batch, in a fixed order.
    valid_dataset = CustomDataset(df = valid_df)
    valid_data_loader = DataLoader(
        valid_dataset, 
        batch_size = 1, 
        shuffle = False, 
        drop_last = False,
        collate_fn = train_make_batch,
        num_workers = num_workers)

    model = SASRec(
        num_assessmentItemID = make_dataset.num_assessmentItemID, 
        num_testId = make_dataset.num_testId,
        num_KnowledgeTag = make_dataset.num_KnowledgeTag,
        num_cols = train_dataset.num_cols,
        cat_cols = train_dataset.cat_cols,
        emb_size = emb_size,
        hidden_units = hidden_units,
        num_heads = num_heads,
        num_layers = num_layers,
        dropout_rate = dropout_rate,
        device = device).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr = lr)
    criterion = nn.BCELoss()
    # One scheduler cycle spans all `epochs`; the first 3 epochs warm up from
    # `lr` towards eta_max, the remaining ones anneal back towards `lr`.
    scheduler = CosineAnnealingWarmUpRestarts(optimizer, T_0 = epochs, T_mult = 1, eta_max = 0.01,  T_up = 3, gamma=0.5)

    # Disabled experiment: initialise the item embeddings from a pretrained
    # Word2Vec model.
    # pre_emb = Word2Vec.load(os.path.join(MODEL_PATH, 'Word2Vec_Embedding_Model_window_50.model'))

    # assessmentItemID_li = make_dataset.assessmentItemID2idx.keys()

    # with torch.no_grad():
    #     for assessmentItemID in assessmentItemID_li:
    #         idx = make_dataset.assessmentItemID2idx[assessmentItemID]
    #         model.assessmentItemID_emb.weight[idx + 1] = torch.tensor(pre_emb.wv[assessmentItemID]).to(device)

    # Best-epoch bookkeeping for this fold.
    best_epoch = 0
    best_train_loss = 0
    best_roc_auc = 0

    for epoch in range(1, epochs + 1):
        # Single-iteration tqdm loop: its only job is to show one progress-bar
        # line, with the metrics as description, per epoch.
        tbar = tqdm(range(1))
        for _ in tbar:
            train_loss = train(model = model, data_loader = train_data_loader, criterion = criterion, optimizer = optimizer)
            roc_auc = evaluate(model = model, data_loader = valid_data_loader)
            # Checkpoint whenever the validation ROC-AUC improves.
            if best_roc_auc < roc_auc:
                best_epoch = epoch
                best_train_loss = train_loss
                best_roc_auc = roc_auc
                torch.save(model.state_dict(), os.path.join(MODEL_PATH, f'oof_{oof}_' + model_name))

            tbar.set_description(f'OOF-{oof}| Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| roc_auc: {roc_auc:.5f}')
            # The learning-rate schedule advances once per epoch.
            scheduler.step()
    
    print(f'BEST OOF-{oof}| Epoch: {best_epoch:3d}| Train loss: {best_train_loss:.5f}| roc_auc: {best_roc_auc:.5f}')

    oof_roc_auc += best_roc_auc

# Mean of the per-fold best validation scores.
print(f'Total roc_auc: {oof_roc_auc / len(make_dataset.oof_user_set.keys()):.5f}')

OOF-0| Epoch:   1| Train loss: 0.64794| roc_auc: 0.73556: 100%|██████████| 1/1 [00:34<00:00, 34.71s/it]
OOF-0| Epoch:   2| Train loss: 0.50638| roc_auc: 0.81136: 100%|██████████| 1/1 [00:34<00:00, 34.70s/it]
OOF-0| Epoch:   3| Train loss: 0.48249| roc_auc: 0.81346: 100%|██████████| 1/1 [00:35<00:00, 35.65s/it]
OOF-0| Epoch:   4| Train loss: 0.47489| roc_auc: 0.82409: 100%|██████████| 1/1 [00:36<00:00, 36.16s/it]
OOF-0| Epoch:   5| Train loss: 0.46762| roc_auc: 0.82453: 100%|██████████| 1/1 [00:36<00:00, 36.03s/it]
OOF-0| Epoch:   6| Train loss: 0.46369| roc_auc: 0.82404: 100%|██████████| 1/1 [00:36<00:00, 36.25s/it]
OOF-0| Epoch:   7| Train loss: 0.46031| roc_auc: 0.82319: 100%|██████████| 1/1 [00:36<00:00, 36.91s/it]
OOF-0| Epoch:   8| Train loss: 0.45783| roc_auc: 0.82235: 100%|██████████| 1/1 [00:37<00:00, 37.05s/it]
OOF-0| Epoch:   9| Train loss: 0.45495| roc_auc: 0.82294: 100%|██████████| 1/1 [00:35<00:00, 35.94s/it]
OOF-0| Epoch:  10| Train loss: 0.45274| roc_auc: 0.82272: 100%|█

BEST OOF-0| Epoch:   5| Train loss: 0.46762| roc_auc: 0.82453


OOF-1| Epoch:   1| Train loss: 0.66757| roc_auc: 0.71522: 100%|██████████| 1/1 [00:36<00:00, 36.49s/it]
OOF-1| Epoch:   2| Train loss: 0.50761| roc_auc: 0.80224: 100%|██████████| 1/1 [00:36<00:00, 36.95s/it]
OOF-1| Epoch:   3| Train loss: 0.48298| roc_auc: 0.80575: 100%|██████████| 1/1 [00:37<00:00, 37.98s/it]
OOF-1| Epoch:   4| Train loss: 0.47490| roc_auc: 0.81309: 100%|██████████| 1/1 [00:37<00:00, 37.22s/it]
OOF-1| Epoch:   5| Train loss: 0.46849| roc_auc: 0.82045: 100%|██████████| 1/1 [00:36<00:00, 36.71s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

# 예측

In [None]:
# Predict the test set with every fold's best checkpoint and average the
# per-fold predictions into the final ensemble prediction.
test_df = make_dataset.get_test_data()
test_dataset = CustomDataset(df = test_df)
test_data_loader = DataLoader(
    test_dataset,
    batch_size = 1,
    shuffle = False,
    drop_last = False,
    collate_fn = train_make_batch,
    num_workers = num_workers)

# NOTE(review): `train_dataset` is whatever the last fold of the OOF training
# cell left behind, so this cell only works after that cell has run.
model = SASRec(
    num_assessmentItemID = make_dataset.num_assessmentItemID,
    num_testId = make_dataset.num_testId,
    num_KnowledgeTag = make_dataset.num_KnowledgeTag,
    num_cols = train_dataset.num_cols,
    cat_cols = train_dataset.cat_cols,
    emb_size = emb_size,
    hidden_units = hidden_units,
    num_heads = num_heads,
    num_layers = num_layers,
    dropout_rate = dropout_rate,
    device = device).to(device)

fold_preds = []
for oof in make_dataset.oof_user_set.keys():
    checkpoint_path = os.path.join(MODEL_PATH, f'oof_{oof}_' + model_name)
    # map_location lets checkpoints saved on a GPU be restored on a CPU-only
    # machine (and the other way round).
    model.load_state_dict(torch.load(checkpoint_path, map_location = device))
    fold_preds.append(predict(model = model, data_loader = test_data_loader))

# The next cell reads the averaged predictions under the name `pred_list`.
pred_list = np.array(fold_preds).mean(axis = 0)

In [None]:
# Assemble the submission table: the row position goes in as the leading `id`
# column, followed by the ensemble-averaged `prediction`, and the result is
# written to CSV without the DataFrame index.
submission = pd.DataFrame(data = np.array(pred_list), columns = ['prediction'])
submission.insert(0, 'id', submission.index)
submission_file = os.path.join(SUBMISSION_PATH, 'OOF-Ensemble-' + submission_name)
submission.to_csv(submission_file, index = False)