In [28]:
import math
import random
import numpy as np
import scipy.sparse as sp
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import KFold

from copy import deepcopy

from gensim.models import Word2Vec

import warnings

warnings.filterwarnings(action='ignore')
torch.set_printoptions(sci_mode=True)

In [29]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [30]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

# 데이터 전처리

In [31]:
class GNNMakeDataset():
    def __init__(self, DATA_PATH):
        self.preporcessing(DATA_PATH)
        self.oof_user_set = self.split_data()
    
    def get_oof_data(self, oof):
        val_user_list = self.oof_user_set[oof]
        group_df = self.all_df.groupby('userID')
        
        train = []
        valid = []

        for userID, df in group_df:
            if userID in val_user_list:
                trn_df = df.iloc[:-1, :]
                val_df = df.iloc[-1:, :]
                train.append(trn_df)
                valid.append(val_df)
            else:
                train.append(df)

        train = pd.concat(train).reset_index(drop = True)
        valid = pd.concat(valid).reset_index(drop = True)
        
        return train, valid
    
    def get_test_data(self):
        return self.test_df

    def split_data(self):
        user_list = self.all_df['userID'].unique().tolist()
        oof_user_set = {}
        kf = KFold(n_splits = 5, random_state = 22, shuffle = True)
        for idx, (train_user, valid_user) in enumerate(kf.split(user_list)):
            oof_user_set[idx] = valid_user.tolist()
        
        return oof_user_set

    def preporcessing(self, DATA_PATH):
        dtype = {
                'userID': 'int16',
                'answerCode': 'int8',
                'KnowledgeTag': 'int16'
        }
            
        train_df = pd.read_csv(os.path.join(DATA_PATH, 'train_data.csv'), dtype=dtype, parse_dates=['Timestamp'])
        train_df = train_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

        test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_data.csv'), dtype=dtype, parse_dates=['Timestamp'])
        test_df = test_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

        all_df = pd.concat([train_df, test_df]).sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
        all_df['userID-assessmentItemID'] = all_df['userID'].astype(str) + '-' + all_df['assessmentItemID'].astype(str)
        all_df = all_df[~(all_df.duplicated('userID-assessmentItemID', keep='last'))].reset_index(drop=True)
        
        assessmentItemID2idx = {}
        idx2assessmentItemID = {}

        for idx, assessmentItemID in enumerate(all_df['assessmentItemID'].unique().tolist()):
            assessmentItemID2idx[assessmentItemID] = idx
            idx2assessmentItemID[idx] = assessmentItemID
        
        all_df['assessmentItemID2idx'] = all_df['assessmentItemID'].apply(lambda x : assessmentItemID2idx[x])

        self.num_user, self.num_item = all_df['userID'].nunique(), len(assessmentItemID2idx)
        self.assessmentItemID2idx, self.idx2assessmentItemID = assessmentItemID2idx, idx2assessmentItemID
        self.adj_mat = self.generate_adj_matrix(self.generate_dok_matrix(all_df))
        self.all_df = all_df[all_df['answerCode'] != -1].reset_index(drop=True)
        self.test_df = all_df[all_df['answerCode'] == -1].reset_index(drop=True)

    def generate_dok_matrix(self, df):
        R = sp.dok_matrix((self.num_user, self.num_item), dtype=np.float32)
        group_df = df.groupby('userID')
        for userID, g_df in group_df:
            items = g_df['assessmentItemID2idx'].tolist()
            R[userID, items] = 1.0
        
        return R
    
    def generate_adj_matrix(self, R):
        adj_mat = sp.dok_matrix((self.num_user + self.num_item, self.num_user + self.num_item), dtype=np.float32)
        adj_mat = adj_mat.tolil() # to_list
        R = R.tolil()

        adj_mat[:self.num_user, self.num_user:] = R
        adj_mat[self.num_user:, :self.num_user] = R.T
        adj_mat = adj_mat.todok() # to_dok_matrix

        def normalized_adj_single(adj):
            rowsum = np.array(adj.sum(1))
            d_inv = np.power(rowsum, -.5).flatten()  
            d_inv[np.isinf(d_inv)] = 0.
            d_mat_inv = sp.diags(d_inv)
            norm_adj = d_mat_inv.dot(adj).dot(d_mat_inv)

            return norm_adj.tocoo()

        adj_mat = normalized_adj_single(adj_mat)
        return adj_mat.tocsr()

In [32]:
class GNNCustomDataset(Dataset):
    def __init__(self, df):

        self.users = df['userID'].tolist()
        self.items = df['assessmentItemID2idx'].tolist()
        self.targets = df['answerCode'].tolist()


    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        user = self.users[idx]
        item = self.items[idx]
        target = self.targets[idx]

        return {
            'user' : user, 
            'item' : item, 
            'target' : target,
            }

def make_collate_fn(samples):
    
    users = []
    items = []
    targets = []

    for sample in samples:
        users += [sample['user']]
        items += [sample['item']]
        targets += [sample['target']]

    return {
        'users' : torch.tensor(users, dtype = torch.long),
        'items' : torch.tensor(items, dtype = torch.long),
        'targets' : torch.tensor(targets, dtype = torch.float32),
    }

# 모델

In [33]:
class LightGCN(nn.Module):
    def __init__(self, n_users, n_items, emb_dim, n_layers, node_dropout, adj_mtx):
        super().__init__()

        # initialize Class attributes
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.n_users = n_users
        self.n_items = n_items
        self.emb_dim = emb_dim
        self.graph = self._convert_sp_mat_to_sp_tensor(adj_mtx)
        self.n_layers = n_layers
        self.node_dropout = node_dropout

        # Initialize weights
        self.weight_dict = self._init_weights()
        print("Weights initialized.")

    # initialize weights
    def _init_weights(self):
        print("Initializing weights...")
        weight_dict = nn.ParameterDict()

        initializer = torch.nn.init.xavier_uniform_
        
        weight_dict['user_embedding'] = nn.Parameter(initializer(torch.empty(self.n_users, self.emb_dim).to(self.device)))
        weight_dict['item_embedding'] = nn.Parameter(initializer(torch.empty(self.n_items, self.emb_dim).to(self.device)))
           
        return weight_dict

    # convert sparse matrix into sparse PyTorch tensor
    def _convert_sp_mat_to_sp_tensor(self, X):
        """
        Convert scipy sparse matrix to PyTorch sparse matrix

        Arguments:
        ----------
        X = Adjacency matrix, scipy sparse matrix
        """
        coo = X.tocoo().astype(np.float32)
        i = torch.LongTensor(np.mat([coo.row, coo.col]))
        v = torch.FloatTensor(coo.data)
        res = torch.sparse.FloatTensor(i, v, coo.shape).to(self.device)
        return res

    # apply node_dropout
    def _droupout_sparse(self, X):
        """
        Drop individual locations in X
        
        Arguments:
        ---------
        X = adjacency matrix (PyTorch sparse tensor)
        dropout = fraction of nodes to drop
        noise_shape = number of non non-zero entries of X
        """
        node_dropout_mask = ((self.node_dropout) + torch.rand(X._nnz())).floor().bool().to(self.device)
        i = X.coalesce().indices()
        v = X.coalesce()._values()
        i[:,node_dropout_mask] = 0
        v[node_dropout_mask] = 0
        X_dropout = torch.sparse.FloatTensor(i, v, X.shape).to(X.device)

        return  X_dropout.mul(1/(1-self.node_dropout))

    def forward(self, input):
        """
        Computes the forward pass
        
        Arguments:
        ---------
        u = user
        i = item (user interacted with item)
        """
        # apply drop-out mask
        graph = self._droupout_sparse(self.graph) if self.node_dropout > 0 else self.graph
        ego_embeddings = torch.cat([self.weight_dict['user_embedding'], self.weight_dict['item_embedding']], 0)
        final_embeddings = [ego_embeddings]

        for k in range(self.n_layers):
            ego_embeddings = torch.sparse.mm(graph, final_embeddings[k])
            final_embeddings.append(ego_embeddings)                                       

        final_embeddings = torch.stack(final_embeddings, dim=1)
        final_embeddings = torch.mean(final_embeddings, dim=1)
        
        u_final_embeddings, i_final_embeddings = final_embeddings.split([self.n_users, self.n_items], 0)

        self.u_final_embeddings = nn.Parameter(u_final_embeddings)
        self.i_final_embeddings = nn.Parameter(i_final_embeddings)
        
        u_emb = u_final_embeddings[input['users']] # user embeddings
        i_emb = i_final_embeddings[input['items']] # item embeddings
        
        output = torch.sum(torch.mul(u_emb, i_emb), dim = 1).sigmoid()

        return output
    
    def predict(self, input):
        u_emb = self.u_final_embeddings[input['users']]
        i_emb = self.i_final_embeddings[input['items']]

        output = torch.sum(torch.mul(u_emb, i_emb), dim = 1).sigmoid()

        return output

# 학습 함수

In [34]:
from sklearn.metrics import roc_auc_score

def train(model, data_loader, criterion, optimizer):
    model.train()
    loss_val = 0

    target = []
    pred = []

    for batch, input in enumerate(data_loader):

        optimizer.zero_grad()

        output = model(input)
        loss = criterion(output, input['targets'].to(device))

        loss.backward()
        optimizer.step()

        loss_val += loss.item()

        target.extend(input['targets'].cpu().numpy().tolist())
        pred.extend(output.detach().cpu().numpy().tolist())

        if batch % 100 == 0:
            print(f'{batch + 1}/{len(data_loader)} Loss : {loss_val / (batch + 1):.5f} Roc-Auc: {roc_auc_score(target, pred):.5f}')

    loss_val /= len(data_loader)
    roc_auc = roc_auc_score(target, pred)

    return loss_val, roc_auc

def evaluate(model, data_loader, criterion):
    model.eval()

    loss_val = 0

    target = []
    pred = []

    with torch.no_grad():
        for input in data_loader:
            
            output = model.predict(input)
            
            loss = criterion(output, input['targets'].to(device))
            loss_val += loss.item()

            target.extend(input['targets'].cpu().numpy().tolist())
            pred.extend(output.cpu().numpy().tolist())

    loss_val /= len(data_loader)
    roc_auc = roc_auc_score(target, pred)

    return loss_val, roc_auc


def predict(model, data_loader):
    model.eval()

    pred = []

    with torch.no_grad():
        for input in data_loader:
            output = model.predict(input)
            pred.extend(output.cpu().numpy().tolist())
    
    return pred

# 학습

In [35]:
batch_size = 5000
epochs = 5
lr = 0.001
device = 'cuda' if torch.cuda.is_available() else 'cpu'

emb_dim = 64
n_layers = 2
node_dropout = 0.2

num_workers = 8

DATA_PATH = '/opt/ml/input/data'
MODEL_PATH = '/opt/ml/model'
SUBMISSION_PATH = '/opt/ml/submission'

model_name = 'LightGCN'
submission_name = 'LightGCN.csv'

In [36]:
if not os.path.isdir(MODEL_PATH):
    os.mkdir(MODEL_PATH)

In [37]:
if not os.path.isdir(SUBMISSION_PATH):
    os.mkdir(SUBMISSION_PATH)

In [11]:
make_dataset = GNNMakeDataset(DATA_PATH = DATA_PATH)

In [38]:
oof = 0

train_df, valid_df = make_dataset.get_oof_data(oof)

In [39]:
seed_everything(22 + oof)

train_dataset = GNNCustomDataset(df = train_df)
train_data_loader = DataLoader(
    train_dataset, 
    batch_size = batch_size, 
    shuffle = True, 
    drop_last = False,
    collate_fn = make_collate_fn,
    num_workers = num_workers)


valid_dataset = GNNCustomDataset(df = valid_df)
valid_data_loader = DataLoader(
    valid_dataset,
    batch_size = batch_size,
    shuffle = False, 
    drop_last = False,
    collate_fn = make_collate_fn,
    num_workers = num_workers)

model = LightGCN(
    n_users = make_dataset.num_user,
    n_items = make_dataset.num_item,
    emb_dim = emb_dim,
    n_layers = n_layers,
    node_dropout = node_dropout,
    adj_mtx = make_dataset.adj_mat,
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr = lr)
criterion = nn.BCELoss()

Initializing weights...
Weights initialized.


In [40]:
best_epoch = 0
best_train_loss = 0
best_train_roc_auc = 0
best_valid_loss = 0
best_valid_roc_auc = 0

for epoch in range(1, epochs + 1):
    tbar = tqdm(range(1))
    for _ in tbar:
        train_loss, train_roc_auc = train(model = model, data_loader = train_data_loader, criterion = criterion, optimizer = optimizer)
        valid_loss, valid_roc_auc = evaluate(model = model, data_loader = valid_data_loader, criterion = criterion)
        if best_valid_roc_auc < valid_roc_auc:
            best_epoch = epoch
            best_train_loss = train_loss
            best_train_roc_auc = train_roc_auc
            best_valid_loss = valid_loss
            best_valid_roc_auc = valid_roc_auc
            torch.save(model.state_dict(), os.path.join(MODEL_PATH, f'oof_{oof}_' + model_name + '.pt'))

        tbar.set_description(f'OOF-{oof}| Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| Train Roc-Auc: {train_roc_auc:.5f}| Valid loss: {valid_loss:.5f}| Valid Roc-Auc: {valid_roc_auc:.5f}|')

print(f'BEST OOF-{oof}| Epoch: {best_epoch:3d}| Train loss: {best_train_loss:.5f}| Train Roc-Auc: {best_train_roc_auc:.5f}| Valid loss: {best_valid_loss:.5f}| Valid Roc-Auc: {best_valid_roc_auc:.5f}|')

  0%|          | 0/1 [00:00<?, ?it/s]

1/495 Loss : 0.69315 Roc-Auc: 0.49050
101/495 Loss : 0.68088 Roc-Auc: 0.53704
201/495 Loss : 0.64748 Roc-Auc: 0.56541
301/495 Loss : 0.62851 Roc-Auc: 0.60278
401/495 Loss : 0.61600 Roc-Auc: 0.63276


OOF-0| Epoch:   1| Train loss: 0.60748| Train Roc-Auc: 0.65418| Valid loss: 0.66664| Valid Roc-Auc: 0.72425|: 100%|██████████| 1/1 [13:01<00:00, 781.11s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

1/495 Loss : 0.55804 Roc-Auc: 0.77182
101/495 Loss : 0.56215 Roc-Auc: 0.76582
201/495 Loss : 0.55972 Roc-Auc: 0.76757
301/495 Loss : 0.55787 Roc-Auc: 0.76906
401/495 Loss : 0.55587 Roc-Auc: 0.77051


OOF-0| Epoch:   2| Train loss: 0.55443| Train Roc-Auc: 0.77150| Valid loss: 0.64266| Valid Roc-Auc: 0.73710|: 100%|██████████| 1/1 [12:59<00:00, 779.34s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

1/495 Loss : 0.53916 Roc-Auc: 0.78395
101/495 Loss : 0.54373 Roc-Auc: 0.78050
201/495 Loss : 0.54325 Roc-Auc: 0.77990
301/495 Loss : 0.54228 Roc-Auc: 0.78048
401/495 Loss : 0.54207 Roc-Auc: 0.78028


OOF-0| Epoch:   3| Train loss: 0.54151| Train Roc-Auc: 0.78032| Valid loss: 0.63223| Valid Roc-Auc: 0.73704|: 100%|██████████| 1/1 [12:59<00:00, 779.78s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

1/495 Loss : 0.54683 Roc-Auc: 0.78058
101/495 Loss : 0.53670 Roc-Auc: 0.78430
201/495 Loss : 0.53644 Roc-Auc: 0.78403
301/495 Loss : 0.53616 Roc-Auc: 0.78405
401/495 Loss : 0.53570 Roc-Auc: 0.78427


OOF-0| Epoch:   4| Train loss: 0.53533| Train Roc-Auc: 0.78449| Valid loss: 0.62659| Valid Roc-Auc: 0.73761|: 100%|██████████| 1/1 [13:02<00:00, 782.92s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

1/495 Loss : 0.53251 Roc-Auc: 0.79488
101/495 Loss : 0.53144 Roc-Auc: 0.78847
201/495 Loss : 0.53137 Roc-Auc: 0.78859
301/495 Loss : 0.53119 Roc-Auc: 0.78864
401/495 Loss : 0.53067 Roc-Auc: 0.78911


OOF-0| Epoch:   5| Train loss: 0.53049| Train Roc-Auc: 0.78923| Valid loss: 0.62267| Valid Roc-Auc: 0.73849|: 100%|██████████| 1/1 [12:58<00:00, 778.49s/it]

BEST OOF-0| Epoch:   5| Train loss: 0.53049| Train Roc-Auc: 0.78923| Valid loss: 0.62267| Valid Roc-Auc: 0.73849|



