# lightGCN

In [1]:
import os

import pandas as pd
import torch
from config import CFG, logging_conf
from lightgcn.datasets import prepare_dataset
from lightgcn.models import build, train
from lightgcn.utils import class2dict, get_logger

from lightgcn.datasets import prepare_dataset
from lightgcn.models import build, inference
from lightgcn.utils import get_logger

  from .autonotebook import tqdm as notebook_tqdm


train

In [2]:
# Build the notebook-wide logger from the project logging configuration.
logger = get_logger(logging_conf)
# Use the GPU only when one is available AND the config allows it.
use_cuda = torch.cuda.is_available() and CFG.use_cuda_if_available
device = torch.device("cuda" if use_cuda else "cpu")
# Show which device the following cells will run on.
print(device)

cuda


In [3]:
def load_data(basepath):
    """Read the train and test CSVs under `basepath` and merge them.

    Args:
        basepath: Directory that contains ``train_data.csv`` and
            ``test_data.csv``.

    Returns:
        A DataFrame with the train rows followed by the test rows, keeping
        only the last occurrence of every (userID, assessmentItemID) pair.
        Row labels are the ones read from each file (they are not reset).
    """
    frames = [
        pd.read_csv(os.path.join(basepath, filename))
        for filename in ("train_data.csv", "test_data.csv")
    ]
    # Test rows come last in the concatenation, so keep="last" lets a test row
    # win over a train row for the same (user, item) pair.
    return pd.concat(frames).drop_duplicates(
        subset=["userID", "assessmentItemID"], keep="last"
    )


def separate_data(data):
    """Split interactions by the sign of ``answerCode``.

    Args:
        data: DataFrame with an ``answerCode`` column.

    Returns:
        ``(train_data, test_data)``: rows with ``answerCode >= 0`` and rows
        with ``answerCode < 0`` respectively.
    """
    answer = data.answerCode
    labeled_rows = data[answer >= 0]
    unlabeled_rows = data[answer < 0]
    return labeled_rows, unlabeled_rows


def indexing_data(data):
    """Assign one contiguous node index to every user id and item id.

    Users are numbered ``0 .. n_user - 1`` in sorted order; items follow as
    ``n_user .. n_user + n_item - 1``, also in sorted order.

    Args:
        data: DataFrame with ``userID`` and ``assessmentItemID`` columns.

    Returns:
        dict mapping each raw id (user or item) to its node index.

    Note:
        Users and items share one dictionary. If a user id and an item id
        compare equal, the item entry overwrites the user entry.
    """
    user_ids = sorted(set(data.userID))
    item_ids = sorted(set(data.assessmentItemID))
    n_user = len(user_ids)

    id_2_index = {uid: idx for idx, uid in enumerate(user_ids)}
    # Merge with update() rather than dict(a, **b): keyword unpacking only
    # accepts string keys and raises TypeError for e.g. integer item ids.
    id_2_index.update({iid: idx + n_user for idx, iid in enumerate(item_ids)})

    return id_2_index


def process_data(data, id_2_index, device):
    """Convert an interaction table into edge and label tensors.

    Args:
        data: DataFrame with ``userID``, ``assessmentItemID`` and
            ``answerCode`` columns.
        id_2_index: dict mapping raw user/item ids to node indices.
        device: Device the resulting tensors are moved to.

    Returns:
        dict with
          * ``edge``: LongTensor of shape ``(2, n_rows)``; row 0 holds the user
            node index and row 1 the item node index of every interaction.
          * ``label``: LongTensor of shape ``(n_rows,)`` with the answer codes.

    Raises:
        KeyError: if an id in ``data`` is missing from ``id_2_index``.
    """
    edge = [
        [id_2_index[user], id_2_index[item]]
        for user, item in zip(data.userID, data.assessmentItemID)
    ]
    label = list(data.answerCode)

    # reshape(-1, 2) keeps the (2, n) layout even when there are no rows;
    # without it an empty split would yield a 1-D tensor of shape (0,).
    edge = torch.LongTensor(edge).reshape(-1, 2).T
    label = torch.LongTensor(label)

    return dict(edge=edge.to(device), label=label.to(device))


In [4]:
def prepare_dataset(device, basepath, verbose=True, logger=None):
    """Load the raw CSVs and turn them into graph tensors.

    This notebook-level definition replaces the ``prepare_dataset`` imported
    from ``lightgcn.datasets`` and additionally returns the id -> index map.

    Args:
        device: Device the edge/label tensors are placed on.
        basepath: Directory containing ``train_data.csv`` and ``test_data.csv``.
        verbose: Currently unused (the statistics printing is disabled).
        logger: Currently unused (the statistics printing is disabled).

    Returns:
        ``(train_data_proc, test_data_proc, n_node, id2index)`` where the first
        two are the dicts produced by ``process_data``, ``n_node`` is the number
        of entries in ``id2index`` and ``id2index`` maps raw ids to node indices.
    """
    full_data = load_data(basepath)
    # The index is built on train + test together so both splits share node ids.
    id2index = indexing_data(full_data)
    labeled_rows, unlabeled_rows = separate_data(full_data)

    train_data_proc = process_data(labeled_rows, id2index, device)
    test_data_proc = process_data(unlabeled_rows, id2index, device)

    # Per-split statistics printing (print_data_stat) is intentionally left out
    # here, which is why `verbose` and `logger` are not read.
    return train_data_proc, test_data_proc, len(id2index), id2index

In [5]:
# Prepare the train/test graph tensors.
# NOTE: despite its name, `edge_index` receives the fourth return value of the
# notebook-level prepare_dataset, i.e. the dict mapping raw user/item ids to
# node indices. The edge tensors live in train_data["edge"] / test_data["edge"].
train_data, test_data, n_node, edge_index = prepare_dataset(
    device, CFG.basepath, verbose=CFG.loader_verbose, logger=logger.getChild("data")
)

In [6]:
# model build
model = build(
    n_node,
    embedding_dim=CFG.embedding_dim,
    num_layers=CFG.num_layers,
    alpha=CFG.alpha,
    logger=logger.getChild("build"),
    **CFG.build_kwargs
)
model.to(device)

2022-11-28 07:26:38,826 - build - INFO - No load model


LightGCN(16896, 64, num_layers=1)

In [7]:
# model train
train(
    model,
    train_data,
    n_epoch=CFG.n_epoch,
    learning_rate=CFG.learning_rate,
    use_wandb=False,
    weight=CFG.weight_basepath,
    logger=logger.getChild("train"),
)
logger.info("Task Complete")

2022-11-28 07:26:39,109 - train - INFO - Training Started : n_epoch=20
2022-11-28 07:26:39,148 - train - INFO -  * In epoch 0001, loss=0.693, acc=0.528, AUC=0.526
2022-11-28 07:26:39,149 - train - INFO -  * In epoch 0001, loss=0.693, acc=0.528, AUC=0.526, Best AUC
2022-11-28 07:26:39,207 - train - INFO -  * In epoch 0002, loss=0.693, acc=0.538, AUC=0.546
2022-11-28 07:26:39,209 - train - INFO -  * In epoch 0002, loss=0.693, acc=0.538, AUC=0.546, Best AUC
2022-11-28 07:26:39,266 - train - INFO -  * In epoch 0003, loss=0.693, acc=0.549, AUC=0.567
2022-11-28 07:26:39,268 - train - INFO -  * In epoch 0003, loss=0.693, acc=0.549, AUC=0.567, Best AUC
2022-11-28 07:26:39,325 - train - INFO -  * In epoch 0004, loss=0.693, acc=0.560, AUC=0.588
2022-11-28 07:26:39,327 - train - INFO -  * In epoch 0004, loss=0.693, acc=0.560, AUC=0.588, Best AUC
2022-11-28 07:26:39,382 - train - INFO -  * In epoch 0005, loss=0.693, acc=0.586, AUC=0.609
2022-11-28 07:26:39,384 - train - INFO -  * In epoch 0005, lo

inference

In [8]:
# Re-create the logger and device so the inference section does not depend on
# the variables set up in the training section.
logger = get_logger(logging_conf)
use_cuda = torch.cuda.is_available() and CFG.use_cuda_if_available
device = torch.device("cuda" if use_cuda else "cpu")

# exist_ok=True makes this safe to re-run and avoids the race between an
# explicit existence check and the directory creation.
os.makedirs(CFG.output_dir, exist_ok=True)

In [9]:
pred = inference(model, test_data, logger=logger.getChild("infer"))

In [10]:
pred = pred.detach().cpu().numpy()
pd.DataFrame({"prediction": pred}).to_csv(
        os.path.join(CFG.output_dir, "plus_test_submission.csv"), index_label="id"
)

### lightGCN 임베딩 추출

In [11]:
n_node

16896

In [12]:
model.embedding.weight.shape

torch.Size([16896, 64])

In [13]:
model.embedding.weight

Parameter containing:
tensor([[ 0.0115, -0.0279,  0.0204,  ...,  0.0023, -0.0060, -0.0223],
        [-0.0256, -0.0012,  0.0083,  ..., -0.0217,  0.0143,  0.0082],
        [ 0.0099,  0.0029,  0.0068,  ...,  0.0236,  0.0211, -0.0271],
        ...,
        [-0.0225, -0.0119,  0.0070,  ...,  0.0156,  0.0111, -0.0194],
        [ 0.0111,  0.0186,  0.0226,  ..., -0.0269, -0.0027,  0.0142],
        [-0.0164, -0.0097, -0.0248,  ..., -0.0152, -0.0031, -0.0119]],
       device='cuda:0', requires_grad=True)

In [14]:
embed = model.get_embedding(train_data['edge'])

In [15]:
# `edge_index` is the raw-id -> node-index dict returned by prepare_dataset,
# so each lookup below selects one row of the embedding matrix.
# presumably 0 is a userID and 'A060001001' an assessmentItemID -- TODO confirm
p1 = embed[edge_index[0]]
p2 = embed[edge_index['A060001001']]
print(p1)

tensor([ 0.0057, -0.0139,  0.0102, -0.0062,  0.0080,  0.0005, -0.0093, -0.0041,
         0.0051,  0.0141, -0.0069,  0.0071, -0.0016,  0.0124,  0.0006, -0.0137,
        -0.0029, -0.0056, -0.0054, -0.0099, -0.0043,  0.0116,  0.0059, -0.0152,
         0.0043, -0.0113,  0.0148, -0.0084, -0.0094,  0.0088,  0.0040,  0.0085,
        -0.0016, -0.0037,  0.0005, -0.0165, -0.0147, -0.0028,  0.0050,  0.0029,
        -0.0117, -0.0131, -0.0002,  0.0041,  0.0038, -0.0166, -0.0069, -0.0012,
         0.0115, -0.0040, -0.0002,  0.0011,  0.0022,  0.0125,  0.0034,  0.0015,
        -0.0017, -0.0129, -0.0115, -0.0134,  0.0015,  0.0012, -0.0030, -0.0112],
       device='cuda:0', grad_fn=<SelectBackward0>)


# Riiid

코드 : 5강 실습 코드

In [19]:
import os
import pandas as pd

# RiiiD 데이터셋 path 설정
RIIID_PATH = "/opt/ml/input/data/"

# 데이터셋 불러오기
train_df = pd.read_csv(os.path.join(RIIID_PATH, 'train_data.csv'))
test_df = pd.read_csv(os.path.join(RIIID_PATH, 'test_data.csv'))
submission_df = pd.read_csv(os.path.join(RIIID_PATH, 'sample_submission.csv'))

In [None]:
# 학습 과정에서 학습 샘플을 생성하기 위해서 필요한 유저별 row_ids를 저장
question_row_ids_by_user_id = train_df.groupby('userID').apply(lambda x: x.index.tolist())
question_row_ids_by_user_id.reset_index().head()

In [None]:
# 학습 과정에서 학습 샘플을 생성하기 위해서 필요한 유저별 시작 row_id를 저장
start_row_id_by_user_id = train_df.groupby('userID').apply(lambda x: x.index[0])
start_row_id_by_user_id.reset_index().head()

In [None]:
# feature 추가
train_df['big_category'] = train_df.testId.map(lambda x:x[2]).astype(int)
train_df['mid_category'] = train_df.testId.map(lambda x: int(x[-3:]))
train_df['problem_num'] = train_df.assessmentItemID.map(lambda x: int(x[-3:]))

# 데이터 타입 변경
train_df['KnowledgeTag'] = train_df['KnowledgeTag'].astype(str)
train_df['big_category'] = train_df['big_category'].astype(str)
train_df['mid_category'] = train_df['mid_category'].astype(str)
train_df['problem_num'] = train_df['problem_num'].astype(str)

In [None]:
# Build one shared, contiguous id space for every categorical column so that a
# single embedding table can serve all of them.
cate2id_dict = {}

# Ids start at 1 because 0 is reserved for padding: the dataset pads short
# sequences with zeros and the category embedding is built with padding_idx=0.
# Starting at 0 would give the first assessmentItemID the padding id.
# After the loop `offset` equals the required embedding table size
# (number of distinct category values + 1 for the padding id).
offset = 1

for _mapping_name, _column in [
    ('Item2id', 'assessmentItemID'),
    ('testId2id', 'testId'),
    ('KnowledgeTag2id', 'KnowledgeTag'),
    ('big_category2id', 'big_category'),
    ('mid_category2id', 'mid_category'),
    ('problem_num2id', 'problem_num'),
]:
    # Ids of this column continue where the previous column stopped.
    _mapping = {v: i + offset for i, v in enumerate(train_df[_column].unique())}
    cate2id_dict[_mapping_name] = _mapping
    offset += len(_mapping)

# Keep the individual names used by the mapping cell that follows.
Item2id = cate2id_dict['Item2id']
testId2id = cate2id_dict['testId2id']
KnowledgeTag2id = cate2id_dict['KnowledgeTag2id']
big_category2id = cate2id_dict['big_category2id']
mid_category2id = cate2id_dict['mid_category2id']
problem_num2id = cate2id_dict['problem_num2id']


In [None]:
import numpy as np

# Replace every raw categorical value with its integer id.
# NOTE: this overwrites the columns in place, so the cell is not idempotent:
# running it a second time maps the already-converted ids through dictionaries
# keyed by the raw values and produces NaN for values that are no longer found.
train_df['assessmentItemID'] = train_df['assessmentItemID'].map(Item2id)
train_df['testId'] = train_df['testId'].map(testId2id)
train_df['KnowledgeTag'] = train_df['KnowledgeTag'].map(KnowledgeTag2id)
train_df['big_category'] = train_df['big_category'].map(big_category2id)
train_df['mid_category'] = train_df['mid_category'].map(mid_category2id)
train_df['problem_num'] = train_df['problem_num'].map(problem_num2id)

In [None]:
# Timestamp 변경하기

from datetime import datetime
import time

def convert_time(s):
    """Convert a ``"%Y-%m-%d %H:%M:%S"`` string to integer epoch seconds.

    The string is interpreted in the machine's local timezone (``time.mktime``),
    so the absolute value depends on where the notebook runs.

    Args:
        s: Timestamp string such as ``"2020-03-24 00:17:11"``.

    Returns:
        int: seconds since the epoch.

    Raises:
        ValueError: if ``s`` does not match the expected format.
    """
    parsed = datetime.strptime(s, "%Y-%m-%d %H:%M:%S")
    epoch_seconds = time.mktime(parsed.timetuple())
    return int(epoch_seconds)

train_df["Timestamp"] = train_df["Timestamp"].apply(convert_time)

In [None]:
# Continuous feature columns. Only answerCode is active; Timestamp is disabled.
cont_cols = [ #'Timestamp',
             'answerCode']

# Categorical feature columns, all already mapped to ids in the shared id space.
cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag', 'big_category', 'mid_category', 'problem_num'] 

# int16 keeps the frame small, but it can only hold values up to 32767: the
# total number of category ids (`offset`) must stay below that limit.
train_df[cate_cols] = train_df[cate_cols].astype(np.int16)
# train_df[cont_cols] = train_df[cont_cols].astype(np.float32)  

In [None]:
print(f"훈련 데이터셋 shape : {train_df.shape}")
print(f"category 값들의 총 갯수 : {offset}")
print(f"category feature들의 column 이름 : {cate_cols}")
print(f"continuous feature들의 column 이름 : {cont_cols}")

In [None]:
print(f"category feature들의 index : {cate2id_dict}")

In [None]:
print(f"train셋 sequence 데이터들의 indices : {question_row_ids_by_user_id}\n")
print(f"train셋 각 sequence 데이터들의 첫 row의 index : {start_row_id_by_user_id}")

In [None]:
class CFG_T:
    """Hyper-parameters for the transformer walkthrough in this notebook.

    Further attributes (vocabulary size, column lists, row-id lists,
    n_rows_per_step) are attached to this class by later cells.
    """
    # seed and device are declared here but not read by the cells shown below.
    seed=7
    device='cpu'

    # Mini-batch size passed to the DataLoader.
    batch_size=16

    # Dropout probability used by the BERT encoder config and the output head.
    dropout=0.2
    # Width of each categorical embedding vector.
    emb_size=100
    # Hidden width of the projections, the encoder and the output head.
    hidden_size=128
    # Number of encoder layers and attention heads.
    nlayers=2
    nheads=8
  
    # Length of every (left-padded) input sequence produced by the dataset.
    seq_len=32
    # Output width of the final linear layer.
    target_size=1

In [None]:
CFG_T.total_cate_size = offset
CFG_T.cate_cols = cate_cols
CFG_T.cont_cols = cont_cols
CFG_T.start_row_id_by_user_id = start_row_id_by_user_id

CFG_T.cate_vocab_size = offset

CFG_T.cate_col_size = len(cate_cols)
CFG_T.cont_col_size = len(cont_cols)

In [None]:
# 이 sequence들의 index들을 그대로 사용하지 않는다
question_row_ids_by_user_id

In [None]:
train_user_id_row_id_list = [(user_id, row_id)
                             for user_id, row_ids in question_row_ids_by_user_id.items()
                             for row_id in row_ids]
train_user_id_row_id_list[:10]

In [None]:
# row가 꽤나 늘어났음을 알 수 있다.
len(train_user_id_row_id_list)

In [None]:
# configuration에 등록!
CFG_T.train_user_id_row_id_list = train_user_id_row_id_list

In [None]:
train_df[cate_cols]

In [None]:
train_df[cont_cols]

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class RiiidDataset(Dataset):
    """Sequence dataset that yields one left-padded history window per row.

    Every sample corresponds to one ``(user_id, row_id)`` pair from
    ``cfg.train_user_id_row_id_list``. The window ends at ``row_id`` (inclusive)
    and reaches back at most ``max_seq_len`` rows, never crossing the first row
    of that user.

    Row ids are used as positional offsets into ``df``'s values, so this
    assumes ``df`` has a default 0..n-1 index and that each user's rows are
    contiguous -- verify against the code that builds the row-id lists.

    Args:
        df: DataFrame holding the feature columns.
        cfg: Object exposing ``train_user_id_row_id_list``,
            ``start_row_id_by_user_id``, ``cate_cols`` and ``cont_cols``.
        max_seq_len: Length of every returned sequence.
        max_content_len: Stored for compatibility; not used by this class.
    """

    def __init__(self, df, cfg, max_seq_len=100, max_content_len=1000):
        self.max_seq_len = max_seq_len
        self.max_content_len = max_content_len

        self.user_id_row_id_list = cfg.train_user_id_row_id_list
        self.start_row_id_by_user_id = cfg.start_row_id_by_user_id

        self.cate_cols = cfg.cate_cols
        self.cont_cols = cfg.cont_cols

        # Plain numpy views of the feature columns for fast row slicing.
        self.cate_features = df[self.cate_cols].values
        self.cont_features = df[self.cont_cols].values

    def __getitem__(self, idx):
        """Return ``(cate_feature, cont_feature, mask, target)`` for sample ``idx``.

        * ``cate_feature``: long tensor ``(max_seq_len, len(cate_cols))``
        * ``cont_feature``: float tensor ``(max_seq_len, len(cont_cols))``
        * ``mask``: int16 tensor ``(max_seq_len,)``, 1 on real steps, 0 on padding
        * ``target``: float tensor ``(1,)``, the last continuous column of the
          final step (the answer to predict)
        """
        user_id, end_row_id = self.user_id_row_id_list[idx]
        end_row_id += 1  # make the end exclusive so the current row is included

        # Clamp the window start to the user's first row (lower bound).
        start_row_id = self.start_row_id_by_user_id[user_id]
        start_row_id = max(end_row_id - self.max_seq_len, start_row_id)
        seq_len = end_row_id - start_row_id

        # Zero-initialised outputs; zeros on the left act as padding.
        cate_feature = torch.zeros(self.max_seq_len, len(self.cate_cols), dtype=torch.long)
        cont_feature = torch.zeros(self.max_seq_len, len(self.cont_cols), dtype=torch.float)
        mask = torch.zeros(self.max_seq_len, dtype=torch.int16)

        # Fill the right-most seq_len steps. Converting straight to the target
        # dtype avoids the int16 / float16 round trip, which would overflow
        # large ids and destroy large continuous values.
        cate_feature[-seq_len:] = torch.as_tensor(
            self.cate_features[start_row_id:end_row_id], dtype=torch.long
        )
        cont_feature[-seq_len:] = torch.as_tensor(
            self.cont_features[start_row_id:end_row_id], dtype=torch.float
        )
        mask[-seq_len:] = 1

        # The answer of the final step sits in the last continuous column.
        # Copy it out first, then blank it in the features so the model never
        # sees the value it is asked to predict (target leakage).
        target = cont_feature[-1, -1:].clone()
        cont_feature[-1, -1] = 0

        return cate_feature, cont_feature, mask, target

    def __len__(self):
        """Number of (user, row) samples."""
        return len(self.user_id_row_id_list)

In [None]:
train_db = RiiidDataset(train_df, CFG_T, max_seq_len=CFG_T.seq_len)
train_loader = DataLoader(train_db, batch_size=CFG_T.batch_size, shuffle=True,
                          drop_last=False, pin_memory=True)    

In [None]:
# sequence 데이터 하나의 shape을 살펴보자
for cate_x, cont_x, mask, target in train_db:
    print(f"category size : {cate_x.size()}")
    print(f"continous size : {cont_x.size()}")
    print(f"mask size : {mask.size()}")
    print(f"target size : {target.size()}")
    break

In [None]:
# 배치 단위로 주어지는 데이터를 살펴보자
for cate_x, cont_x, mask, target in train_loader:
    print(f"category size : {cate_x.size()}")
    print(f"continous size : {cont_x.size()}")
    print(f"mask size : {mask.size()}")
    print(f"target size : {target.size()}")
    break

### 📗 Transformer Input / Output 구현
> transformer에 입력시킬 input을 구현하고 transformer를 거친 output을 우리가 원하는 최종 출력값으로 바꾼다.

In [None]:
import torch.nn as nn

# 입력값
for cate_x, cont_x, mask, target in train_loader:
    print(f"category size : {cate_x.size()}")
    print(f"continous size : {cont_x.size()}")
    print(f"mask size : {mask.size()}\n")
    break

#### 🟡 Category Embedding
> 범주형 feature를 임베딩하는 과정을 살펴보자!

In [None]:
cate_x.size()

In [None]:
# 임베딩 크기
CFG_T.emb_size

In [None]:
batch_size = cate_x.size(0)

# 범주형 하나당 100개로 임베딩된다!
# [16, 32, 6] -> [16, 32, 6, 100]
cate_emb = nn.Embedding(CFG_T.total_cate_size, CFG_T.emb_size, padding_idx=0)
cate_embed_x = cate_emb(cate_x)

cate_embed_x.size()

In [None]:
# sequence 길이를 몇 배 줄일 것인지
# 메모리 절약의 의도가 있다
CFG_T.n_rows_per_step = 2

In [None]:
cate_embed_normal_x = cate_embed_x.view(batch_size, CFG_T.seq_len, -1)
cate_embed_normal_x.size()

In [None]:
half_seq_len = cate_x.size(1) // CFG_T.n_rows_per_step

# transformer input은 3차원이고 마지막 차원은 hidden 값이다.
# sequence의 각 위치에 카테고리별로 임베딩되어있는 것을 하나로 합치자!
# [16, 32, 6, 100] -> [16, 16, 1200]
cate_embed_x = cate_embed_x.view(batch_size, half_seq_len, -1)
cate_embed_x.size()

In [None]:
# Project the flattened category embeddings to the model width.
# NOTE: the projection output is the full CFG_T.hidden_size, not half of it.
# The continuous branch is projected to hidden_size as well, the two are
# concatenated to 2 * hidden_size and reduced back to hidden_size afterwards.
# [16, 16, 1200] -> [16, 16, 128]
cate_proj = nn.Sequential(nn.Linear(CFG_T.emb_size * CFG_T.cate_col_size * CFG_T.n_rows_per_step, CFG_T.hidden_size),
                          nn.LayerNorm(CFG_T.hidden_size))     
cate_embed_x = cate_proj(cate_embed_x)
cate_embed_x.size()

#### 🟡 Continuous Embedding
> 수치형 feature를 임베딩하는 과정을 살펴보자!

In [None]:
cont_x.size()

In [None]:
cont_bn = nn.BatchNorm1d(CFG_T.cont_col_size)

# batchnorm 1d 적용
cont_bn_x = cont_bn(cont_x.view(-1, cont_x.size(-1)))
cont_bn_x.size()

In [None]:
# batchnorm 적용 이후 원래 사이즈 복구
cont_bn_x = cont_bn_x.view(batch_size, -1, cont_x.size(-1))
cont_bn_x.size()

In [None]:
# cate에서 사용한 half_seq_len 그대로 사용
cont_bn_x = cont_bn_x.view(batch_size, half_seq_len, -1)
cont_bn_x.size()

In [None]:
# 범주형과는 다르게 embedding없이 바로 projection을 통해 원하는 사이즈로 줄인다
# 여기서는 embedding이라고 부른다
cont_emb = nn.Sequential(nn.Linear(CFG_T.cont_col_size * CFG_T.n_rows_per_step, CFG_T.hidden_size),
                         nn.LayerNorm(CFG_T.hidden_size))
cont_embed_x = cont_emb(cont_bn_x)
cont_embed_x.size()

#### 🟡 범주형 / 수치형 embedding tensor concat
> Transformer에 입력값으로 주려면 범주형 / 수치형으로 embedding된 2개의 tensor를 하나로 합쳐야 한다. 이를 통해 우리는 많은 feature들이 포함된 데이터를 성공적으로 하나의 입력값으로 만들 수 있다!

In [None]:
cate_embed_x.size(), cont_embed_x.size()

In [None]:
seq_emb = torch.cat([cate_embed_x, cont_embed_x], 2)
seq_emb.size()

In [None]:
comb_proj = nn.Sequential(nn.ReLU(),
                          nn.Linear(CFG_T.hidden_size*2, CFG_T.hidden_size),
                          nn.LayerNorm(CFG_T.hidden_size))

# concat한 sequence를 projection을 통해 원하는 사이즈로 변환한다
# 여기서는 embedding이라고 부른다
# [16, 16, 256] -> [16, 16, 128]
seq_emb = comb_proj(seq_emb)
seq_emb.size()

#### 🟡 Encoder
> 이제 완성된 입력값을 모델에 넣어보자!

In [None]:
# !pip install -q transformers

In [None]:
# Try the legacy module path first and fall back to the current one.
# Only ImportError is caught: a bare `except` would also hide unrelated
# failures (and KeyboardInterrupt) raised while importing transformers.
try:
    from transformers.modeling_bert import BertConfig, BertEncoder, BertModel
except ImportError:
    from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel

# The first positional argument is not used here: the inputs fed to the encoder
# are the already-embedded sequences built in the cells above.
config = BertConfig(3, # not used
                    hidden_size=CFG_T.hidden_size,
                    num_hidden_layers=CFG_T.nlayers,
                    num_attention_heads=CFG_T.nheads,
                    intermediate_size=CFG_T.hidden_size,
                    hidden_dropout_prob=CFG_T.dropout,
                    attention_probs_dropout_prob=CFG_T.dropout)

encoder = BertEncoder(config)

In [None]:
# Bert Encoder를 거친 tensor의 크기는 동일하게 나온다
# [16, 16, 128] -> [16, 16, 128]
encoded_layers = encoder(seq_emb)
sequence_output = encoded_layers[-1]
sequence_output.size()

In [None]:
# 우리가 필요한건 Bert의 마지막 query다
# [16, 16, 128] -> [16, 128]
sequence_output = sequence_output[:, -1]
sequence_output.size()

#### 🟡 분류 단계
> 이제 우리는 최종 분류를 해야한다! 이걸 위해서 우리는 출력의 크기를 클래스 숫자인 1로 변환한다!

In [None]:
def get_reg():
    """Build the output head that maps an encoder state to the prediction.

    Returns:
        nn.Sequential: Linear -> LayerNorm -> Dropout -> ReLU -> Linear, taking
        ``CFG_T.hidden_size`` features and emitting ``CFG_T.target_size`` values.
    """
    hidden = CFG_T.hidden_size
    layers = [
        nn.Linear(hidden, hidden),
        nn.LayerNorm(hidden),
        nn.Dropout(CFG_T.dropout),
        nn.ReLU(),
        nn.Linear(hidden, CFG_T.target_size),
    ]
    return nn.Sequential(*layers)

reg_layer = get_reg()

In [None]:
# 😍 우리는 원하는 결과값을 얻었다 😍
# [16, 128] -> [16, 1]
pred_y = reg_layer(sequence_output)
pred_y.size()