# lightGCN

In [2]:
import os

import pandas as pd
import torch
from config import CFG, logging_conf
from lightgcn.datasets import prepare_dataset
from lightgcn.models import build, train
from lightgcn.utils import class2dict, get_logger

from lightgcn.datasets import prepare_dataset
from lightgcn.models import build, inference
from lightgcn.utils import get_logger

train

In [3]:
logger = get_logger(logging_conf)
use_cuda = torch.cuda.is_available() and CFG.use_cuda_if_available
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

cuda


In [4]:
def load_data(basepath):
    path1 = os.path.join(basepath, "train_data.csv")
    path2 = os.path.join(basepath, "test_data.csv")
    data1 = pd.read_csv(path1)
    data2 = pd.read_csv(path2)

    data = pd.concat([data1, data2])
    data.drop_duplicates(
        subset=["userID", "assessmentItemID"], keep="last", inplace=True
    )

    return data


def separate_data(data):
    train_data = data[data.answerCode >= 0]
    test_data = data[data.answerCode < 0]

    return train_data, test_data


def indexing_data(data):
    userid, itemid = (
        sorted(list(set(data.userID))),
        sorted(list(set(data.assessmentItemID))),
    )
    n_user, n_item = len(userid), len(itemid)

    userid_2_index = {v: i for i, v in enumerate(userid)}
    itemid_2_index = {v: i + n_user for i, v in enumerate(itemid)}
    id_2_index = dict(userid_2_index, **itemid_2_index)

    return id_2_index


def process_data(data, id_2_index, device):
    edge, label = [], []
    for user, item, acode in zip(data.userID, data.assessmentItemID, data.answerCode):
        uid, iid = id_2_index[user], id_2_index[item]
        edge.append([uid, iid])
        label.append(acode)

    edge = torch.LongTensor(edge).T
    label = torch.LongTensor(label)

    return dict(edge=edge.to(device), label=label.to(device))


In [5]:
def prepare_dataset(device, basepath, verbose=True, logger=None):
    data = load_data(basepath)
    train_data, test_data = separate_data(data)
    id2index = indexing_data(data)
    train_data_proc = process_data(train_data, id2index, device)
    test_data_proc = process_data(test_data, id2index, device)

    # if verbose:
    #     print_data_stat(train_data, "Train", logger=logger)
    #     print_data_stat(test_data, "Test", logger=logger)

    return train_data_proc, test_data_proc, len(id2index), id2index 

In [6]:
# data prepare
train_data, test_data, n_node, edge_index = prepare_dataset(
    device, CFG.basepath, verbose=CFG.loader_verbose, logger=logger.getChild("data")
)

In [7]:
# model build
model = build(
    n_node,
    embedding_dim=CFG.embedding_dim,
    num_layers=CFG.num_layers,
    alpha=CFG.alpha,
    logger=logger.getChild("build"),
    **CFG.build_kwargs
)
model.to(device)

2022-11-30 17:05:28,419 - build - INFO - No load model


LightGCN(16896, 64, num_layers=1)

In [8]:
# model train
train(
    model,
    train_data,
    n_epoch=CFG.n_epoch,
    learning_rate=CFG.learning_rate,
    use_wandb=False,
    weight=CFG.weight_basepath,
    logger=logger.getChild("train"),
)
logger.info("Task Complete")

2022-11-30 17:05:28,746 - train - INFO - Training Started : n_epoch=20
2022-11-30 17:05:28,785 - train - INFO -  * In epoch 0001, loss=0.693, acc=0.590, AUC=0.610
2022-11-30 17:05:28,788 - train - INFO -  * In epoch 0001, loss=0.693, acc=0.590, AUC=0.610, Best AUC
2022-11-30 17:05:28,858 - train - INFO -  * In epoch 0002, loss=0.693, acc=0.786, AUC=0.835
2022-11-30 17:05:28,860 - train - INFO -  * In epoch 0002, loss=0.693, acc=0.786, AUC=0.835, Best AUC
2022-11-30 17:05:28,922 - train - INFO -  * In epoch 0003, loss=0.685, acc=0.783, AUC=0.827
2022-11-30 17:05:28,958 - train - INFO -  * In epoch 0004, loss=0.659, acc=0.774, AUC=0.817
2022-11-30 17:05:28,992 - train - INFO -  * In epoch 0005, loss=0.616, acc=0.775, AUC=0.812
2022-11-30 17:05:29,027 - train - INFO -  * In epoch 0006, loss=0.563, acc=0.777, AUC=0.814
2022-11-30 17:05:29,062 - train - INFO -  * In epoch 0007, loss=0.516, acc=0.785, AUC=0.824
2022-11-30 17:05:29,096 - train - INFO -  * In epoch 0008, loss=0.487, acc=0.790,

inference

In [9]:
if not os.path.exists(CFG.output_dir):
    os.makedirs(CFG.output_dir)

In [10]:
pred = inference(model, test_data, logger=logger.getChild("infer"))

In [11]:
pred = pred.detach().cpu().numpy()
pd.DataFrame({"prediction": pred}).to_csv(
        os.path.join(CFG.output_dir, "plus_test_submission.csv"), index_label="id"
)

### lightGCN 임베딩 추출

In [12]:
n_node

16896

In [13]:
model.embedding.weight.shape

torch.Size([16896, 64])

In [14]:
model.embedding.weight

Parameter containing:
tensor([[ 1.7002, -0.8157, -1.2023,  ...,  0.2400, -1.0829,  0.0275],
        [-1.3248, -0.8314, -1.2569,  ...,  1.0755,  1.0706, -1.2358],
        [-0.9595,  1.2628, -1.1110,  ...,  1.5792,  1.1490, -1.5240],
        ...,
        [ 0.0828, -1.1036, -0.8915,  ..., -0.0332, -1.6060, -0.2426],
        [ 0.9188, -0.6512, -0.8878,  ...,  0.0920,  1.0224, -1.4174],
        [-0.2551, -1.1963, -0.4450,  ...,  0.4867,  0.5178, -0.2477]],
       device='cuda:0', requires_grad=True)

In [15]:
embed = model.get_embedding(train_data['edge'])

In [16]:
embed.to(device)

tensor([[ 0.8501, -0.4079, -0.6012,  ...,  0.1200, -0.5414,  0.0137],
        [-0.6624, -0.4157, -0.6285,  ...,  0.5377,  0.5353, -0.6179],
        [-0.4798,  0.6314, -0.5555,  ...,  0.7896,  0.5745, -0.7620],
        ...,
        [ 0.0414, -0.5518, -0.4457,  ..., -0.0166, -0.8030, -0.1213],
        [ 0.4594, -0.3256, -0.4439,  ...,  0.0460,  0.5112, -0.7087],
        [-0.1275, -0.5982, -0.2225,  ...,  0.2433,  0.2589, -0.1238]],
       device='cuda:0', grad_fn=<AddBackward0>)

In [17]:
# totransdim = torch.nn.Linear(CFG.embedding_dim, 21).to(device)

In [18]:
# embed = totransdim(embed)

In [19]:
embed.shape

torch.Size([16896, 64])

In [20]:
p1 = embed[edge_index[0]]
p2 = embed[edge_index['A060001001']]
print(p1)

tensor([ 0.8501, -0.4079, -0.6012, -0.2696, -0.6387, -0.7295, -0.6195,  0.3441,
         0.2355, -0.6734, -0.1753,  0.1448, -0.5484, -0.3062, -0.5339,  0.2712,
        -0.3449, -0.6063,  0.4049, -0.0322, -0.0498, -0.1335,  0.4554, -0.0204,
         0.1977, -0.5576, -0.5067,  0.1893,  0.3777, -0.0016, -0.0326,  0.0878,
         0.0398,  0.4402,  0.0146, -0.4201,  0.1500, -0.0623, -0.5821,  0.4405,
        -0.5015, -0.1626,  0.7630, -0.0102,  0.6478,  0.5081,  0.5665, -0.5410,
         0.1704,  0.2420, -0.2196,  0.3652, -0.0361,  0.1473, -0.3683,  0.1398,
         0.0205,  0.7804, -0.0724, -0.1828,  0.9545,  0.1200, -0.5414,  0.0137],
       device='cuda:0', grad_fn=<SelectBackward0>)


# Riiid

코드 : 5강 실습 코드

In [21]:
import os
import pandas as pd

# RiiiD 데이터셋 path 설정
RIIID_PATH = "/opt/ml/input/data/"

# 데이터셋 불러오기
train_df = pd.read_csv(os.path.join(RIIID_PATH, 'train_data.csv'))
test_df = pd.read_csv(os.path.join(RIIID_PATH, 'test_data.csv'))
submission_df = pd.read_csv(os.path.join(RIIID_PATH, 'sample_submission.csv'))

In [22]:
# 학습 과정에서 학습 샘플을 생성하기 위해서 필요한 유저별 row_ids를 저장
question_row_ids_by_user_id = train_df.groupby('userID').apply(lambda x: x.index.tolist())
question_row_ids_by_user_id.reset_index().head()

Unnamed: 0,userID,0
0,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,1,"[745, 746, 747, 748, 749, 750, 751, 752, 753, ..."
2,2,"[1678, 1679, 1680, 1681, 1682, 1683, 1684, 168..."
3,5,"[1954, 1955, 1956, 1957, 1958, 1959, 1960, 196..."
4,6,"[2787, 2788, 2789, 2790, 2791, 2792, 2793, 279..."


In [23]:
# 학습 과정에서 학습 샘플을 생성하기 위해서 필요한 유저별 시작 row_id를 저장
start_row_id_by_user_id = train_df.groupby('userID').apply(lambda x: x.index[0])
start_row_id_by_user_id.reset_index().head()

Unnamed: 0,userID,0
0,0,0
1,1,745
2,2,1678
3,5,1954
4,6,2787


In [24]:
# feature 추가
train_df['big_category'] = train_df.testId.map(lambda x:x[2]).astype(int)
train_df['mid_category'] = train_df.testId.map(lambda x: int(x[-3:]))
train_df['problem_num'] = train_df.assessmentItemID.map(lambda x: int(x[-3:]))

# 데이터 타입 변경
train_df['KnowledgeTag'] = train_df['KnowledgeTag'].astype(str)
train_df['big_category'] = train_df['big_category'].astype(str)
train_df['mid_category'] = train_df['mid_category'].astype(str)
train_df['problem_num'] = train_df['problem_num'].astype(str)

In [25]:
cate2id_dict = {}

offset = 0

# assessmentItemID2id
Item2id = dict([(v, i+offset) for i, v in enumerate(train_df['assessmentItemID'].unique())])
cate2id_dict['Item2id'] = Item2id
offset += len(Item2id)

# testId2id
testId2id = dict([(v, i+offset) for i, v in enumerate(train_df['testId'].unique())])
cate2id_dict['testId2id'] = testId2id
offset += len(testId2id)

# KnowledgeTag2id
KnowledgeTag2id = dict([(v, i+offset) for i, v in enumerate(train_df['KnowledgeTag'].unique())])
cate2id_dict['KnowledgeTag2id'] = KnowledgeTag2id
offset += len(KnowledgeTag2id)

# big_category2id
big_category2id = dict([(v, i+offset) for i, v in enumerate(train_df['big_category'].unique())])
cate2id_dict['big_category2id'] = big_category2id
offset += len(big_category2id)
        
# mid_category2id
mid_category2id = dict([(v, i+offset) for i, v in enumerate(train_df['mid_category'].unique())])
cate2id_dict['mid_category2id'] = mid_category2id
offset += len(mid_category2id)

# problem_num2id
problem_num2id = dict([(v, i+offset) for i, v in enumerate(train_df['problem_num'].unique())])
cate2id_dict['problem_num2id'] = problem_num2id
offset += len(problem_num2id)


In [26]:
import numpy as np

# mapping
train_df['assessmentItemID'] = train_df['assessmentItemID'].map(Item2id)
train_df['testId'] = train_df['testId'].map(testId2id)
train_df['KnowledgeTag'] = train_df['KnowledgeTag'].map(KnowledgeTag2id)
train_df['big_category'] = train_df['big_category'].map(big_category2id)
train_df['mid_category'] = train_df['mid_category'].map(mid_category2id)
train_df['problem_num'] = train_df['problem_num'].map(problem_num2id)

In [27]:
# Timestamp 변경하기

from datetime import datetime
import time

def convert_time(s):
    timestamp = time.mktime(
        datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple()
    )
    return int(timestamp)

train_df["Timestamp"] = train_df["Timestamp"].apply(convert_time)

In [28]:
cont_cols = [ #'Timestamp',
             'answerCode']

cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag', 'big_category', 'mid_category', 'problem_num'] 

train_df[cate_cols] = train_df[cate_cols].astype(np.int16)
# train_df[cont_cols] = train_df[cont_cols].astype(np.float32)  

In [29]:
print(f"훈련 데이터셋 shape : {train_df.shape}")
print(f"category 값들의 총 갯수 : {offset}")
print(f"category feature들의 column 이름 : {cate_cols}")
print(f"continuous feature들의 column 이름 : {cont_cols}")

훈련 데이터셋 shape : (2266586, 9)
category 값들의 총 갯수 : 12123
category feature들의 column 이름 : ['assessmentItemID', 'testId', 'KnowledgeTag', 'big_category', 'mid_category', 'problem_num']
continuous feature들의 column 이름 : ['answerCode']


In [30]:
print(f"category feature들의 index : {cate2id_dict}")

category feature들의 index : {'Item2id': {'A060001001': 0, 'A060001002': 1, 'A060001003': 2, 'A060001004': 3, 'A060001005': 4, 'A060001007': 5, 'A060003001': 6, 'A060003002': 7, 'A060003003': 8, 'A060003004': 9, 'A060003005': 10, 'A060003006': 11, 'A060003007': 12, 'A060005001': 13, 'A060005002': 14, 'A060005003': 15, 'A060005004': 16, 'A060005005': 17, 'A060005006': 18, 'A060005007': 19, 'A060007001': 20, 'A060007002': 21, 'A060007003': 22, 'A060007004': 23, 'A060007005': 24, 'A060007006': 25, 'A060007007': 26, 'A080002001': 27, 'A080002002': 28, 'A080002003': 29, 'A080002004': 30, 'A080002005': 31, 'A080002006': 32, 'A060009001': 33, 'A060009002': 34, 'A060009003': 35, 'A060009004': 36, 'A060009005': 37, 'A060009006': 38, 'A060009007': 39, 'A060016001': 40, 'A060016002': 41, 'A060016003': 42, 'A060016004': 43, 'A060016005': 44, 'A060016006': 45, 'A060016007': 46, 'A080004001': 47, 'A080004002': 48, 'A080004003': 49, 'A080004004': 50, 'A080004005': 51, 'A080004006': 52, 'A080004008': 53

In [31]:
print(f"train셋 sequence 데이터들의 indices : {question_row_ids_by_user_id}\n")
print(f"train셋 각 sequence 데이터들의 첫 row의 index : {start_row_id_by_user_id}")

train셋 sequence 데이터들의 indices : userID
0       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
1       [745, 746, 747, 748, 749, 750, 751, 752, 753, ...
2       [1678, 1679, 1680, 1681, 1682, 1683, 1684, 168...
5       [1954, 1955, 1956, 1957, 1958, 1959, 1960, 196...
6       [2787, 2788, 2789, 2790, 2791, 2792, 2793, 279...
                              ...                        
7436    [2266515, 2266516, 2266517, 2266518, 2266519, ...
7437    [2266530, 2266531, 2266532, 2266533, 2266534, ...
7438    [2266546, 2266547, 2266548, 2266549, 2266550, ...
7440    [2266562, 2266563, 2266564, 2266565, 2266566, ...
7441    [2266577, 2266578, 2266579, 2266580, 2266581, ...
Length: 6698, dtype: object

train셋 각 sequence 데이터들의 첫 row의 index : userID
0             0
1           745
2          1678
5          1954
6          2787
         ...   
7436    2266515
7437    2266530
7438    2266546
7440    2266562
7441    2266577
Length: 6698, dtype: int64


In [32]:
class CFG_T:
    seed=7
    device='cpu'

    batch_size=16

    dropout=0.2
    emb_size=100
    hidden_size=128
    nlayers=2
    nheads=8
  
    seq_len=32
    target_size=1

In [33]:
CFG_T.total_cate_size = offset
CFG_T.cate_cols = cate_cols
CFG_T.cont_cols = cont_cols
CFG_T.start_row_id_by_user_id = start_row_id_by_user_id

CFG_T.cate_vocab_size = offset

CFG_T.cate_col_size = len(cate_cols)
CFG_T.cont_col_size = len(cont_cols)

In [34]:
# 이 sequence들의 index들을 그대로 사용하지 않는다
question_row_ids_by_user_id

userID
0       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
1       [745, 746, 747, 748, 749, 750, 751, 752, 753, ...
2       [1678, 1679, 1680, 1681, 1682, 1683, 1684, 168...
5       [1954, 1955, 1956, 1957, 1958, 1959, 1960, 196...
6       [2787, 2788, 2789, 2790, 2791, 2792, 2793, 279...
                              ...                        
7436    [2266515, 2266516, 2266517, 2266518, 2266519, ...
7437    [2266530, 2266531, 2266532, 2266533, 2266534, ...
7438    [2266546, 2266547, 2266548, 2266549, 2266550, ...
7440    [2266562, 2266563, 2266564, 2266565, 2266566, ...
7441    [2266577, 2266578, 2266579, 2266580, 2266581, ...
Length: 6698, dtype: object

In [35]:
train_user_id_row_id_list = [(user_id, row_id)
                             for user_id, row_ids in question_row_ids_by_user_id.items()
                             for row_id in row_ids]
train_user_id_row_id_list[:10]

[(0, 0),
 (0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9)]

In [36]:
# row가 꽤나 늘어났음을 알 수 있다.
len(train_user_id_row_id_list)

2266586

In [37]:
# configuration에 등록!
CFG_T.train_user_id_row_id_list = train_user_id_row_id_list

In [38]:
train_df[cate_cols]

Unnamed: 0,assessmentItemID,testId,KnowledgeTag,big_category,mid_category,problem_num
0,0,9454,10991,11903,11912,12110
1,1,9454,10992,11903,11912,12111
2,2,9454,10992,11903,11912,12112
3,3,9454,10992,11903,11912,12113
4,4,9454,10992,11903,11912,12114
...,...,...,...,...,...,...
2266581,3147,9958,11453,11908,12069,12114
2266582,1286,9653,11265,11906,12036,12110
2266583,1287,9653,11265,11906,12036,12111
2266584,1288,9653,11265,11906,12036,12112


In [39]:
train_df[cont_cols]

Unnamed: 0,answerCode
0,1
1,1
2,1
3,1
4,1
...,...
2266581,0
2266582,1
2266583,1
2266584,1


In [40]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class RiiidDataset(Dataset):
    def __init__(self, df, cfg, max_seq_len=100, max_content_len=1000):        
        
        self.max_seq_len = max_seq_len
        self.max_content_len = max_content_len
        
        self.user_id_row_id_list = cfg.train_user_id_row_id_list
        self.start_row_id_by_user_id = cfg.start_row_id_by_user_id

        self.cate_cols = cfg.cate_cols
        self.cont_cols = cfg.cont_cols
        
        self.cate_features = df[self.cate_cols].values
        self.cont_features = df[self.cont_cols].values

    def __getitem__(self, idx):
        
        user_id, end_row_id = self.user_id_row_id_list[idx]
        end_row_id += 1
        
        start_row_id = self.start_row_id_by_user_id[user_id]
        start_row_id = max(end_row_id - self.max_seq_len, start_row_id) # lower bound
        seq_len = end_row_id - start_row_id

        # 0으로 채워진 output tensor 제작                  
        cate_feature = torch.zeros(self.max_seq_len, len(self.cate_cols), dtype=torch.long)
        cont_feature = torch.zeros(self.max_seq_len, len(self.cont_cols), dtype=torch.float)
        mask = torch.zeros(self.max_seq_len, dtype=torch.int16)
       
        # tensor에 값 채워넣기
        cate_feature[-seq_len:] = torch.ShortTensor(self.cate_features[start_row_id:end_row_id])
        cont_feature[-seq_len:] = torch.HalfTensor(self.cont_features[start_row_id:end_row_id])
        mask[-seq_len:] = 1        
            
        # answered_correctly가 cont_feature[-1]에 위치한다
        target = torch.FloatTensor([cont_feature[-1, -1]])

        # answered_correctly 및 relative_answered_correctly는
        # data leakage가 발생할 수 있으므로 0으로 모두 채운다
        # cont_feature[-1, -1] = 0
        # cont_feature[-1, -2] = 0
        
        return cate_feature, cont_feature, mask, target
        
    def __len__(self):
        return len(self.user_id_row_id_list)

In [41]:
train_db = RiiidDataset(train_df, CFG_T, max_seq_len=CFG_T.seq_len)
train_loader = DataLoader(train_db, batch_size=CFG_T.batch_size, shuffle=True,
                          drop_last=False, pin_memory=True)    

In [42]:
# sequence 데이터 하나의 shape을 살펴보자
for cate_x, cont_x, mask, target in train_db:
    print(f"category size : {cate_x.size()}")
    print(f"continous size : {cont_x.size()}")
    print(f"mask size : {mask.size()}")
    print(f"target size : {target.size()}")
    break

category size : torch.Size([32, 6])
continous size : torch.Size([32, 1])
mask size : torch.Size([32])
target size : torch.Size([1])


In [43]:
# 배치 단위로 주어지는 데이터를 살펴보자
for cate_x, cont_x, mask, target in train_loader:
    print(f"category size : {cate_x.size()}")
    print(f"continous size : {cont_x.size()}")
    print(f"mask size : {mask.size()}")
    print(f"target size : {target.size()}")
    break

category size : torch.Size([16, 32, 6])
continous size : torch.Size([16, 32, 1])
mask size : torch.Size([16, 32])
target size : torch.Size([16, 1])


### 📗 Transformer Input / Output 구현
> transformer에 입력시킬 input을 구현하고 transformer를 거친 output을 우리가 원하는 최종 출력값으로 바꾼다.

In [44]:
import torch.nn as nn

# 입력값
for cate_x, cont_x, mask, target in train_loader:
    print(f"category size : {cate_x.size()}")
    print(f"continous size : {cont_x.size()}")
    print(f"mask size : {mask.size()}\n")
    break

category size : torch.Size([16, 32, 6])
continous size : torch.Size([16, 32, 1])
mask size : torch.Size([16, 32])



#### 🟡 Category Embedding
> 범주형 feature를 임베딩하는 과정을 살펴보자!

In [45]:
cate_x.size()

torch.Size([16, 32, 6])

In [46]:
# 임베딩 크기
CFG_T.emb_size

100

In [47]:
cate_x[:,:,1:]

tensor([[[10909, 11050, 11903, 12059, 12115],
         [10857, 11075, 11903, 12010, 12110],
         [10857, 11075, 11903, 12010, 12111],
         ...,
         [10057, 11348, 11909, 11966, 12114],
         [10057, 11348, 11909, 11966, 12116],
         [10057, 11348, 11909, 11966, 12115]],

        [[10018, 11495, 11909, 12002, 12114],
         [10018, 11496, 11909, 12002, 12116],
         [10018, 11496, 11909, 12002, 12115],
         ...,
         [ 9580, 11181, 11907, 11914, 12114],
         [ 9580, 11181, 11907, 11914, 12116],
         [10125, 11548, 11910, 11923, 12110]],

        [[ 9774, 11052, 11904, 12017, 12115],
         [ 9774, 11081, 11904, 12017, 12117],
         [ 9769, 11035, 11904, 12002, 12110],
         ...,
         [ 9772, 11038, 11904, 12014, 12117],
         [ 9775, 11055, 11904, 12018, 12110],
         [ 9775, 11055, 11904, 12018, 12111]],

        ...,

        [[10839, 10992, 11903, 12011, 12111],
         [10839, 10992, 11903, 12011, 12112],
         [10839, 1

In [48]:
embed.shape

torch.Size([16896, 64])

In [49]:
cate_x[:,:,:1].detach().cpu().numpy()

array([[[8998],
        [8687],
        [8688],
        [8689],
        [8690],
        [8691],
        [8692],
        [8693],
        [8694],
        [8695],
        [6809],
        [6810],
        [6811],
        [6812],
        [6813],
        [6814],
        [ 409],
        [ 410],
        [ 411],
        [ 412],
        [ 413],
        [ 414],
        [ 415],
        [ 416],
        [ 417],
        [3687],
        [3686],
        [3688],
        [3689],
        [3690],
        [3691],
        [3692]],

       [[3467],
        [3468],
        [3469],
        [ 846],
        [ 847],
        [ 848],
        [ 849],
        [ 850],
        [ 851],
        [ 852],
        [ 853],
        [3429],
        [3430],
        [3431],
        [3432],
        [3433],
        [3434],
        [3435],
        [1698],
        [1699],
        [1700],
        [1701],
        [1702],
        [1703],
        [1704],
        [ 859],
        [ 860],
        [ 861],
        [ 862],
        [ 863],
      

In [50]:
assess = cate_x[:,:,:1].detach().cpu().numpy()
embed = embed.detach().cpu().numpy()
len_user = train_df['userID'].nunique()

assess_embed = []
for user in assess:
    user_li = []
    for item in user:
        user_li.append(embed[len_user + item])
    assess_embed.append(user_li)

print(assess_embed[0])

[array([[ 0.09341259, -0.36014912,  0.1606276 ,  0.6093997 ,  0.5243095 ,
         0.2520748 , -0.3245924 , -0.2943143 ,  0.10726937, -0.46149   ,
         0.35546046, -0.66332734,  0.6134585 , -0.3390525 ,  0.12364267,
        -0.5621069 ,  0.1652501 ,  0.45031068,  0.30246317,  0.30647552,
        -0.08103045,  0.7088982 , -0.30262157, -0.58451605,  0.56809646,
         0.02209161,  0.3423871 , -0.11579446, -0.79176635,  0.49121153,
        -0.5000289 , -0.13935606,  0.5744165 , -0.44450647,  0.58080596,
         0.09860305,  0.5385321 ,  0.1054225 , -0.4336073 , -0.62677515,
         0.15474933,  0.7095008 ,  0.8633867 , -0.4239331 , -0.6878044 ,
         0.23705101, -0.46290264,  0.02810036,  0.21017908, -0.5766692 ,
         0.4335377 , -0.22417136, -0.05285231,  0.32278836,  0.7102672 ,
        -0.37572646, -0.68272316, -0.6885039 ,  0.1934388 ,  0.11597086,
         0.27693433, -0.12168665,  0.43959787, -0.11678283]],
      dtype=float32), array([[ 0.12662108, -0.22439241, -0.71

In [51]:
assess_embed = torch.Tensor(np.array(assess_embed))
lgcn_linear = nn.Linear(64, 100)
assess_embed = lgcn_linear(assess_embed)
print(assess_embed.size())

torch.Size([16, 32, 1, 100])


In [52]:
batch_size = cate_x.size(0)

# 범주형 하나당 100개로 임베딩된다!
# [16, 32, 6] -> [16, 32, 6, 100]
cate_emb = nn.Embedding(CFG_T.total_cate_size, CFG_T.emb_size, padding_idx=0)
cate_embed_x = cate_emb(cate_x)
print(cate_embed_x.size())

torch.Size([16, 32, 6, 100])


In [53]:
cate_embed_x = torch.cat([assess_embed, cate_embed_x[:,:,1:]], dim=2)

In [54]:
# lightGCN 임베딩(assessmentItemID) 결과와 나머지 컬럼의 임베딩들을 cat해준 것의 shape
cate_embed_x.shape

torch.Size([16, 32, 6, 100])

In [55]:
# sequence 길이를 몇 배 줄일 것인지
# 메모리 절약의 의도가 있다
CFG_T.n_rows_per_step = 2

In [56]:
cate_embed_normal_x = cate_embed_x.view(batch_size, CFG_T.seq_len, -1)
cate_embed_normal_x.size()

torch.Size([16, 32, 600])

In [57]:
half_seq_len = cate_x.size(1) // CFG_T.n_rows_per_step

# transformer input은 3차원이고 마지막 차원은 hidden 값이다.
# sequence의 각 위치에 카테고리별로 임베딩되어있는 것을 하나로 합치자!
# [16, 32, 6, 100] -> [16, 16, 1200]
cate_embed_x = cate_embed_x.view(batch_size, half_seq_len, -1)
cate_embed_x.size()

torch.Size([16, 16, 1200])

In [58]:
# 이후에 우리가 원하는 hidden_size의 절반으로 projection한다!
# 이렇게 하는 이유는 반은 category로 반은 continous으로 hidden 값을 채우기 위해서이다
# [16, 16, 1200] -> [16, 16, 128]
cate_proj = nn.Sequential(nn.Linear(CFG_T.emb_size * CFG_T.cate_col_size * CFG_T.n_rows_per_step, CFG_T.hidden_size),
                          nn.LayerNorm(CFG_T.hidden_size))     
cate_embed_x = cate_proj(cate_embed_x)
cate_embed_x.size()

torch.Size([16, 16, 128])

#### 🟡 Continuous Embedding
> 수치형 feature를 임베딩하는 과정을 살펴보자!

In [59]:
cont_x.size()

torch.Size([16, 32, 1])

In [60]:
cont_bn = nn.BatchNorm1d(CFG_T.cont_col_size)

# batchnorm 1d 적용
cont_bn_x = cont_bn(cont_x.view(-1, cont_x.size(-1)))
cont_bn_x.size()

torch.Size([512, 1])

In [61]:
# batchnorm 적용 이후 원래 사이즈 복구
cont_bn_x = cont_bn_x.view(batch_size, -1, cont_x.size(-1))
cont_bn_x.size()

torch.Size([16, 32, 1])

In [62]:
# cate에서 사용한 half_seq_len 그대로 사용
cont_bn_x = cont_bn_x.view(batch_size, half_seq_len, -1)
cont_bn_x.size()

torch.Size([16, 16, 2])

In [63]:
# 범주형과는 다르게 embedding없이 바로 projection을 통해 원하는 사이즈로 줄인다
# 여기서는 embedding이라고 부른다
cont_emb = nn.Sequential(nn.Linear(CFG_T.cont_col_size * CFG_T.n_rows_per_step, CFG_T.hidden_size),
                         nn.LayerNorm(CFG_T.hidden_size))
cont_embed_x = cont_emb(cont_bn_x)
cont_embed_x.size()

torch.Size([16, 16, 128])

#### 🟡 범주형 / 수치형 embedding tensor concat
> Transformer에 입력값으로 주려면 범주형 / 수치형으로 embedding된 2개의 tensor를 하나로 합쳐야 한다. 이를 통해 우리는 많은 feature들이 포함된 데이터를 성공적으로 하나의 입력값으로 만들 수 있다!

In [64]:
cate_embed_x.size(), cont_embed_x.size()

(torch.Size([16, 16, 128]), torch.Size([16, 16, 128]))

In [65]:
seq_emb = torch.cat([cate_embed_x, cont_embed_x], 2)
seq_emb.size()

torch.Size([16, 16, 256])

In [66]:
comb_proj = nn.Sequential(nn.ReLU(),
                          nn.Linear(CFG_T.hidden_size*2, CFG_T.hidden_size),
                          nn.LayerNorm(CFG_T.hidden_size))

# concat한 sequence를 projection을 통해 원하는 사이즈로 변환한다
# 여기서는 embedding이라고 부른다
# [16, 16, 256] -> [16, 16, 128]
seq_emb = comb_proj(seq_emb)
seq_emb.size()

torch.Size([16, 16, 128])

#### 🟡 Encoder
> 이제 완성된 입력값을 모델에 넣어보자!

In [67]:
# !pip install -q transformers

In [68]:
try:
    from transformers.modeling_bert import BertConfig, BertEncoder, BertModel    
except:
    from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel   

config = BertConfig(3, # not used
                    hidden_size=CFG_T.hidden_size,
                    num_hidden_layers=CFG_T.nlayers,
                    num_attention_heads=CFG_T.nheads,
                    intermediate_size=CFG_T.hidden_size,
                    hidden_dropout_prob=CFG_T.dropout,
                    attention_probs_dropout_prob=CFG_T.dropout)

encoder = BertEncoder(config)   

In [69]:
# Bert Encoder를 거친 tensor의 크기는 동일하게 나온다
# [16, 16, 128] -> [16, 16, 128]
encoded_layers = encoder(seq_emb)
sequence_output = encoded_layers[-1]
sequence_output.size()

torch.Size([16, 16, 128])

In [70]:
# 우리가 필요한건 Bert의 마지막 query다
# [16, 16, 128] -> [16, 128]
sequence_output = sequence_output[:, -1]
sequence_output.size()

torch.Size([16, 128])

#### 🟡 분류 단계
> 이제 우리는 최종 분류를 해야한다! 이걸 위해서 우리는 출력의 크기를 클래스 숫자인 1로 변환한다!

In [71]:
def get_reg():
    return nn.Sequential(nn.Linear(CFG_T.hidden_size, CFG_T.hidden_size),
                         nn.LayerNorm(CFG_T.hidden_size),
                         nn.Dropout(CFG_T.dropout),
                         nn.ReLU(),
                         nn.Linear(CFG_T.hidden_size, CFG_T.target_size))

reg_layer = get_reg()

In [72]:
# 😍 우리는 원하는 결과값을 얻었다 😍
# [16, 128] -> [16, 1]
pred_y = reg_layer(sequence_output)
pred_y.size()

torch.Size([16, 1])

In [73]:
pred_y

tensor([[ 0.5384],
        [-0.7583],
        [-0.1893],
        [-0.1411],
        [-0.1193],
        [ 0.4602],
        [ 0.2241],
        [ 0.3075],
        [ 0.8118],
        [-0.2446],
        [ 0.0624],
        [ 0.2804],
        [-0.5161],
        [ 0.3357],
        [-0.1469],
        [ 0.8788]], grad_fn=<AddmmBackward0>)