In [3]:
import pandas as pd

In [4]:
# Load the training CSV and preview the first rows.
train = pd.read_csv('../../../../data/train_data.csv')
train.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225


In [5]:
# Number of answered items per user.
tem = train.groupby('userID')['answerCode'].count()
tem.mean(), tem.std()  ## roughly 338 items per user on average (std ~321)

(338.39743206927443, 321.33142924031654)

In [6]:
# Number of answered items per (user, test) pair.
tem = train.groupby(['userID', 'testId'])['answerCode'].count()
tem.mean(), tem.std()  ## mean ~6.2 and std ~1.7, so a test has roughly 8 items at most (mean + 1 std)

(6.207035742844311, 1.7124521291383061)

In [7]:
# For every user, collect the index values (row positions of the RangeIndex) of that user's rows.
indexs_by_users =  train.reset_index().groupby('userID')['index'].apply(lambda x: x.values)
indexs_by_users

userID
0       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
1       [745, 746, 747, 748, 749, 750, 751, 752, 753, ...
2       [1678, 1679, 1680, 1681, 1682, 1683, 1684, 168...
5       [1954, 1955, 1956, 1957, 1958, 1959, 1960, 196...
6       [2787, 2788, 2789, 2790, 2791, 2792, 2793, 279...
                              ...                        
7436    [2266515, 2266516, 2266517, 2266518, 2266519, ...
7437    [2266530, 2266531, 2266532, 2266533, 2266534, ...
7438    [2266546, 2266547, 2266548, 2266549, 2266550, ...
7440    [2266562, 2266563, 2266564, 2266565, 2266566, ...
7441    [2266577, 2266578, 2266579, 2266580, 2266581, ...
Name: index, Length: 6698, dtype: object

In [8]:
# First row index of every user; the dataset uses it as the lower bound of a user's history window.
start_index_by_user_id = indexs_by_users.apply(lambda x: x[0])
start_index_by_user_id

userID
0             0
1           745
2          1678
5          1954
6          2787
         ...   
7436    2266515
7437    2266530
7438    2266546
7440    2266562
7441    2266577
Name: index, Length: 6698, dtype: int64

In [9]:
# Names of the categorical and numeric columns
cate_cols = ['userID', 'assessmentItemID', 'KnowledgeTag']
cont_cols = ['answerCode']  # always keep the target as the LAST entry!

# Names of the columns created by feature engineering.
# NOTE(review): with the 6-column frame loaded above this slice is empty, and the
# name is not used by any cell visible here; kept so later cells still find it.
extra_cont_cols = list(train.columns[15:-4])

# column name -> {raw value: embedding index}
mappers_dict = {}

# Index 0 is reserved for NaN / missing values, so real categories start at 1.
cate_offset = 1

for col in cate_cols:

    # Build one mapper per column. dropna() skips NaN/None (and pd.NA) explicitly
    # instead of relying on the `v != v` trick; unique() keeps first-appearance order,
    # so the assigned indices are the same as with a manual loop.
    uniques = train[col].dropna().unique()

    # Shift by the running offset so every column gets its own index range.
    cate2idx = {v: position + cate_offset for position, v in enumerate(uniques)}

    mappers_dict[col] = cate2idx

    # Replace raw values by their indices; unmapped / missing values become 0.
    # NOTE: this overwrites `train` in place, so the cell is not idempotent --
    # reload the CSV before re-running it.
    train[col] = train[col].map(cate2idx).fillna(0).astype(int)

    # A single embedding layer is shared by all columns, so keep advancing the
    # offset to exclude the indices already used by previous columns.
    cate_offset += len(cate2idx)

In [10]:
# Inspect the raw-value -> embedding-index mappers built above (the full dict is large).
mappers_dict

{'userID': {0: 1,
  1: 2,
  2: 3,
  5: 4,
  6: 5,
  7: 6,
  8: 7,
  9: 8,
  10: 9,
  11: 10,
  12: 11,
  14: 12,
  15: 13,
  16: 14,
  18: 15,
  19: 16,
  20: 17,
  21: 18,
  22: 19,
  23: 20,
  24: 21,
  25: 22,
  27: 23,
  28: 24,
  30: 25,
  31: 26,
  32: 27,
  33: 28,
  34: 29,
  35: 30,
  36: 31,
  37: 32,
  38: 33,
  39: 34,
  40: 35,
  41: 36,
  42: 37,
  43: 38,
  44: 39,
  46: 40,
  47: 41,
  48: 42,
  49: 43,
  50: 44,
  51: 45,
  52: 46,
  54: 47,
  55: 48,
  56: 49,
  57: 50,
  59: 51,
  60: 52,
  61: 53,
  62: 54,
  63: 55,
  65: 56,
  66: 57,
  67: 58,
  68: 59,
  69: 60,
  70: 61,
  71: 62,
  72: 63,
  73: 64,
  74: 65,
  75: 66,
  77: 67,
  78: 68,
  80: 69,
  81: 70,
  82: 71,
  83: 72,
  84: 73,
  85: 74,
  86: 75,
  87: 76,
  88: 77,
  89: 78,
  90: 79,
  91: 80,
  92: 81,
  93: 82,
  95: 83,
  96: 84,
  97: 85,
  98: 86,
  99: 87,
  100: 88,
  101: 89,
  102: 90,
  103: 91,
  104: 92,
  105: 93,
  106: 94,
  107: 95,
  108: 96,
  109: 97,
  110: 98,
  111: 99,
  112

In [11]:
# Check dtypes after the mapping: the three categorical columns are now integer codes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2266586 entries, 0 to 2266585
Data columns (total 6 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   userID            int64 
 1   assessmentItemID  int64 
 2   testId            object
 3   answerCode        int64 
 4   Timestamp         object
 5   KnowledgeTag      int64 
dtypes: int64(4), object(2)
memory usage: 103.8+ MB


In [12]:
# Preview the frame after the categorical columns were replaced by their embedding indices.
train.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,1,6699,A060000001,1,2020-03-24 00:17:11,16153
1,1,6700,A060000001,1,2020-03-24 00:17:14,16154
2,1,6701,A060000001,1,2020-03-24 00:17:22,16154
3,1,6702,A060000001,1,2020-03-24 00:17:29,16154
4,1,6703,A060000001,1,2020-03-24 00:17:36,16154


In [13]:
class CFG:
    """Hyper-parameters and shared settings for the transformer experiment.

    Further attributes (column lists, vocabulary size, index lists, ...) are
    attached to this class by later cells.
    """
    seed = 7  # not referenced by the cells visible here
    device = 'cuda'  # not referenced by the cells visible here

    batch_size = 16  # DataLoader batch size

    dropout = 0.2  # used for the BERT encoder and the regression head
    emb_size = 100  # embedding width of one categorical id
    hidden_size = 128  # width of the projections and of the encoder
    nlayers = 2  # number of encoder layers
    nheads = 8  # number of attention heads
  
    seq_len = 32  # history window length passed to the dataset as max_seq_len
    target_size=1  # output width of the regression head
    

In [14]:
# cate_offset ends at 1 + (number of distinct categories over all columns),
# i.e. the size of the shared embedding table including the padding index 0.
CFG.total_cate_size = cate_offset
CFG.cate_cols = cate_cols
CFG.cont_cols = cont_cols
CFG.start_index_by_user_id = start_index_by_user_id

# Same value as total_cate_size, stored under a second name.
CFG.cate_vocab_size = cate_offset

# Number of categorical / continuous feature columns.
CFG.cate_col_size = len(cate_cols)
CFG.cont_col_size = len(cont_cols)

In [15]:
# Look again at the per-user row indices before flattening them.
indexs_by_users

userID
0       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
1       [745, 746, 747, 748, 749, 750, 751, 752, 753, ...
2       [1678, 1679, 1680, 1681, 1682, 1683, 1684, 168...
5       [1954, 1955, 1956, 1957, 1958, 1959, 1960, 196...
6       [2787, 2788, 2789, 2790, 2791, 2792, 2793, 279...
                              ...                        
7436    [2266515, 2266516, 2266517, 2266518, 2266519, ...
7437    [2266530, 2266531, 2266532, 2266533, 2266534, ...
7438    [2266546, 2266547, 2266548, 2266549, 2266550, ...
7440    [2266562, 2266563, 2266564, 2266565, 2266566, ...
7441    [2266577, 2266578, 2266579, 2266580, 2266581, ...
Name: index, Length: 6698, dtype: object

In [16]:
# Flatten the per-user index arrays into one list of (user id, row index) pairs;
# every pair identifies the LAST row of one training sample.
train_user_id_index_list = [
    (uid, row_idx)
    for uid, row_indices in indexs_by_users.items()
    for row_idx in row_indices
]
train_user_id_index_list[:10]

[(0, 0),
 (0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9)]

In [17]:
# One pair per row of `train`, so this matches the number of rows in the frame.
len(train_user_id_index_list)

2266586

In [18]:
# Expose the sample list to the dataset through the config object.
CFG.train_user_id_index_list = train_user_id_index_list

In [19]:
# The categorical feature matrix the dataset will slice: one row per interaction, one column per categorical feature.
train[cate_cols].values

array([[    1,  6699, 16153],
       [    1,  6700, 16154],
       [    1,  6701, 16154],
       ...,
       [ 6698,  7986, 16427],
       [ 6698,  7987, 16427],
       [ 6698,  7988, 16427]])

In [20]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class TransformerDataset(Dataset):
    """Fixed-length history windows over a per-user interaction log.

    Sample ``idx`` ends at the row given by ``cfg.train_user_id_index_list[idx]``
    and reaches back at most ``max_seq_len`` rows, never crossing the first row
    of the same user. Shorter histories are left-padded with zeros.

    Args:
        df: DataFrame holding the (already index-encoded) categorical columns
            ``cfg.cate_cols`` and the numeric columns ``cfg.cont_cols``.
        cfg: object providing ``train_user_id_index_list``,
            ``start_index_by_user_id``, ``cate_cols`` and ``cont_cols``.
        max_seq_len: length of the returned window.
        max_content_len: stored but not used by this class.
    """

    def __init__(self, df, cfg, max_seq_len=100, max_content_len=1000):

        self.max_seq_len = max_seq_len
        self.max_content_len = max_content_len

        self.user_id_index_list = cfg.train_user_id_index_list
        self.start_index_by_user_id = cfg.start_index_by_user_id

        self.cate_cols = cfg.cate_cols
        self.cont_cols = cfg.cont_cols

        self.cate_features = df[self.cate_cols].values
        self.cont_features = df[self.cont_cols].values

    def __getitem__(self, idx):
        """Return ``(cate_feature, cont_feature, mask, target)`` for sample ``idx``.

        Returns:
            cate_feature: LongTensor ``[max_seq_len, len(cate_cols)]``.
            cont_feature: FloatTensor ``[max_seq_len, len(cont_cols)]`` whose last
                row has the target column zeroed out.
            mask: int16 tensor ``[max_seq_len]``, 1 on real rows and 0 on padding.
            target: FloatTensor ``[1]``, the last column of the last row.
        """
        # Exclusive end of the window: one past the sample's last row.
        user_id, end_index = self.user_id_index_list[idx]
        end_index += 1

        # Start of the window, clamped to the first row of this user.
        start_index = self.start_index_by_user_id[user_id]
        start_index = max(end_index - self.max_seq_len, start_index)
        seq_len = end_index - start_index

        # Zero-filled outputs; zeros double as left padding.
        cate_feature = torch.zeros(self.max_seq_len, len(self.cate_cols), dtype=torch.long)
        cont_feature = torch.zeros(self.max_seq_len, len(self.cont_cols), dtype=torch.float)
        mask = torch.zeros(self.max_seq_len, dtype=torch.int16)

        # Convert directly to the destination dtypes. Going through 16-bit
        # tensors would corrupt embedding indices above 32767 and truncate
        # continuous features to half precision.
        cate_feature[-seq_len:] = torch.as_tensor(
            self.cate_features[start_index:end_index], dtype=torch.long)
        cont_feature[-seq_len:] = torch.as_tensor(
            self.cont_features[start_index:end_index], dtype=torch.float)
        mask[-seq_len:] = 1

        # The target must be the last continuous column; copy it before it is erased.
        target = cont_feature[-1, -1:].clone()

        # Zero the target in the inputs to avoid leaking the label.
        cont_feature[-1, -1] = 0

        return cate_feature, cont_feature, mask, target

    def __len__(self):
        """Number of samples (one per entry of the user/index list)."""
        return len(self.user_id_index_list)

In [21]:
# Build the dataset with a history window of CFG.seq_len rows and wrap it in a
# shuffling DataLoader (the final, possibly smaller, batch is kept).
train_db = TransformerDataset(train, CFG, max_seq_len=CFG.seq_len)
train_loader = DataLoader(train_db, batch_size=CFG.batch_size, shuffle=True,
                          drop_last=False, pin_memory=True)    

In [22]:
# Shapes of a single (un-batched) sample; stop after the first one.
for cate_x, cont_x, mask, target in train_db:
    print(f"category size : {cate_x.size()}")
    print(f"continous size : {cont_x.size()}")
    print(f"mask size : {mask.size()}")
    print(f"target size : {target.size()}")
    break

category size : torch.Size([32, 3])
continous size : torch.Size([32, 1])
mask size : torch.Size([32])
target size : torch.Size([1])


In [23]:
# Shapes of one batch; stop after the first one. The loop variables stay in
# scope and are reused by the exploratory cells below.
for cate_x, cont_x, mask, target in train_loader:
    print(f"category size : {cate_x.size()}")
    print(f"continous size : {cont_x.size()}")
    print(f"mask size : {mask.size()}")
    print(f"target size : {target.size()}")
    break

category size : torch.Size([16, 32, 3])
continous size : torch.Size([16, 32, 1])
mask size : torch.Size([16, 32])
target size : torch.Size([16, 1])


In [24]:
# [batch, seq_len, number of categorical columns]
cate_x.size()

torch.Size([16, 32, 3])

In [25]:
# Embedding width used for every categorical id.
CFG.emb_size

100

In [26]:
from torch import nn 

In [27]:
# One embedding table shared by all categorical columns; index 0 (missing / padding) is the padding index.
cate_emb = nn.Embedding(CFG.total_cate_size, CFG.emb_size, padding_idx=0)

In [29]:
# Embed every categorical id: [16, 32, 3] -> [16, 32, 3, 100].
cate_embed_x = cate_emb(cate_x)

cate_x.size(), cate_embed_x.size()

(torch.Size([16, 32, 3]), torch.Size([16, 32, 3, 100]))

In [34]:
# Number of consecutive sequence positions merged into one transformer input step.
CFG.index_per_step = 2

In [35]:
# Without step merging: concatenate the per-column embeddings of each position, [16, 32, 3, 100] -> [16, 32, 300].
cate_embed_normal_x = cate_embed_x.view(CFG.batch_size, CFG.seq_len, -1)
cate_embed_normal_x.size()

torch.Size([16, 32, 300])

In [36]:
# Sequence length after merging index_per_step positions into one step.
half_seq_len = cate_x.size(1) // CFG.index_per_step

# The transformer input is 3-D and its last dimension is the hidden dimension.
# Merge the per-category embeddings of each merged step into a single vector.
# [16, 32, 3, 100] -> [16, 16, 600]
cate_embed_x = cate_embed_x.view(CFG.batch_size, half_seq_len, -1)
cate_embed_x.size()

torch.Size([16, 16, 600])

In [37]:
# Project down to the desired hidden size: [16, 16, 600] -> [16, 16, 128]
cate_proj = nn.Sequential(nn.Linear(CFG.emb_size * CFG.cate_col_size * CFG.index_per_step, CFG.hidden_size),
                          nn.LayerNorm(CFG.hidden_size))     
cate_embed_x = cate_proj(cate_embed_x)
cate_embed_x.size()

torch.Size([16, 16, 128])

In [38]:
# [batch, seq_len, number of continuous columns]
cont_x.size()

torch.Size([16, 32, 1])

In [39]:
# Batch normalisation over the continuous feature columns.
cont_bn = nn.BatchNorm1d(CFG.cont_col_size)

# Apply BatchNorm1d on a flattened [batch * seq_len, n_cont] view: [16, 32, 1] -> [512, 1]
cont_bn_x = cont_bn(cont_x.view(-1, cont_x.size(-1)))
cont_bn_x.size()

torch.Size([512, 1])

In [41]:
# Restore the original shape after batch normalisation: [512, 1] -> [16, 32, 1]
cont_bn_x = cont_bn_x.view(CFG.batch_size, -1, cont_x.size(-1))
cont_bn_x.size()

torch.Size([16, 32, 1])

In [43]:
# Reuse the half_seq_len computed for the categorical features: [16, 32, 1] -> [16, 16, 2]
cont_bn_x = cont_bn_x.view(CFG.batch_size, half_seq_len, -1)
cont_bn_x.size()

torch.Size([16, 16, 2])

In [45]:
# Unlike the categorical features there is no embedding lookup: a projection
# maps the values straight to the desired size.
# It is still called an "embedding" here.
# [16, 16, 2] -> [16, 16, 128]
cont_emb = nn.Sequential(nn.Linear(CFG.cont_col_size * CFG.index_per_step, CFG.hidden_size),
                         nn.LayerNorm(CFG.hidden_size))
cont_embed_x = cont_emb(cont_bn_x)
cont_embed_x.size()

torch.Size([16, 16, 128])

In [46]:
# Concatenate categorical and continuous representations along the feature axis: 2 x [16, 16, 128] -> [16, 16, 256]
seq_emb = torch.cat([cate_embed_x, cont_embed_x], 2)
seq_emb.size()

torch.Size([16, 16, 256])

In [47]:
# ReLU -> Linear -> LayerNorm block that fuses the two halves of the concatenated vector.
comb_proj = nn.Sequential(nn.ReLU(),
                          nn.Linear(CFG.hidden_size*2, CFG.hidden_size),
                          nn.LayerNorm(CFG.hidden_size))

# Project the concatenated sequence to the desired size.
# It is still called an "embedding" here.
# [16, 16, 256] -> [16, 16, 128]
seq_emb = comb_proj(seq_emb)
seq_emb.size()

torch.Size([16, 16, 128])

In [49]:
!pip install transformers



In [50]:
# The BERT modelling module lives at a different path depending on the installed
# transformers version. Catch only ImportError so unrelated failures
# (KeyboardInterrupt, bugs inside the package, ...) are not silently swallowed.
try:
    from transformers.modeling_bert import BertConfig, BertEncoder, BertModel
except ImportError:
    from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel

# Only the encoder stack is used, so the vocabulary size (first argument) is a dummy.
config = BertConfig(3, # not used
                    hidden_size=CFG.hidden_size,
                    num_hidden_layers=CFG.nlayers,
                    num_attention_heads=CFG.nheads,
                    intermediate_size=CFG.hidden_size,
                    hidden_dropout_prob=CFG.dropout,
                    attention_probs_dropout_prob=CFG.dropout)

encoder = BertEncoder(config)

In [51]:
# Run the encoder on the fused sequence (no attention mask is passed in this exploratory call).
encoded_layers = encoder(seq_emb)
# Take the last element of the encoder output; here it has shape [16, 16, 128].
sequence_output = encoded_layers[-1]
sequence_output.size()

torch.Size([16, 16, 128])

In [52]:
# Keep only the representation of the final step: [16, 16, 128] -> [16, 128]
sequence_output = sequence_output[:, -1]
sequence_output.size()

torch.Size([16, 128])

In [53]:
def get_reg():
    """Build the regression head: hidden vector -> CFG.target_size outputs."""
    # Layers are instantiated in the order they are applied.
    head_layers = [
        nn.Linear(CFG.hidden_size, CFG.hidden_size),
        nn.LayerNorm(CFG.hidden_size),
        nn.Dropout(CFG.dropout),
        nn.ReLU(),
        nn.Linear(CFG.hidden_size, CFG.target_size),
    ]
    return nn.Sequential(*head_layers)

reg_layer = get_reg()

In [55]:
# 😍 We got the output we wanted 😍
# [16, 128] -> [16, 1]
pred_y = reg_layer(sequence_output)
pred_y.size()

torch.Size([16, 1])

In [56]:
import torch.nn as nn

# The BERT modelling module moved between transformers releases; fall back to the
# newer path only when the import itself fails (no bare `except`).
try:
    from transformers.modeling_bert import BertConfig, BertEncoder, BertModel
except ImportError:
    from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel

class TransformerModel(nn.Module):
    """BERT-encoder model over merged interaction steps.

    ``cfg.index_per_step`` consecutive positions of the input sequence are merged
    into one encoder step. Categorical ids go through a shared embedding table and
    a projection, continuous features through batch-norm and a projection; both are
    fused, encoded, and the last step is fed to a small regression head.

    Args:
        cfg: config object providing ``cate_cols``, ``cont_cols``,
            ``total_cate_size``, ``emb_size``, ``hidden_size``, ``index_per_step``,
            ``nlayers``, ``nheads``, ``dropout`` and ``target_size``.
    """

    def __init__(self, cfg):
        super(TransformerModel, self).__init__()
        self.cfg = cfg

        # Derive the column counts from the column lists themselves so the layer
        # sizes cannot drift from the features actually used.
        cate_col_size = len(cfg.cate_cols)
        cont_col_size = len(cfg.cont_cols)

        # category: shared embedding table (index 0 = padding) + projection
        self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0)
        self.cate_proj = nn.Sequential(
            nn.Linear(cfg.emb_size * cate_col_size * cfg.index_per_step, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
        )

        # continuous: batch-norm + projection
        self.cont_bn = nn.BatchNorm1d(cont_col_size)
        self.cont_emb = nn.Sequential(
            nn.Linear(cont_col_size * cfg.index_per_step, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
        )

        # combination of the two halves
        self.comb_proj = nn.Sequential(
            nn.ReLU(),
            nn.Linear(cfg.hidden_size * 2, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
        )

        self.config = BertConfig(
            3,  # vocabulary size, not used because only the encoder stack is built
            hidden_size=cfg.hidden_size,
            num_hidden_layers=cfg.nlayers,
            num_attention_heads=cfg.nheads,
            intermediate_size=cfg.hidden_size,
            hidden_dropout_prob=cfg.dropout,
            attention_probs_dropout_prob=cfg.dropout,
        )
        self.encoder = BertEncoder(self.config)

        def get_reg():
            """Regression head: hidden vector -> cfg.target_size outputs."""
            return nn.Sequential(
                nn.Linear(cfg.hidden_size, cfg.hidden_size),
                nn.LayerNorm(cfg.hidden_size),
                nn.Dropout(cfg.dropout),
                nn.ReLU(),
                nn.Linear(cfg.hidden_size, cfg.target_size),
            )
        self.reg_layer = get_reg()

    def forward(self, cate_x, cont_x, mask):
        """Predict the target of the last position of each sequence.

        Args:
            cate_x: integer tensor ``[batch, seq_len, n_cate]`` of embedding indices.
            cont_x: float tensor ``[batch, seq_len, n_cont]``.
            mask: tensor ``[batch, seq_len]`` with 1 on real positions, 0 on padding.

        Returns:
            Tensor ``[batch, cfg.target_size]``.

        ``seq_len`` must be divisible by ``cfg.index_per_step`` (the reshapes
        below fail otherwise).
        """
        batch_size = cate_x.size(0)
        half_seq_len = cate_x.size(1) // self.cfg.index_per_step

        # category: embed, then flatten index_per_step positions into one step
        cate_emb = self.cate_emb(cate_x).view(batch_size, half_seq_len, -1)
        cate_emb = self.cate_proj(cate_emb)

        # continuous: batch-norm over flattened rows, restore shape, merge steps
        cont_x = self.cont_bn(cont_x.view(-1, cont_x.size(-1))).view(batch_size, -1, cont_x.size(-1))
        cont_emb = self.cont_emb(cont_x.view(batch_size, half_seq_len, -1))

        # combination
        seq_emb = torch.cat([cate_emb, cont_emb], 2)
        seq_emb = self.comb_proj(seq_emb)

        # A merged step is valid if any of the positions it covers is valid.
        step_mask, _ = mask.view(batch_size, half_seq_len, -1).max(2)

        # BertEncoder adds `attention_mask` to the raw attention scores, so it needs
        # an additive mask broadcastable to [batch, heads, query, key]: 0 on valid
        # keys and a very large negative value on padded keys. Passing the raw 0/1
        # [batch, steps] mask only broadcasts when batch == steps and does not mask.
        attention_mask = step_mask[:, None, None, :].to(seq_emb.dtype)
        attention_mask = (1.0 - attention_mask) * torch.finfo(seq_emb.dtype).min

        encoded_layers = self.encoder(seq_emb, attention_mask=attention_mask)
        sequence_output = encoded_layers[0]
        # Sequences are left-padded, so the last step is always a real one.
        sequence_output = sequence_output[:, -1]

        pred_y = self.reg_layer(sequence_output)

        return pred_y

In [57]:
# Create the model
model = TransformerModel(CFG)

# Smoke test: push a single batch through the model and compare output and target shapes.
for cate_x, cont_x, mask, target in train_loader:
    print(f"category size : {cate_x.size()}")
    print(f"continous size : {cont_x.size()}")
    print(f"mask size : {mask.size()}\n")

    output = model(cate_x, cont_x, mask)
    
    print(f"output size : {output.size()}")
    print(f"target size : {target.size()}")
    break

category size : torch.Size([16, 32, 3])
continous size : torch.Size([16, 32, 1])
mask size : torch.Size([16, 32])

output size : torch.Size([16, 1])
target size : torch.Size([16, 1])
