# Model Development

> Notebook used to build the models and check their specifications.

## Hugging Face

### DebertaV2
> Will try it out with pretrained weights, alongside BERT.

- [deberta_v2 - huggingface documentation](https://huggingface.co/transformers/v4.8.2/model_doc/deberta_v2.html)
- [deberta-v3-large - huggingface website](https://huggingface.co/microsoft/deberta-v3-large)

### Bert

In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
try:
    # Older transformers releases expose the BERT classes in a top-level module.
    from transformers.modeling_bert import BertConfig, BertEncoder, BertModel
except ImportError:
    # Newer releases keep them under the ``transformers.models.bert`` package.
    # Only ImportError is caught so unrelated failures (and Ctrl-C) still surface.
    from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel


class LSTMModel(nn.Module):
    """Sequence regressor: categorical + continuous features -> LSTM -> MLP head.

    ``cfg`` must provide ``cate_vocab_size``, ``emb_size``, ``cate_col_size``,
    ``cont_col_size``, ``n_rows_per_step``, ``hidden_size``, ``nlayers``,
    ``dropout`` and ``target_size``.

    Submodule attribute names and their creation order are kept stable because
    checkpoints are keyed on them.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        hidden = cfg.hidden_size
        rows = cfg.n_rows_per_step

        # Categorical branch: shared embedding table (index 0 is padding), then
        # a projection of all embeddings belonging to one step.
        self.cate_emb = nn.Embedding(cfg.cate_vocab_size, cfg.emb_size, padding_idx=0)
        self.cate_proj = nn.Sequential(
            nn.Linear(cfg.emb_size * cfg.cate_col_size * rows, hidden),
            nn.LayerNorm(hidden),
        )

        # Continuous branch: per-feature batch norm, then a per-step projection.
        self.cont_bn = nn.BatchNorm1d(cfg.cont_col_size)
        self.cont_emb = nn.Sequential(
            nn.Linear(cfg.cont_col_size * rows, hidden),
            nn.LayerNorm(hidden),
        )

        # Fuses the two concatenated branches back down to ``hidden`` features.
        self.comb_proj = nn.Sequential(
            nn.ReLU(),
            nn.Linear(hidden * 2, hidden),
            nn.LayerNorm(hidden),
        )

        self.encoder = nn.LSTM(
            hidden,
            hidden,
            cfg.nlayers,
            dropout=cfg.dropout,
            batch_first=True,
        )

        # Regression head applied to the final hidden state.
        self.reg_layer = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.LayerNorm(hidden),
            nn.Dropout(cfg.dropout),
            nn.ReLU(),
            nn.Linear(hidden, cfg.target_size),
        )

    def forward(self, cate_x, cont_x, mask):
        """Return predictions of shape ``(batch, target_size)``.

        Args:
            cate_x: integer category ids; its first two dimensions are batch and
                row count, and the row count is grouped into steps of
                ``cfg.n_rows_per_step`` rows.
            cont_x: continuous features whose last dimension is
                ``cfg.cont_col_size``.
            mask: accepted for interface parity with the other encoders; this
                model does not read it.
        """
        n_batch = cate_x.size(0)
        n_cont = cont_x.size(-1)

        # BatchNorm1d wants (N, C): flatten batch/time, normalise, restore.
        flat_cont = cont_x.view(-1, n_cont)
        cont_x = self.cont_bn(flat_cont).view(n_batch, -1, n_cont)

        n_steps = cate_x.size(1) // self.cfg.n_rows_per_step

        cate_feat = self.cate_emb(cate_x).view(n_batch, n_steps, -1)
        cate_feat = self.cate_proj(cate_feat)
        cont_feat = self.cont_emb(cont_x.view(n_batch, n_steps, -1))

        fused = self.comb_proj(torch.cat([cate_feat, cont_feat], 2))

        # Only the top layer's final hidden state feeds the regression head.
        _, (h_n, _) = self.encoder(fused)
        return self.reg_layer(h_n[-1])


class DSB_BertModel(nn.Module):
    """Sequence regressor: categorical + continuous features -> BERT -> MLP head.

    ``cfg`` must provide ``cate_vocab_size``, ``emb_size``, ``cate_col_size``,
    ``cont_col_size``, ``n_rows_per_step``, ``hidden_size``, ``nlayers``,
    ``nheads``, ``dropout`` and ``target_size``.

    Submodule attribute names and their creation order are kept stable because
    checkpoints are keyed on them.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        hidden = cfg.hidden_size
        rows = cfg.n_rows_per_step

        # Categorical branch: shared embedding table (index 0 is padding), then
        # a projection of all embeddings belonging to one step.
        self.cate_emb = nn.Embedding(cfg.cate_vocab_size, cfg.emb_size, padding_idx=0)
        self.cate_proj = nn.Sequential(
            nn.Linear(cfg.emb_size * cfg.cate_col_size * rows, hidden),
            nn.LayerNorm(hidden),
        )

        # Continuous branch: per-feature batch norm, then a per-step projection.
        self.cont_bn = nn.BatchNorm1d(cfg.cont_col_size)
        self.cont_emb = nn.Sequential(
            nn.Linear(cfg.cont_col_size * rows, hidden),
            nn.LayerNorm(hidden),
        )

        # Fuses the two concatenated branches back down to ``hidden`` features.
        self.comb_proj = nn.Sequential(
            nn.ReLU(),
            nn.Linear(hidden * 2, hidden),
            nn.LayerNorm(hidden),
        )

        # The model is always fed ``inputs_embeds``, so the vocabulary size
        # (first positional argument) is a placeholder.
        self.config = BertConfig(
            3,  # not used
            hidden_size=hidden,
            num_hidden_layers=cfg.nlayers,
            num_attention_heads=cfg.nheads,
            intermediate_size=hidden,
            hidden_dropout_prob=cfg.dropout,
            attention_probs_dropout_prob=cfg.dropout,
        )
        self.encoder = BertModel(self.config)

        # Regression head applied to the last position of the encoded sequence.
        self.reg_layer = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.LayerNorm(hidden),
            nn.Dropout(cfg.dropout),
            nn.ReLU(),
            nn.Linear(hidden, cfg.target_size),
        )

    def forward(self, cate_x, cont_x, mask):
        """Return predictions of shape ``(batch, target_size)``.

        Args:
            cate_x: integer category ids; its first two dimensions are batch and
                row count, and the row count is grouped into steps of
                ``cfg.n_rows_per_step`` rows.
            cont_x: continuous features whose last dimension is
                ``cfg.cont_col_size``.
            mask: per-row validity mask; it is reduced to one value per step
                (the max over the rows of that step) and passed to BERT as the
                attention mask.

        The prediction is read from the last sequence position, so the most
        recent step is presumably expected at the end of the sequence
        (TODO confirm the padding side against the data loader).
        """
        n_batch = cate_x.size(0)
        n_cont = cont_x.size(-1)

        # BatchNorm1d wants (N, C): flatten batch/time, normalise, restore.
        flat_cont = cont_x.view(-1, n_cont)
        cont_x = self.cont_bn(flat_cont).view(n_batch, -1, n_cont)

        n_steps = cate_x.size(1) // self.cfg.n_rows_per_step

        cate_feat = self.cate_emb(cate_x).view(n_batch, n_steps, -1)
        cate_feat = self.cate_proj(cate_feat)
        cont_feat = self.cont_emb(cont_x.view(n_batch, n_steps, -1))

        fused = self.comb_proj(torch.cat([cate_feat, cont_feat], 2))

        # A step is attended to when any of its rows is unmasked.
        step_mask = mask.view(n_batch, n_steps, -1).max(2)[0]

        encoder_out = self.encoder(inputs_embeds=fused, attention_mask=step_mask)
        last_position = encoder_out[0][:, -1]

        return self.reg_layer(last_position)


class LSTMATTNModel(nn.Module):
    """Sequence regressor: features -> 1-layer LSTM -> 1-layer BERT attention -> MLP.

    ``cfg`` must provide ``cate_vocab_size``, ``emb_size``, ``cate_col_size``,
    ``cont_col_size``, ``n_rows_per_step``, ``hidden_size``, ``nheads``,
    ``dropout`` and ``target_size``.

    Submodule attribute names are part of the checkpoint format (state-dict
    keys), so they must not be renamed.
    """

    def __init__(self, cfg):
        super(LSTMATTNModel, self).__init__()
        self.cfg = cfg
        # Categorical branch: shared embedding table (index 0 is padding), then
        # a projection of all embeddings belonging to one step.
        self.cate_emb = nn.Embedding(cfg.cate_vocab_size, cfg.emb_size, padding_idx=0)
        self.cate_proj = nn.Sequential(
            nn.Linear(cfg.emb_size * cfg.cate_col_size * cfg.n_rows_per_step, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
        )
        # Continuous branch: per-feature batch norm, then a per-step projection.
        self.cont_bn = nn.BatchNorm1d(cfg.cont_col_size)
        self.cont_emb = nn.Sequential(
            nn.Linear(cfg.cont_col_size * cfg.n_rows_per_step, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
        )
        # Fuses the two concatenated branches back down to hidden_size features.
        self.comb_proj = nn.Sequential(
            nn.ReLU(),
            nn.Linear(cfg.hidden_size * 2, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
        )
        # Disabled experiment: 1-D convolution over the step axis.
        # self.cnn = nn.Sequential(
        #     nn.Conv1d(cfg.hidden_size, cfg.hidden_size, kernel_size=3, padding=1, stride=1, bias=False),
        #     nn.BatchNorm1d(cfg.hidden_size),
        #     nn.ReLU(),
        #     nn.Dropout(cfg.dropout)
        # )
        # NOTE: nn.LSTM applies dropout only between stacked layers, so with a
        # single layer ``dropout`` has no effect (PyTorch warns when it is > 0).
        self.encoder = nn.LSTM(cfg.hidden_size, cfg.hidden_size, 1,
                               bidirectional=False, dropout=cfg.dropout, batch_first=True)
        # Only the encoder stack is used (no embedding layer), so the
        # vocabulary size (first positional argument) is a placeholder.
        self.config = BertConfig(
            3,  # not used
            hidden_size=cfg.hidden_size,
            num_hidden_layers=1,
            num_attention_heads=cfg.nheads,
            intermediate_size=cfg.hidden_size,
            hidden_dropout_prob=cfg.dropout,
            attention_probs_dropout_prob=cfg.dropout,
        )
        self.attn = BertEncoder(self.config)

        def get_reg():
            # Two hidden blocks followed by the output projection.
            return nn.Sequential(
                nn.Linear(cfg.hidden_size, cfg.hidden_size),
                nn.LayerNorm(cfg.hidden_size),
                nn.Dropout(cfg.dropout),
                nn.ReLU(),
                nn.Linear(cfg.hidden_size, cfg.hidden_size),
                nn.LayerNorm(cfg.hidden_size),
                nn.Dropout(cfg.dropout),
                nn.ReLU(),
                nn.Linear(cfg.hidden_size, cfg.target_size),
            )
        self.reg_layer = get_reg()

    def forward(self, cate_x, cont_x, mask):
        """Return predictions of shape ``(batch, target_size)``.

        Args:
            cate_x: integer category ids; its first two dimensions are batch and
                row count, and the row count is grouped into steps of
                ``cfg.n_rows_per_step`` rows.
            cont_x: continuous features whose last dimension is
                ``cfg.cont_col_size``.
            mask: per-row validity mask; it is reduced to one value per step
                (the max over the rows of that step) and turned into an
                additive attention mask.

        The prediction is read from the last sequence position, so the most
        recent step is presumably expected at the end of the sequence
        (TODO confirm the padding side against the data loader).
        """
        batch_size = cate_x.size(0)
        # Disabled experiment:
        # ac->prev_ac, rel_ac->prev_rel_ac
        # cont_x[:, 1:, -2:] = cont_x[:, :-1, -2:].clone()
        # cont_x[:, 0, -2:] = 0

        # BatchNorm1d wants (N, C): flatten batch/time, normalise, restore.
        cont_x = self.cont_bn(cont_x.view(-1, cont_x.size(-1))).view(batch_size, -1, cont_x.size(-1))
        half_seq_len = cate_x.size(1) // self.cfg.n_rows_per_step
        cate_emb = self.cate_emb(cate_x).view(batch_size, half_seq_len, -1)
        cate_emb = self.cate_proj(cate_emb)
        cont_emb = self.cont_emb(cont_x.view(batch_size, half_seq_len, -1))
        seq_emb = torch.cat([cate_emb, cont_emb], 2)
        seq_emb = self.comb_proj(seq_emb)
        # A step is attended to when any of its rows is unmasked.
        mask, _ = mask.view(batch_size, half_seq_len, -1).max(2)
        # seq_emb = self.cnn(seq_emb.transpose(1, 2).contiguous()).transpose(1, 2).contiguous()
        output, _ = self.encoder(seq_emb)

        # BertEncoder expects an additive mask broadcastable over heads and
        # query positions: 0 where attention is allowed, a large negative
        # value where it is not.
        extended_attention_mask = mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=torch.float32)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        head_mask = [None] * self.config.num_hidden_layers
        encoded_layers = self.attn(output, extended_attention_mask, head_mask=head_mask)

        # Element 0 is always the final hidden states; the last element is only
        # the same thing when no optional outputs (all hidden states,
        # attentions) are returned, so index 0 is the robust choice.
        sequence_output = encoded_layers[0]
        sequence_output = sequence_output[:, -1]
        pred_y = self.reg_layer(sequence_output)
        return pred_y


def _first_token_suffix(tokens, prefix):
    """Return what follows ``prefix`` in the first token that starts with it.

    Empty tokens (produced by doubled or trailing underscores in a file name)
    are skipped safely instead of crashing on ``tok[0]``.

    Raises:
        IndexError: if no token starts with ``prefix``.
    """
    for tok in tokens:
        if tok.startswith(prefix):
            return tok[len(prefix):]
    raise IndexError(
        "no token starting with '{}' in checkpoint name tokens {}".format(prefix, tokens)
    )


def load_model(model_path, user_states, n_rows_per_step=2):
    """Rebuild a model from a checkpoint whose file name encodes its settings.

    The last ``/``-separated component of ``model_path`` is split on ``_``;
    among the resulting tokens the first ``len<int>`` gives the sequence
    length, the first ``h<int>`` the hidden size and the first ``a<NAME>`` the
    architecture, where ``NAME`` must be a key of the module-level ``encoders``
    registry.

    Args:
        model_path: path to a checkpoint readable by ``torch.load`` that holds
            the keys ``'state_dict'`` and ``'epoch'``.
        user_states: object exposing ``cate2id_dict['qu_content2id']`` (a
            mapping whose values are integer ids) plus ``cate_cols`` and
            ``cont_cols`` (sized collections).
        n_rows_per_step: number of input rows merged into one sequence step.

    Returns:
        The instantiated model with the checkpoint weights loaded (strictly).
        The model is returned in whatever mode its constructor leaves it in;
        ``eval()`` is not called here.

    Raises:
        IndexError: if one of the ``len``/``h``/``a`` tokens is missing.
        ValueError: if the ``len`` or ``h`` token is not followed by an integer.
        KeyError: if the architecture name is not in ``encoders``.
    """
    # Predefined configuration values.
    class CFG:
        learning_rate=1.0e-4 # learning rate
        batch_size=64 # batch size
        num_workers=4 # number of workers
        print_freq=50 # how often results are printed
        valid_freq=1
        start_epoch=0 # starting epoch
        #num_pretrain_epochs=10
        num_train_epochs=10 # number of epochs to train
        warmup_steps=10 # number of steps over which the lr is gradually increased
        max_grad_norm=10 # used for gradient clipping
        weight_decay=0.01
        dropout=0.0 # dropout probability
        emb_size=200
        hidden_size=512 # hidden size
        nlayers=2
        nheads=8
        seq_len=100
        cate_vocab_size=0
        cate_col_size=0
        cont_col_size=0
        target_size=1
        n_rows_per_step=2

    CFG.n_rows_per_step = n_rows_per_step
    # Ids are used directly as embedding indices, so the table needs max id + 1 rows.
    CFG.cate_vocab_size = max(user_states.cate2id_dict['qu_content2id'].values()) + 1
    print(CFG.cate_vocab_size)
    CFG.cate_col_size = len(user_states.cate_cols)
    CFG.cont_col_size = len(user_states.cont_cols)

    # Create the category classifier model from the settings in the file name.
    tokens = model_path.split('/')[-1].split('_')
    seq_len = int(_first_token_suffix(tokens, 'len'))
    hidden_size = int(_first_token_suffix(tokens, 'h'))
    architecture = _first_token_suffix(tokens, 'a')
    cfg = CFG()
    cfg.hidden_size = hidden_size
    cfg.seq_len = seq_len
    cfg.encoder = architecture
    model = encoders[architecture](cfg)

    # NOTE(security): torch.load unpickles the file; only load checkpoints from
    # trusted sources.
    checkpoint = torch.load(model_path, map_location='cpu')
    state_dict = checkpoint['state_dict']

    model.load_state_dict(state_dict, strict=True)
    print("=> loaded checkpoint '{}' (epoch {})"
          .format(model_path, checkpoint['epoch']))

    return model

# Registry mapping the architecture name parsed from a checkpoint file name
# to the model class that load_model() instantiates.
encoders = {
    'LSTM': LSTMModel,
    'LSTMATTN': LSTMATTNModel,
    'BERT': DSB_BertModel,
}