# T-Fixup
> 7강 T-Fixup을 베이스라인에 구현해보는 퀴즈의 정답 코드입니다! 아래 베이스라인은 그대로 실행시켜주시고 `T-Fixup` 파트에서 시작해주시면 됩니다!

## 베이스라인
> 처음에 주어지는 노트북 베이스라인 코드입니다!

In [1]:
!pip install easydict



In [2]:
import pandas as pd
import os
import re

import torch
import easydict
import numpy as np
from sklearn.preprocessing import LabelEncoder
import time
import datetime
from datetime import datetime
import random
import wandb

### 1. 데이터 로드 및 전처리 컴포넌트

In [3]:
import os
from datetime import datetime
import time
import tqdm
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch

class Preprocess:
    """Load the interaction CSV files and convert them to per-user sequences.

    The categorical columns are label-encoded; the fitted classes are stored
    as ``<column>_classes.npy`` files under ``args.asset_dir`` while processing
    the training file and are read back when processing the test file.
    """

    def __init__(self,args):
        self.args = args
        self.train_data = None
        self.test_data = None

    def get_train_data(self):
        """Return the sequences produced by ``load_train_data`` (or None)."""
        return self.train_data

    def get_test_data(self):
        """Return the sequences produced by ``load_test_data`` (or None)."""
        return self.test_data

    def split_data(self, data, ratio=0.7, shuffle=True, seed=0):
        """
        split data into two parts with a given ratio.

        When ``shuffle`` is true, ``data`` is shuffled IN PLACE after the
        ``random`` module has been re-seeded with ``seed``.
        """
        if shuffle:
            random.seed(seed) # fix to default seed 0
            random.shuffle(data)

        size = int(len(data) * ratio)
        return data[:size], data[size:]

    def __save_labels(self, encoder, name):
        """Persist the classes of a fitted encoder as ``<name>_classes.npy``."""
        le_path = os.path.join(self.args.asset_dir, name + '_classes.npy')
        np.save(le_path, encoder.classes_)

    def __preprocessing(self, df, is_train = True):
        """Label-encode the categorical columns and convert ``Timestamp``.

        Args:
            df: data frame holding the categorical columns and ``Timestamp``.
            is_train: fit and save the encoders when true; otherwise load the
                saved classes and map unseen values to ``'unknown'``.

        Returns:
            The same data frame, modified in place.
        """
        cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.args.asset_dir, exist_ok=True)

        for col in cate_cols:
            le = LabelEncoder()

            # Every column is treated as categorical. The cast has to happen
            # BEFORE the membership test below: the saved classes are strings,
            # so comparing raw integer values (e.g. KnowledgeTag) against them
            # would never match and every value would become 'unknown'.
            df[col] = df[col].astype(str)

            if is_train:
                # Reserve an extra class for values unseen during training.
                le.fit(df[col].unique().tolist() + ['unknown'])
                self.__save_labels(le, col)
            else:
                label_path = os.path.join(self.args.asset_dir,col+'_classes.npy')
                le.classes_ = np.load(label_path)

                # Vectorised replacement of values the encoder does not know.
                df[col] = df[col].where(df[col].isin(le.classes_), 'unknown')

            df[col] = le.transform(df[col])

        def convert_time(s):
            # NOTE: time.mktime interprets the parsed time in the local zone.
            timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())
            return int(timestamp)

        df['Timestamp'] = df['Timestamp'].apply(convert_time)

        return df

    def __feature_engineering(self, df):
        """Hook for additional features; currently returns ``df`` unchanged."""
        #TODO
        return df

    def load_data_from_file(self, file_name, is_train=True):
        """Read ``file_name`` from ``args.data_dir`` and build user sequences.

        Side effect: sets ``args.n_questions``, ``args.n_test`` and
        ``args.n_tag`` from the saved class files; the model uses them to
        size its embedding layers.

        Returns:
            An array with one entry per user; each entry is a tuple
            ``(testId, assessmentItemID, KnowledgeTag, answerCode)`` of value
            arrays ordered by ``Timestamp``.
        """
        csv_file_path = os.path.join(self.args.data_dir, file_name)
        df = pd.read_csv(csv_file_path)#, nrows=100000)
        df = self.__feature_engineering(df)
        df = self.__preprocessing(df, is_train)

        # Number of classes per categorical feature (embedding input sizes).
        self.args.n_questions = len(np.load(os.path.join(self.args.asset_dir,'assessmentItemID_classes.npy')))
        self.args.n_test = len(np.load(os.path.join(self.args.asset_dir,'testId_classes.npy')))
        self.args.n_tag = len(np.load(os.path.join(self.args.asset_dir,'KnowledgeTag_classes.npy')))

        df = df.sort_values(by=['userID','Timestamp'], axis=0)
        columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag']
        group = df[columns].groupby('userID').apply(
                lambda r: (
                    r['testId'].values, 
                    r['assessmentItemID'].values,
                    r['KnowledgeTag'].values,
                    r['answerCode'].values
                )
            )

        return group.values

    def load_train_data(self, file_name):
        """Load the training file, fitting and saving the label encoders."""
        self.train_data = self.load_data_from_file(file_name)

    def load_test_data(self, file_name):
        """Load the test file, reusing the label classes saved during training."""
        self.test_data = self.load_data_from_file(file_name, is_train= False)




### 2. 데이터 셋 / 데이터 로더

In [4]:
class DKTDataset(torch.utils.data.Dataset):
    """Dataset over per-user sequences of (test, question, tag, correct).

    Every item is a list of five tensors: the four feature sequences followed
    by a mask of length ``args.max_seq_len``.
    """

    def __init__(self, data, args):
        self.data = data
        self.args = args

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        limit = self.args.max_seq_len

        # Length of this user's sequence.
        n_steps = len(sample[0])

        # Order: test, question, tag, correct.
        features = [sample[position] for position in range(4)]

        if n_steps > limit:
            # Too long: keep only the most recent ``limit`` steps.
            features = [seq[-limit:] for seq in features]
            mask = np.ones(limit, dtype=np.int16)
        else:
            # Short enough: keep as is and flag the trailing real positions.
            mask = np.zeros(limit, dtype=np.int16)
            mask[-n_steps:] = 1

        # The mask travels with the feature columns as the last element.
        features.append(mask)

        # np.array -> torch.tensor
        return [torch.tensor(seq) for seq in features]




def collate(batch):
    """Left-pad every column of every sample with zeros and stack per column.

    The target length is the length of the last column (the mask) of the
    first sample. Returns a tuple with one stacked tensor per column.
    """
    n_columns = len(batch[0])
    padded_len = len(batch[0][-1])
    grouped = [[] for _ in range(n_columns)]

    # Regroup the batch column by column, padding on the left.
    for sample in batch:
        for position, seq in enumerate(sample):
            buffer = torch.zeros(padded_len)
            buffer[-len(seq):] = seq
            grouped[position].append(buffer)

    return tuple(torch.stack(column) for column in grouped)


def get_loaders(args, train, valid):
    """Build the train / validation data loaders.

    A loader is ``None`` when its data argument is ``None``. Only the training
    loader shuffles.
    """

    def make_loader(sequences, shuffle):
        if sequences is None:
            return None
        return torch.utils.data.DataLoader(
            DKTDataset(sequences, args),
            num_workers=args.num_workers,
            shuffle=shuffle,
            batch_size=args.batch_size,
            pin_memory=False,
            collate_fn=collate,
        )

    return make_loader(train, True), make_loader(valid, False)

### 3. BERT 기반의 모델

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np
import copy
import math

try:
    from transformers.modeling_bert import BertConfig, BertEncoder, BertModel    
except:
    from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel    
class Bert(nn.Module):
    """BERT encoder over the concatenated feature embeddings.

    ``forward`` returns one sigmoid output per sequence position.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        self.device = args.device

        # Model sizes
        self.hidden_dim = args.hidden_dim
        self.n_layers = args.n_layers

        # Each of the four features gets a third of the hidden size.
        part_dim = self.hidden_dim // 3

        # interaction currently encodes correctness: correct(1, 2) + padding(0)
        self.embedding_interaction = nn.Embedding(3, part_dim)
        self.embedding_test = nn.Embedding(args.n_test + 1, part_dim)
        self.embedding_question = nn.Embedding(args.n_questions + 1, part_dim)
        self.embedding_tag = nn.Embedding(args.n_tag + 1, part_dim)

        # Projects the concatenated embeddings to the hidden size.
        self.comb_proj = nn.Linear(part_dim * 4, self.hidden_dim)

        self.config = BertConfig(
            3,  # not used
            hidden_size=self.hidden_dim,
            num_hidden_layers=args.n_layers,
            num_attention_heads=args.n_heads,
            max_position_embeddings=args.max_seq_len,
        )
        self.encoder = BertModel(self.config)

        # Output head
        self.fc = nn.Linear(args.hidden_dim, 1)
        self.activation = nn.Sigmoid()

    def forward(self, input):
        test, question, tag, _, mask, interaction, _ = input
        batch_size = interaction.size(0)

        pieces = [
            self.embedding_interaction(interaction),
            self.embedding_test(test),
            self.embedding_question(question),
            self.embedding_tag(tag),
        ]
        X = self.comb_proj(torch.cat(pieces, 2))

        # Element 0 of the encoder output is the per-position hidden state.
        hidden = self.encoder(inputs_embeds=X, attention_mask=mask)[0]
        hidden = hidden.contiguous().view(batch_size, -1, self.hidden_dim)

        return self.activation(self.fc(hidden)).view(batch_size, -1)


### 4. 모델 훈련을 위한 함수들

In [6]:
import os, sys

import numpy as np

import tarfile
import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam, AdamW

from torch.optim.lr_scheduler import ReduceLROnPlateau

from transformers import get_linear_schedule_with_warmup
from transformers import get_cosine_schedule_with_warmup

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import scipy.stats


# 훈련을 하기 위한 세팅
def get_optimizer(model, args):
    """Create the optimizer named by ``args.optimizer``.

    Args:
        model: module whose parameters are optimised.
        args: needs ``optimizer`` ('adam' or 'adamW') and ``lr``.

    Returns:
        The optimizer, with gradients already cleared.

    Raises:
        ValueError: if ``args.optimizer`` is not a supported name (this used
            to surface as an UnboundLocalError further down).
    """
    if args.optimizer == 'adam':
        optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01)
    elif args.optimizer == 'adamW':
        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)
    else:
        raise ValueError(f"Unsupported optimizer: {args.optimizer!r}")

    # Start from cleared gradients.
    optimizer.zero_grad()

    return optimizer

def get_scheduler(optimizer, args):
    """Create the learning-rate scheduler named by ``args.scheduler``.

    Args:
        optimizer: optimizer to schedule.
        args: needs ``scheduler`` ('plateau' or 'linear_warmup'); the warmup
            variant also reads ``warmup_steps`` and ``total_steps``.

    Raises:
        ValueError: if ``args.scheduler`` is not a supported name (this used
            to surface as an UnboundLocalError on return).
    """
    if args.scheduler == 'plateau':
        # mode='max': the monitored quantity is expected to increase.
        scheduler = ReduceLROnPlateau(optimizer, patience=10, factor=0.5, mode='max', verbose=True)
    elif args.scheduler == 'linear_warmup':
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=args.warmup_steps,
                                                    num_training_steps=args.total_steps)
    else:
        raise ValueError(f"Unsupported scheduler: {args.scheduler!r}")
    return scheduler

def get_criterion(pred, target):
    """Return the element-wise (unreduced) binary cross-entropy."""
    return nn.BCELoss(reduction="none")(pred, target)

def get_metric(targets, preds):
    """Return ``(auc, acc)``; accuracy thresholds the predictions at 0.5."""
    return (
        roc_auc_score(targets, preds),
        accuracy_score(targets, np.where(preds >= 0.5, 1, 0)),
    )

def get_model(args):
    """
    Load model and move tensors to a given devices.

    Raises:
        ValueError: if ``args.model`` is not a supported name (this used to
            surface as an UnboundLocalError).
    """
    if args.model == 'bert':
        model = Bert(args)
    else:
        raise ValueError(f"Unsupported model: {args.model!r}")

    model.to(args.device)

    return model


# 배치 전처리
def process_batch(batch, args):
    """Turn a collated batch into the 7-tuple of model inputs.

    Returns ``(test, question, tag, correct, mask, interaction, gather_index)``
    with every tensor moved to ``args.device``.
    """
    test, question, tag, correct, mask = batch

    # change to float
    mask = mask.type(torch.FloatTensor)
    correct = correct.type(torch.FloatTensor)

    def shift_and_mask(ids):
        # Ids are shifted by one so that 0 is free for padding positions.
        return ((ids + 1) * mask).to(torch.int64)

    # interaction is, for now, ``correct`` moved one step to the right
    # (for saint this would be the decoder input). One is added first so
    # that 0 can serve as the padding value.
    interaction = (correct + 1).roll(shifts=1, dims=1)
    interaction[:, 0] = 0  # set padding index to the first sequence
    interaction = (interaction * mask).to(torch.int64)

    # test_id, question_id, tag
    test, question, tag = (shift_and_mask(ids) for ids in (test, question, tag))

    # Index derived from the number of non-zero mask entries per row, minus 1.
    gather_index = torch.tensor(np.count_nonzero(mask, axis=1)).view(-1, 1) - 1

    # Move everything to the target device.
    outputs = (test, question, tag, correct, mask, interaction, gather_index)
    return tuple(tensor.to(args.device) for tensor in outputs)


# loss계산하고 parameter update!
def compute_loss(preds, targets):
    """
    Mean loss over the batch, using only the last sequence position.

    Args :
        preds   : (batch_size, max_seq_len)
        targets : (batch_size, max_seq_len)

    """
    per_position = get_criterion(preds, targets)
    # Only the last position of each sequence contributes to the loss.
    return torch.mean(per_position[:, -1])

def update_params(loss, model, optimizer, args):
    """Backpropagate ``loss``, clip the gradient norm and take one step.

    Gradients are cleared afterwards so the next step starts clean.
    """
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad)
    optimizer.step()
    optimizer.zero_grad()



def save_checkpoint(state, model_dir, model_filename):
    """Save ``state`` with ``torch.save`` to ``model_dir/model_filename``.

    The directory is created when missing.
    """
    print('saving model ...')
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(model_dir, exist_ok=True)
    torch.save(state, os.path.join(model_dir, model_filename))



def load_model(args):
    """Rebuild the model and restore the weights saved by ``save_checkpoint``.

    The checkpoint is read from ``args.model_dir/args.model_name`` and must
    contain a ``'state_dict'`` entry.
    """
    model_path = os.path.join(args.model_dir, args.model_name)
    print("Loading Model from:", model_path)
    # map_location lets a checkpoint written on a GPU be restored on a host
    # where args.device is 'cpu' (and vice versa).
    load_state = torch.load(model_path, map_location=args.device)
    model = get_model(args)

    # 1. load model state
    model.load_state_dict(load_state['state_dict'], strict=True)

    print("Loading Model from:", model_path, "...Finished.")
    return model



### 5. 전체 프로세스를 담당하는 함수들

In [7]:

def run(args, train_data, valid_data):
    """Train with per-epoch validation, checkpointing and early stopping.

    The model is saved as ``model.pt`` in ``args.model_dir`` whenever the
    validation AUC improves; training stops after ``args.patience`` epochs in
    a row without improvement. Metrics are logged to wandb every epoch.
    """
    train_loader, valid_loader = get_loaders(args, train_data, valid_data)

    # only when using warmup scheduler
    batches_per_epoch = int(len(train_loader.dataset) / args.batch_size)
    args.total_steps = batches_per_epoch * (args.n_epochs)
    args.warmup_steps = args.total_steps // 10

    model = get_model(args)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)

    best_auc = -1
    stale_epochs = 0
    for epoch in range(args.n_epochs):

        print(f"Start Training: Epoch {epoch + 1}")

        train_auc, train_acc, train_loss = train(train_loader, model, optimizer, args)
        auc, acc, _, _ = validate(valid_loader, model, args)

        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_auc": train_auc,
            "train_acc": train_acc,
            "valid_auc": auc,
            "valid_acc": acc,
        })

        if auc > best_auc:
            best_auc = auc
            # Unwrap the underlying model when wrapped in torch.nn.DataParallel.
            unwrapped = getattr(model, 'module', model)
            checkpoint = {
                'epoch': epoch + 1,
                'state_dict': unwrapped.state_dict(),
            }
            save_checkpoint(checkpoint, args.model_dir, 'model.pt')
            stale_epochs = 0
        else:
            stale_epochs += 1
            if stale_epochs >= args.patience:
                print(f'EarlyStopping counter: {stale_epochs} out of {args.patience}')
                break

        # The plateau scheduler is stepped with the best AUC so far.
        if args.scheduler == 'plateau':
            scheduler.step(best_auc)
        else:
            scheduler.step()


def train(train_loader, model, optimizer, args):
    """Run one training epoch.

    Returns:
        ``(auc, acc, loss_avg)`` where the metrics are computed on the last
        position of every sequence and ``loss_avg`` is the mean batch loss
        (a 0-dim tensor).
    """
    model.train()

    total_preds = []
    total_targets = []
    losses = []
    for step, batch in enumerate(train_loader):
        inputs = process_batch(batch, args)
        preds = model(inputs)
        targets = inputs[3] # correct

        loss = compute_loss(preds, targets)
        update_params(loss, model, optimizer, args)

        if step % args.log_steps == 0:
            print(f"Training steps: {step} Loss: {str(loss.item())}")

        # .cpu() is a no-op on CPU tensors, so this works for any device
        # string ('cpu', 'cuda', 'cuda:0', ...) without comparing names.
        total_preds.append(preds[:, -1].detach().cpu().numpy())
        total_targets.append(targets[:, -1].detach().cpu().numpy())
        # Detach so the stored losses keep no reference to the autograd graph.
        losses.append(loss.detach())

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Train AUC / ACC
    auc, acc = get_metric(total_targets, total_preds)
    loss_avg = sum(losses)/len(losses)
    print(f'TRAIN AUC : {auc} ACC : {acc}')
    return auc, acc, loss_avg
    

def validate(valid_loader, model, args):
    """Evaluate ``model`` on the validation loader.

    Returns:
        ``(auc, acc, total_preds, total_targets)``; predictions and targets
        are numpy arrays holding the last position of every sequence.
    """
    model.eval()

    total_preds = []
    total_targets = []
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time without changing the outputs.
    with torch.no_grad():
        for batch in valid_loader:
            inputs = process_batch(batch, args)

            preds = model(inputs)
            targets = inputs[3] # correct

            # .cpu() is a no-op on CPU tensors, so any device string works.
            total_preds.append(preds[:, -1].detach().cpu().numpy())
            total_targets.append(targets[:, -1].detach().cpu().numpy())

    total_preds = np.concatenate(total_preds)
    total_targets = np.concatenate(total_targets)

    # Validation AUC / ACC
    auc, acc = get_metric(total_targets, total_preds)

    print(f'VALID AUC : {auc} ACC : {acc}\n')

    return auc, acc, total_preds, total_targets



def inference(args, test_data):
    """Predict the last position of every test sequence and write a CSV.

    The saved model is loaded via ``load_model`` and the predictions are
    written to ``args.output_dir/output.csv`` with an ``id,prediction`` header.
    """
    model = load_model(args)
    model.eval()
    _, test_loader = get_loaders(args, None, test_data)

    total_preds = []

    # Inference needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for batch in test_loader:
            inputs = process_batch(batch, args)

            preds = model(inputs)[:, -1]

            # .cpu() is a no-op on CPU tensors, so any device string works.
            total_preds.extend(preds.detach().cpu().numpy())

    write_path = os.path.join(args.output_dir, "output.csv")
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.output_dir, exist_ok=True)
    with open(write_path, 'w', encoding='utf8') as w:
        print("writing prediction : {}".format(write_path))
        w.write("id,prediction\n")
        for row_id, p in enumerate(total_preds):
            w.write('{},{}\n'.format(row_id, p))




### 6.실행부분

In [8]:
data_dir = '/opt/ml/input/data/train_dataset'
file_name = 'train_data.csv'
test_file_name = 'test_data.csv'

config = {
    # General settings
    'seed': 42,
    'device': "cuda" if torch.cuda.is_available() else "cpu",
    'data_dir': data_dir,
    'asset_dir': 'asset',
    'model_dir': 'models',
    'model_name': 'model.pt',
    'output_dir': 'output',

    # Data
    'max_seq_len': 20,
    'num_workers': 1,

    # Model
    'hidden_dim': 64,
    'n_layers': 2,
    'n_heads': 4,
    'dropout': 0.2,

    # Training
    'n_epochs': 20,
    'batch_size': 64,
    'lr': 0.0001,
    'clip_grad': 10,
    'log_steps': 50,
    'patience': 5,

    ### Important: these select the model, optimizer and scheduler ###
    'model': 'bert',
    'optimizer': 'adam',
    'scheduler': 'plateau',
}

# Attribute-style access (args.lr, args.device, ...) on top of the dict.
args = easydict.EasyDict(config)

In [9]:
def setSeeds(seed = 42):
    """Seed Python, NumPy and PyTorch so repeated runs give the same results."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

In [10]:
# Fix the seeds before anything random happens.
setSeeds(42)

# Read and encode the training file.
preprocess = Preprocess(args)
preprocess.load_train_data(file_name)

# Split the user sequences into a training and a validation part.
train_data, valid_data = preprocess.split_data(preprocess.get_train_data())


## T-Fixup

### T-Fixup 코드
> 조금 더 자세한 설명은 7강 가중치 초기화 (Weight Initialization)을 참조하세요!

```python
class YourModel:
    def __init__(self):
        # T-Fixup: applied only when the Tfixup flag is set on args
        if self.args.Tfixup:

            # Initialization
            self.tfixup_initialization()
            print("T-Fixup Initialization Done")

            # Scaling
            self.tfixup_scaling()
            print(f"T-Fixup Scaling Done")

    def tfixup_initialization(self):
        # Padding index is 0 for every embedding in this code base
        padding_idx = 0

        for name, param in self.named_parameters():
            if re.match(r'^embedding*', name):
                # Embeddings: N(0, d^-0.5) with d = embedding dimension,
                # and the padding row reset to zero
                nn.init.normal_(param, mean=0, std=param.shape[1] ** -0.5)
                nn.init.constant_(param[padding_idx], 0)
            elif re.match(r'.*Norm.*', name):
                # Normalization parameters keep their default initialization
                continue
            elif re.match(r'.*weight*', name):
                # Remaining weights: Xavier (normal variant)
                # nn.init.xavier_uniform_(param)
                nn.init.xavier_normal_(param)


    def tfixup_scaling(self):
        # Scaled tensors, keyed by parameter name
        temp_state_dict = {}

        # Scale the values of selected layers
        for name, param in self.named_parameters():

            # TODO: if the module names inside the model change, adjust the
            #       patterns below so the intended modules are still scaled
            # print(name)

            if re.match(r'^embedding*', name):
                # Embeddings: factor (9 * n_layers)^(-1/4)
                temp_state_dict[name] = (9 * self.args.n_layers) ** (-1 / 4) * param   
            elif re.match(r'.*Norm.*', name):
                continue
            elif re.match(r'encoder.*dense.*weight$|encoder.*attention.output.*weight$', name):
                # Encoder dense / attention-output weights: 0.67 * n_layers^(-1/4)
                temp_state_dict[name] = (0.67 * (self.args.n_layers) ** (-1 / 4)) * param
            elif re.match(r"encoder.*value.weight$", name):
                # Value projection: same factor with an extra sqrt(2)
                temp_state_dict[name] = (0.67 * (self.args.n_layers) ** (-1 / 4)) * (param * (2**0.5))

        # Every other entry keeps its current value
        for name in self.state_dict():
            if name not in temp_state_dict:
                temp_state_dict[name] = self.state_dict()[name]
                
        self.load_state_dict(temp_state_dict)
```

### T-Fixup Bert 베이스라인 모델에 적용해보기

In [11]:
class TfixupBert(nn.Module):
    """BERT encoder over the feature embeddings with optional T-Fixup.

    When ``args.Tfixup`` is true, the parameters are re-initialised and then
    rescaled right after construction. ``forward`` returns one sigmoid output
    per sequence position.
    """

    def __init__(self, args):
        super(TfixupBert, self).__init__()
        self.args = args
        self.device = args.device

        # Defining some parameters
        self.hidden_dim = self.args.hidden_dim
        self.n_layers = self.args.n_layers

        # Embedding 
        # interaction currently encodes correctness: correct(1, 2) + padding(0)
        self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)
        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)
        self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)
        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)

        # embedding combination projection
        self.comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim)

        # Bert config
        self.config = BertConfig( 
            3, # not used
            hidden_size=self.hidden_dim,
            num_hidden_layers=self.args.n_layers,
            num_attention_heads=self.args.n_heads,
            max_position_embeddings=self.args.max_seq_len          
        )

        # Defining the layers
        # Bert Layer
        self.encoder = BertModel(self.config)  

        # Fully connected layer
        self.fc = nn.Linear(self.args.hidden_dim, 1)
       
        self.activation = nn.Sigmoid()
        
        # T-Fixup
        if self.args.Tfixup:

            # Initialization
            self.tfixup_initialization()
            print("T-Fixupbb Initialization Done")

            # Scaling
            self.tfixup_scaling()
            print(f"T-Fixup Scaling Done")

    def tfixup_initialization(self):
        """Re-initialise embeddings and weights (normalization layers are skipped)."""
        # Padding index is 0 for every embedding in this code base
        padding_idx = 0

        for name, param in self.named_parameters():
            if re.match(r'^embedding*', name):
                # N(0, d^-0.5) with d = embedding dimension; padding row zeroed
                nn.init.normal_(param, mean=0, std=param.shape[1] ** -0.5)
                nn.init.constant_(param[padding_idx], 0)
            elif re.match(r'.*Norm.*', name):
                continue
            elif re.match(r'.*weight*', name):
                # nn.init.xavier_uniform_(param)
                nn.init.xavier_normal_(param)


    def tfixup_scaling(self):
        """Rescale embeddings and selected encoder weights by depth-based factors."""
        n_layers = self.args.n_layers
        embedding_scale = (9 * n_layers) ** (-1 / 4)
        encoder_scale = 0.67 * n_layers ** (-1 / 4)

        temp_state_dict = {}

        # The scaled copies are only loaded back as plain values, so there is
        # no reason to record these multiplications in the autograd graph.
        with torch.no_grad():
            for name, param in self.named_parameters():

                # TODO: if the module names inside the model change, adjust
                #       the patterns below so the intended modules are scaled

                if re.match(r'^embedding*', name):
                    temp_state_dict[name] = embedding_scale * param
                elif re.match(r'.*Norm.*', name):
                    continue
                elif re.match(r'encoder.*dense.*weight$|encoder.*attention.output.*weight$', name):
                    temp_state_dict[name] = encoder_scale * param
                elif re.match(r"encoder.*value.weight$", name):
                    temp_state_dict[name] = encoder_scale * (param * (2**0.5))

        # Every other entry keeps its current value. state_dict() is built
        # once here instead of once per key inside the loop.
        current_state = self.state_dict()
        for name, tensor in current_state.items():
            temp_state_dict.setdefault(name, tensor)

        self.load_state_dict(temp_state_dict)


    def forward(self, input):
        test, question, tag, _, mask, interaction, _ = input
        batch_size = interaction.size(0)

        # Embed every feature
        embed_interaction = self.embedding_interaction(interaction)
        embed_test = self.embedding_test(test)
        embed_question = self.embedding_question(question)
        embed_tag = self.embedding_tag(tag)

        embed = torch.cat([embed_interaction,
                           embed_test,
                           embed_question,
                           embed_tag,], 2)

        X = self.comb_proj(embed)

        # Bert
        encoded_layers = self.encoder(inputs_embeds=X, attention_mask=mask)
        out = encoded_layers[0]

        out = out.contiguous().view(batch_size, -1, self.hidden_dim)
        out = self.fc(out)
        preds = self.activation(out).view(batch_size, -1)

        return preds

In [12]:
def get_model(args):
    """
    Load model and move tensors to a given devices.

    Raises:
        ValueError: if ``args.model`` is not a supported name (this used to
            surface as an UnboundLocalError).
    """
    if args.model == 'bert':
        model = Bert(args)
    elif args.model == 'tfixup_bert':
        model = TfixupBert(args)
    else:
        raise ValueError(f"Unsupported model: {args.model!r}")

    model.to(args.device)

    return model

### Bert 훈련시키기 (20 Layers)

In [13]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mbomichoi[0m (use `wandb login --relogin` to force relogin)


True

In [14]:
wandb.init(project='dkt', config=config)

[34m[1mwandb[0m: wandb version 0.10.31 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [15]:
# Baseline for comparison: the plain Bert model with 20 encoder layers.
args.model = 'bert'
args.n_layers = 20

run(args, train_data, valid_data)

Start Training: Epoch 1
Training steps: 0 Loss: 0.7601020336151123
Training steps: 50 Loss: 0.6398704648017883
TRAIN AUC : 0.6701844367845223 ACC : 0.638438566552901
VALID AUC : 0.7082144405069719 ACC : 0.6587064676616915

saving model ...
Start Training: Epoch 2
Training steps: 0 Loss: 0.5946140289306641
Training steps: 50 Loss: 0.6746055483818054
TRAIN AUC : 0.711125920428988 ACC : 0.6565699658703071
VALID AUC : 0.7112642460632713 ACC : 0.6626865671641791

saving model ...
Start Training: Epoch 3
Training steps: 0 Loss: 0.6111980676651001
Training steps: 50 Loss: 0.6411390900611877
TRAIN AUC : 0.7149466636642852 ACC : 0.6604095563139932
VALID AUC : 0.7030173734777008 ACC : 0.6358208955223881

Start Training: Epoch 4
Training steps: 0 Loss: 0.6101197004318237
Training steps: 50 Loss: 0.5977492928504944
TRAIN AUC : 0.7167941311341801 ACC : 0.6599829351535836
VALID AUC : 0.7139570012129796 ACC : 0.6611940298507463

saving model ...
Start Training: Epoch 5
Training steps: 0 Loss: 0.58616

### T-Fixup Bert 훈련시키기 (20 Layers)

In [16]:
# Same depth as the baseline run, now with T-Fixup enabled.
args.model = 'tfixup_bert'
args.Tfixup = True
# The training loop reads ``n_epochs``; the previous ``args.epoch`` attribute
# was never consulted, so the intended 30 epochs were silently ignored.
args.n_epochs = 30
args.n_layers = 20

run(args, train_data, valid_data)

T-Fixupbb Initialization Done
T-Fixup Scaling Done
Start Training: Epoch 1
Training steps: 0 Loss: 0.9224938154220581
Training steps: 50 Loss: 0.6830382347106934
TRAIN AUC : 0.5391839052900566 ACC : 0.5305034129692833
VALID AUC : 0.686463128098776 ACC : 0.5208955223880597

saving model ...
Start Training: Epoch 2
Training steps: 0 Loss: 0.6878604292869568
Training steps: 50 Loss: 0.6104351878166199
TRAIN AUC : 0.6254563057761505 ACC : 0.5910836177474402
VALID AUC : 0.7150985707073864 ACC : 0.5323383084577115

saving model ...
Start Training: Epoch 3
Training steps: 0 Loss: 0.7068123817443848
Training steps: 50 Loss: 0.7003744840621948
TRAIN AUC : 0.683059183686493 ACC : 0.6348122866894198
VALID AUC : 0.7241597165813217 ACC : 0.6537313432835821

saving model ...
Start Training: Epoch 4
Training steps: 0 Loss: 0.7406575083732605
Training steps: 50 Loss: 0.6597524285316467
TRAIN AUC : 0.7071262194516035 ACC : 0.6569965870307167
VALID AUC : 0.7277401387140829 ACC : 0.6751243781094527

savi

## Transformer Encoder에 T-Fixup 적용해보기
> Hugging Face의 Bert를 사용하면 `Layer Norm`을 사용하지 않기가 어렵다! 직접 Bert 내부를 구현하면서 모델을 좀 더 custom하게 다뤄보자!

### T-Fixup 모델 생성
- Encoder Layer
- Fixup Encoder

#### ☘️ Encoder Layer

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np
import copy
import math

class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a two-layer feed-forward.

    Layer normalization is applied after each residual sum only when
    ``args.layer_norm`` is true. Both residual sums add the block input.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        self.device = args.device

        # Model sizes
        self.hidden_dim = args.hidden_dim
        self.n_layers = args.n_layers

        dim = self.hidden_dim

        # Input projections for attention
        self.query = nn.Linear(in_features=dim, out_features=dim)
        self.key = nn.Linear(in_features=dim, out_features=dim)
        self.value = nn.Linear(in_features=dim, out_features=dim)

        self.attn = nn.MultiheadAttention(embed_dim=dim, num_heads=args.n_heads)

        # Feed-forward network
        self.ffn1 = nn.Linear(in_features=dim, out_features=dim)
        self.ffn2 = nn.Linear(in_features=dim, out_features=dim)

        if args.layer_norm:
            self.ln1 = nn.LayerNorm(dim)
            self.ln2 = nn.LayerNorm(dim)

    def forward(self, embed, mask):
        normalize = self.args.layer_norm

        # The first two axes are swapped before attention and swapped back after.
        q, k, v = (
            projection(embed).permute(1, 0, 2)
            for projection in (self.query, self.key, self.value)
        )

        ## attention
        attended, _ = self.attn(q, k, v, attn_mask=mask)

        ## residual + layer norm
        hidden = embed + attended.permute(1, 0, 2)
        if normalize:
            hidden = self.ln1(hidden)

        ## feed forward network
        hidden = self.ffn2(F.relu(self.ffn1(hidden)))

        ## residual + layer norm (the residual is again the block input)
        hidden = embed + hidden
        if normalize:
            hidden = self.ln2(hidden)

        return hidden


#### ☘️ Fixup Encoder

In [18]:

class FixupEncoder(nn.Module):
    """Transformer encoder with optional T-Fixup initialization and scaling.

    Categorical inputs (interaction, test, question, tag) are embedded,
    concatenated and projected to ``hidden_dim``, passed through
    ``args.n_layers`` ``EncoderLayer`` blocks and finally mapped to one
    sigmoid output per sequence position.

    Args:
        args: configuration object providing ``device``, ``hidden_dim``,
            ``n_layers``, ``n_heads``, ``n_test``, ``n_questions``,
            ``n_tag``, ``max_seq_len``, ``Tfixup`` (and ``layer_norm`` for
            the encoder layers).
    """

    def __init__(self, args):
        super(FixupEncoder, self).__init__()
        self.args = args
        self.device = args.device

        # Defining some parameters
        self.hidden_dim = self.args.hidden_dim
        self.n_layers = self.args.n_layers

        # Embedding
        # "interaction" currently encodes correctness: correct (1, 2) + padding (0)
        self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)
        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)
        self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)
        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)
        # NOTE(review): this embedding is created but never used in forward().
        self.embedding_position = nn.Embedding(self.args.max_seq_len, self.hidden_dim)

        # embedding combination projection
        self.comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim)

        # Encoder
        self.encoders = nn.ModuleList([EncoderLayer(args) for _ in range(self.n_layers)])

        # Fully connected layer
        self.fc = nn.Linear(self.args.hidden_dim, 1)

        self.activation = nn.Sigmoid()

        # T-Fixup
        if self.args.Tfixup:

            # Initialization
            self.tfixup_initialization()
            print("T-Fixup Initialization Done")

            # Scaling
            self.tfixup_scaling()
            print("T-Fixup Scaling Done")

    def tfixup_initialization(self):
        """Re-initialize parameters following the T-Fixup recipe.

        Embedding tables are drawn from N(0, d^-1/2) with row 0 (padding)
        zeroed; LayerNorm/BatchNorm parameters are left untouched; every
        other weight matrix gets Xavier-normal initialization. Biases are
        not modified.
        """
        # Padding index is 0 for every embedding in this model.
        padding_idx = 0

        for name, param in self.named_parameters():
            if name.startswith('embedding'):
                nn.init.normal_(param, mean=0, std=param.shape[1] ** -0.5)
                nn.init.constant_(param[padding_idx], 0)
            elif re.match(r'.*ln.*|.*bn.*', name):
                continue
            elif re.match(r'.*weight', name):
                # NOTE (original author): using this initialization without
                # the scaling step degrades performance severely (AUC ~0.5).
                nn.init.xavier_normal_(param)

    def tfixup_scaling(self):
        """Scale selected parameters in place following T-Fixup.

        With N = ``args.n_layers``:
          * embedding tables are multiplied by (9N)^(-1/4);
          * encoder FFN weights and the attention output projection are
            multiplied by 0.67 * N^(-1/4);
          * the value projection additionally gets a sqrt(2) factor.
        All other parameters keep their values.
        """
        n_layers = self.args.n_layers
        embed_scale = (9 * n_layers) ** (-1 / 4)
        block_scale = 0.67 * (n_layers) ** (-1 / 4)

        # TODO: if sub-module names inside the model change, update the
        #       patterns below so the right parameters still get scaled.
        # Scaling in place under no_grad avoids rebuilding the state dict for
        # every parameter and keeps the scaling out of the autograd graph.
        with torch.no_grad():
            for name, param in self.named_parameters():
                if name.startswith('embedding'):
                    param.mul_(embed_scale)
                elif re.match(r'encoder.*ffn.*weight$|.*attn.out_proj.weight$', name):
                    param.mul_(block_scale)
                elif re.match(r".*value.weight$", name):
                    param.mul_(2 ** 0.5).mul_(block_scale)

    def mask_2d_to_3d(self, mask, batch_size, seq_len):
        """Expand a (batch, seq_len) validity mask into an additive attention mask.

        Args:
            mask: tensor with 1 at real positions and 0 at padding.
            batch_size: number of sequences in ``mask``.
            seq_len: sequence length of ``mask``.

        Returns:
            Tensor of shape (batch_size * n_heads, seq_len, seq_len) holding
            ``-inf`` in the columns of padded key positions and 0 elsewhere.
        """
        # Flip 0/1 so that padding positions carry a 1.
        padding = torch.ones_like(mask) - mask

        # Every query row and every head shares the same per-key padding row;
        # heads of one sample are laid out contiguously (batch-major).
        padding = padding.view(batch_size, 1, 1, seq_len)
        padding = padding.expand(batch_size, self.args.n_heads, seq_len, seq_len)
        padding = padding.reshape(batch_size * self.args.n_heads, seq_len, seq_len)

        return padding.masked_fill(padding == 1, float('-inf'))

    def forward(self, input):
        """Compute per-position predictions.

        Args:
            input: 7-tuple whose items 0, 1, 2, 4 and 5 are the test,
                question, tag, mask and interaction tensors; items 3 and 6
                are ignored here.

        Returns:
            Tensor of shape (batch, seq_len) with sigmoid outputs.
        """
        test, question, tag, _, mask, interaction, _ = input
        batch_size = interaction.size(0)
        seq_len = interaction.size(1)

        # Embed each categorical feature
        embed_interaction = self.embedding_interaction(interaction)
        embed_test = self.embedding_test(test)
        embed_question = self.embedding_question(question)
        embed_tag = self.embedding_tag(tag)

        embed = torch.cat([embed_interaction,
                           embed_test,
                           embed_question,
                           embed_tag,], 2)

        embed = self.comb_proj(embed)

        ### Encoder
        mask = self.mask_2d_to_3d(mask, batch_size, seq_len).to(self.device)
        for encoder in self.encoders:
            embed = encoder(embed, mask)

        ###################### DNN #####################
        out = embed.contiguous().view(batch_size, -1, self.hidden_dim)
        out = self.fc(out)

        preds = self.activation(out).view(batch_size, -1)
        return preds


In [19]:
def get_model(args):
    """
    Build the model selected by ``args.model`` and move it to ``args.device``.

    Supported names are 'bert', 'tfixup_bert' and 'tfixup'.

    Raises:
        ValueError: if ``args.model`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    if args.model == 'bert':
        model = Bert(args)
    elif args.model == 'tfixup_bert':
        model = TfixupBert(args)
    elif args.model == 'tfixup':
        model = FixupEncoder(args)
    else:
        raise ValueError(
            f"Unknown model '{args.model}'; expected one of "
            "'bert', 'tfixup_bert', 'tfixup'"
        )

    model.to(args.device)

    return model

In [20]:
# Select the custom FixupEncoder in get_model() and build it with a single
# encoder layer for the experiments below.
args.model = 'tfixup'
args.n_layers = 1

### T-Fixup 모델 사용 (1 Layer)

#### ☘️ Vanilla Encoder
> 우리가 기본 세팅으로 사용하는 Transformer 인코더다. 결과 비교를 위해 사용하자.

In [21]:
# Baseline: no T-Fixup initialization/scaling, LayerNorm enabled.
args.Tfixup = False
args.layer_norm = True

# NOTE(review): setSeeds and run are defined outside this section; presumably
# they reseed the RNGs and train/evaluate the model — confirm.
setSeeds(42)
report = run(args, train_data, valid_data)

Start Training: Epoch 1
Training steps: 0 Loss: 0.6966268420219421
Training steps: 50 Loss: 0.6849297285079956
TRAIN AUC : 0.5990284499962166 ACC : 0.574018771331058
VALID AUC : 0.6465815894892294 ACC : 0.6213930348258706

saving model ...
Start Training: Epoch 2
Training steps: 0 Loss: 0.6233084201812744
Training steps: 50 Loss: 0.6410101652145386
TRAIN AUC : 0.6582421299162097 ACC : 0.6224402730375427
VALID AUC : 0.6625833985446228 ACC : 0.6288557213930348

saving model ...
Start Training: Epoch 3
Training steps: 0 Loss: 0.6255315542221069
Training steps: 50 Loss: 0.6491870284080505
TRAIN AUC : 0.6742461302553452 ACC : 0.6294795221843004
VALID AUC : 0.6723725305253302 ACC : 0.6388059701492538

saving model ...
Start Training: Epoch 4
Training steps: 0 Loss: 0.6515625715255737
Training steps: 50 Loss: 0.6601109504699707
TRAIN AUC : 0.6851740393670567 ACC : 0.6427047781569966
VALID AUC : 0.6787359622161325 ACC : 0.6442786069651741

saving model ...
Start Training: Epoch 5
Training step

#### ☘️ Vanilla Encoder without LayerNorm
> T-Fixup의 장점 중 하나는 가중치 초기화와 스케일링만으로 학습이 안정화되어 Layer Norm의 도움이 필요 없다는 것이다. 그렇다면 T-Fixup 없이 Vanilla Transformer Encoder에서 Layer Norm만 제거하면 어떤 현상이 일어나는지 살펴보자.

In [22]:
# Ablation: no T-Fixup initialization/scaling and LayerNorm disabled.
args.Tfixup = False
args.layer_norm = False

# NOTE(review): setSeeds and run are defined outside this section; presumably
# they reseed the RNGs and train/evaluate the model — confirm.
setSeeds(42)
report = run(args, train_data, valid_data)

Start Training: Epoch 1
Training steps: 0 Loss: 0.6926538944244385
Training steps: 50 Loss: 0.6850431561470032
TRAIN AUC : 0.5822891092863844 ACC : 0.5601535836177475
VALID AUC : 0.6392561053140011 ACC : 0.6154228855721393

saving model ...
Start Training: Epoch 2
Training steps: 0 Loss: 0.6339619159698486
Training steps: 50 Loss: 0.6439831256866455
TRAIN AUC : 0.6534924109336526 ACC : 0.6243600682593856
VALID AUC : 0.6573228558875134 ACC : 0.6373134328358209

saving model ...
Start Training: Epoch 3
Training steps: 0 Loss: 0.633604884147644
Training steps: 50 Loss: 0.6509594917297363
TRAIN AUC : 0.6642997081794537 ACC : 0.6350255972696246
VALID AUC : 0.6637155458755223 ACC : 0.6442786069651741

saving model ...
Start Training: Epoch 4
Training steps: 0 Loss: 0.649529218673706
Training steps: 50 Loss: 0.6621315479278564
TRAIN AUC : 0.6721720255226743 ACC : 0.6397184300341296
VALID AUC : 0.6687980592326788 ACC : 0.6417910447761194

saving model ...
Start Training: Epoch 5
Training steps

#### ☘️ T-Fixup Encoder
> T-Fixup을 적용할 경우 Layer Norm없이 사용할 수 있다고 하지만, 과연 Layer Norm이 있다면 어떤 현상이 일어날까? 직접 알아보자!

In [23]:
# T-Fixup initialization/scaling enabled while LayerNorm is kept on.
args.Tfixup = True
args.layer_norm = True

# NOTE(review): setSeeds and run are defined outside this section; presumably
# they reseed the RNGs and train/evaluate the model — confirm.
setSeeds(42)
report = run(args, train_data, valid_data)

T-Fixup Initialization Done
T-Fixup Scaling Done
Start Training: Epoch 1
Training steps: 0 Loss: 0.9537209272384644
Training steps: 50 Loss: 0.7271689176559448
TRAIN AUC : 0.631950748787272 ACC : 0.6043088737201365
VALID AUC : 0.6587619673874126 ACC : 0.6094527363184079

saving model ...
Start Training: Epoch 2
Training steps: 0 Loss: 0.589790940284729
Training steps: 50 Loss: 0.5833766460418701
TRAIN AUC : 0.6937965570463129 ACC : 0.6448378839590444
VALID AUC : 0.6874827053709307 ACC : 0.6338308457711442

saving model ...
Start Training: Epoch 3
Training steps: 0 Loss: 0.6588139533996582
Training steps: 50 Loss: 0.6599853038787842
TRAIN AUC : 0.72067714035921 ACC : 0.6702218430034129
VALID AUC : 0.7025541997558172 ACC : 0.654228855721393

saving model ...
Start Training: Epoch 4
Training steps: 0 Loss: 0.5958243608474731
Training steps: 50 Loss: 0.6717612147331238
TRAIN AUC : 0.7388183319096331 ACC : 0.6832337883959044
VALID AUC : 0.7114130170660176 ACC : 0.6577114427860696

saving mo

#### ☘️ T-Fixup Encoder without LayerNorm
> 우리가 보고자 하던 진짜 T-Fixup 세팅이다. 어떤 효과가 있는지 확인해보자!

In [24]:
# Target setting: T-Fixup initialization/scaling enabled, LayerNorm disabled.
args.Tfixup = True
args.layer_norm = False

# NOTE(review): setSeeds and run are defined outside this section; presumably
# they reseed the RNGs and train/evaluate the model — confirm.
setSeeds(42)
report = run(args, train_data, valid_data)

T-Fixup Initialization Done
T-Fixup Scaling Done
Start Training: Epoch 1
Training steps: 0 Loss: 0.7209408283233643
Training steps: 50 Loss: 0.6976499557495117
TRAIN AUC : 0.5798749574941175 ACC : 0.552901023890785
VALID AUC : 0.6350072054755663 ACC : 0.6034825870646766

saving model ...
Start Training: Epoch 2
Training steps: 0 Loss: 0.6573376059532166
Training steps: 50 Loss: 0.6300456523895264
TRAIN AUC : 0.6458313238952801 ACC : 0.6316126279863481
VALID AUC : 0.6596025235529293 ACC : 0.6457711442786069

saving model ...
Start Training: Epoch 3
Training steps: 0 Loss: 0.641427755355835
Training steps: 50 Loss: 0.6824603080749512
TRAIN AUC : 0.6639887428925151 ACC : 0.6392918088737202
VALID AUC : 0.6720432507059184 ACC : 0.6472636815920398

saving model ...
Start Training: Epoch 4
Training steps: 0 Loss: 0.677515983581543
Training steps: 50 Loss: 0.6937620639801025
TRAIN AUC : 0.6810587406224957 ACC : 0.6405716723549488
VALID AUC : 0.6829491570139081 ACC : 0.6482587064676617

saving 