### Set Env

In [None]:
! pip install ipywidgets
! pip install transformers
! pip install wandb
! pip install adamp
! mkdir custom_data

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import os
import time
import math
import glob
import pickle
import random
import argparse
from pathlib import Path
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import KFold, StratifiedKFold
from torch.autograd import Variable
from torch.optim.lr_scheduler import _LRScheduler
from adamp import AdamP
import transformers
import wandb
import warnings

# Ignore Warnings
warnings.filterwarnings(action='ignore')

# Set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'device : {device}')
print(torch.cuda.get_device_properties(device))

# Set ROOT_PATH
ROOT_PATH = os.getcwd()
print(f'ROOT_PATH : {ROOT_PATH}')
    
# Set wandb
wandb.login()
CFG = wandb.config
%env WANDB_PROJECT = P2
%env WANDB_LOG_MODEL = true
%env WANDB_SILENT = true

device : cuda:0
_CudaDeviceProperties(name='Tesla P100-PCIE-16GB', major=6, minor=0, total_memory=16280MB, multi_processor_count=56)
ROOT_PATH : /content


[34m[1mwandb[0m: Currently logged in as: [33mhkl[0m (use `wandb login --relogin` to force relogin)


env: WANDB_PROJECT=P2
env: WANDB_LOG_MODEL=true
env: WANDB_SILENT=true


In [None]:
# Set Experiment
# Run identity: used as the wandb run name/tags and as the checkpoint sub-directory
# under custom_data/.
CFG.name = 'Baseline'
CFG.tag = ['Baseline']
# Cross-validation: number of stratified folds, and which of those folds to train.
CFG.NUM_FOLD = 5
CFG.FOLD = range(CFG.NUM_FOLD)   # for a quick smoke test, set this to [0]

# Backbone checkpoint and optimisation hyper-parameters.
CFG.MODEL_NAME = "xlm-roberta-large"
CFG.lr = 3e-5
CFG.batch_size = 50
CFG.epochs = 10
CFG.classifier_dropout = 0.1   # dropout rate passed to the classification head
CFG.tokenizer_max_length = 150   # every example is padded/truncated to this many tokens
CFG.weight_decay = 0.00
CFG.seed = 42   # seeds the RNGs and the stratified fold split
CFG.random_masking_rate = 0.1   # per-token chance of replacing a non-protected token with <mask> in training

In [None]:
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    :param seed: integer seed shared by every random number generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every seeding function takes the same single integer argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic kernels, no auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs with the experiment seed before any data or model object is created.
seed_everything(CFG.seed)

In [None]:
# Set Directory
# Create custom_data/<experiment name> under ROOT_PATH for checkpoints and
# predictions. makedirs(exist_ok=True) also creates a missing `custom_data`
# parent, is safe to re-run, and never changes the working directory.
os.makedirs(os.path.join(ROOT_PATH, 'custom_data', CFG.name), exist_ok=True)

### Set Dataset

In [None]:
# Load the original training TSV and the three `new_data_NaN_*` TSVs (en / ja / zh).
# All files are header-less and tab-separated.
origin_dataset, en_dataset, ja_dataset, zh_dataset = (
    pd.read_csv(os.path.join(ROOT_PATH, relative_path), delimiter='\t', header=None)
    for relative_path in (
        'drive/MyDrive/P2/train.tsv',
        'drive/MyDrive/P2/new_data_NaN_en.tsv',
        'drive/MyDrive/P2/new_data_NaN_ja.tsv',
        'drive/MyDrive/P2/new_data_NaN_zh.tsv',
    )
)

In [None]:
# Mapping from label name to integer class id.
# NOTE(review): pickle.load executes arbitrary code from the file; this assumes
# label_type.pkl comes from a trusted source.
with open(os.path.join(ROOT_PATH, 'drive/MyDrive/P2/label_type.pkl'), 'rb') as label_file:
    label_type = pickle.load(label_file)

# Sample ids whose label in the TSV is overridden below, grouped by the label
# name they are reassigned to.
error_label_0 = ['wikitree-12599-4-108-111-4-7',
                 'wikipedia-25967-115-24-26-35-37',
                 'wikipedia-16427-6-14-17-20-22',
                 'wikipedia-16427-8-0-3-26-28',
                 'wikitree-19765-5-30-33-6-8',
                 'wikitree-58702-0-18-20-22-24',
                 'wikitree-71638-8-21-23-15-17',
                 'wikipedia-257-0-0-1-53-57',
                 'wikipedia-13649-28-66-70-14-24',
                 'wikipedia-6017-8-20-26-4-7']
error_label_1 = ['wikitree-55837-4-0-2-10-11']
error_label_2 = ['wikitree-62775-3-3-7-0-2']
error_label_3 = ['wikipedia-23188-0-74-86-41-42']

# sample id -> overriding label name. The groups are inserted in reverse so
# that, should an id appear in several lists, the lowest-numbered list wins.
relabel_name_by_id = {}
for sample_ids, corrected_name in (
    (error_label_3, '단체:하위_단체'),
    (error_label_2, '단체:본사_도시'),
    (error_label_1, '단체:구성원'),
    (error_label_0, '관계_없음'),
):
    relabel_name_by_id.update(dict.fromkeys(sample_ids, corrected_name))

labels = []
for sample_id, raw_label in zip(origin_dataset[0], origin_dataset[8]):
    if raw_label == 'blind':
        # Rows labelled 'blind' get the placeholder id 100.
        labels.append(100)
        continue
    # The name -> id lookup happens only for the label actually used.
    labels.append(label_type[relabel_name_by_id.get(sample_id, raw_label)])

In [None]:
# One row per example: the original sentence, column 1 of each of the en/ja/zh
# files, the two entity strings, and the integer label.
dataset = pd.DataFrame(dict(
    origin_sentence=origin_dataset[1],
    en_sentence=en_dataset[1],
    ja_sentence=ja_dataset[1],
    zh_sentence=zh_dataset[1],
    entity_01=origin_dataset[2],
    entity_02=origin_dataset[5],
    labels=labels,
))

In [None]:
class NLPDataset(Dataset) :
    """Relation-extraction dataset with optional translation sampling and random masking.

    Each item is the tokenizer output for the pair
    ``("<e1> RELATION <e2>", sentence)`` plus two entity position masks and the
    label.

    :param dataset: mapping/DataFrame with the columns ``origin_sentence``,
        ``en_sentence``, ``ja_sentence``, ``zh_sentence``, ``entity_01``,
        ``entity_02`` and ``labels``.
    :param tokenizer: tokenizer used for every encoding done by this dataset.
    :param training: when True the sentence is drawn at random from the
        original and the available (non-missing) translations; otherwise the
        original sentence is always used.
    :param threshold: per-token probability of replacing a non-protected token
        with ``<mask>`` (0.0 disables masking).
    :param max_length: padded/truncated sequence length; defaults to
        ``CFG.tokenizer_max_length`` when omitted.
    """
    def __init__(self, dataset, tokenizer, training=False, threshold=0.1, max_length=None) :
        self.origin_sentence = dataset['origin_sentence']
        self.en_sentence = dataset['en_sentence']
        self.ja_sentence = dataset['ja_sentence']
        self.zh_sentence = dataset['zh_sentence']
        self.entity_01 = dataset['entity_01']
        self.entity_02 = dataset['entity_02']
        # Going through a NumPy array makes the conversion independent of the
        # container type and index of the labels column.
        self.labels = torch.tensor(np.asarray(dataset['labels']))
        self.tokenizer = tokenizer
        self.training = training
        self.threshold = threshold
        self.max_length = CFG.tokenizer_max_length if max_length is None else max_length

    def __getitem__(self, idx) :
        if self.training :
            # Augmentation: pick uniformly among the original sentence and
            # whichever translations exist for this row.
            candidates = [self.origin_sentence[idx]]
            for translated in (self.en_sentence, self.ja_sentence, self.zh_sentence) :
                if self._is_sentence(translated[idx]) :
                    candidates.append(translated[idx])
            sentence = candidates[np.random.randint(len(candidates))]
        else :
            sentence = self.origin_sentence[idx]

        e1 = self.entity_01[idx]
        e2 = self.entity_02[idx]
        e1_mask, e2_mask = self._get_ent_mask(e1, e2)

        # Use the tokenizer given to this instance, not a notebook-level global.
        item = self.tokenizer(e1+' RELATION '+e2, sentence, max_length=self.max_length,
                              padding='max_length', truncation=True, return_tensors='pt')
        # Drop the batch dimension added by return_tensors='pt'.
        item['input_ids'] = self._random_mask(item['input_ids'][0], e1, e2)
        item['attention_mask'] = item['attention_mask'].squeeze(0)
        item['e1_mask'] = torch.Tensor(e1_mask)
        item['e2_mask'] = torch.Tensor(e2_mask)
        item['labels'] = self.labels[idx]
        return item

    def __len__(self) :
        return len(self.labels)

    def _random_mask(self, sentence, e1, e2) :
        """Replace tokens of ``sentence`` (1-D id tensor, modified in place) with ``<mask>``.

        Special tokens, the ``RELATION`` marker pieces and every token id that
        occurs in either entity are never masked; each other token is masked
        with probability ``self.threshold``.
        """
        mask_id = self.tokenizer.encode('<mask>', add_special_tokens=False)
        # A set gives O(1) membership tests and never aliases the tokenizer's own list.
        protected = set(self.tokenizer.all_special_ids)
        protected.update(self.tokenizer.encode('RELATION', add_special_tokens=False))
        protected.update(self.tokenizer.encode(e1, e2, add_special_tokens=False))
        for i, token in enumerate(sentence) :
            # The random draw stays second so it is only consumed for maskable tokens.
            if int(token) not in protected and self.threshold > random.random() :
                sentence[i] = mask_id[0]
        return sentence

    def _is_sentence(self, sentence) :
        """Return False for a missing (NaN/None) cell, True otherwise."""
        # pd.isna is a value test; `is np.NaN` was an identity test and the
        # np.NaN alias no longer exists in NumPy 2.
        return not pd.isna(sentence)

    def _get_ent_mask(self, e1, e2) :
        """Build 0/1 position masks of length ``self.max_length`` for both entities.

        e1 is marked from position 1; e2 is marked from position ``3 + len(e1)``.
        NOTE(review): these offsets presumably assume one leading special token
        and a two-piece encoding of ' RELATION ' - confirm for the tokenizer in use.
        """
        e1_mask = np.zeros(self.max_length, dtype=int)
        e2_mask = np.zeros(self.max_length, dtype=int)
        e1_len = len(self.tokenizer.encode(e1, add_special_tokens=False))
        e2_len = len(self.tokenizer.encode(e2, add_special_tokens=False))
        e1_mask[1 : 1+e1_len] = 1
        e2_mask[3+e1_len : 3+e1_len+e2_len] = 1
        return e1_mask, e2_mask

In [None]:
# Tokenizer matching the backbone checkpoint named in CFG.MODEL_NAME.
tokenizer = transformers.AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
# Both datasets wrap the same DataFrame. The training view samples among the
# original/translated sentences and applies random masking at CFG.random_masking_rate.
train_dataset = NLPDataset(dataset, tokenizer, training=True, threshold=CFG.random_masking_rate)
# The validation view always uses the original sentence and never masks (threshold=0.0).
valid_dataset = NLPDataset(dataset, tokenizer, training=False, threshold=0.0)

### Set Validation

In [None]:
# Stratified K-fold split on the label ids; folds[k] holds the row indices of
# the k-th train/validation partition.
skf = StratifiedKFold(n_splits=CFG.NUM_FOLD, shuffle=True, random_state=CFG.seed)
folds = [
    {'train_idx': train_rows, 'valid_idx': valid_rows}
    for train_rows, valid_rows in skf.split(origin_dataset, labels)
]

### Define Model

In [None]:
### model architecture
class FCLayer(nn.Module):
    """Dropout -> optional tanh -> linear projection.

    :param input_dim: size of the incoming feature vector.
    :param output_dim: size of the projected feature vector.
    :param dropout_rate: dropout probability applied to the input.
    :param use_activation: apply tanh between dropout and the linear layer.
    """

    def __init__(self, input_dim, output_dim, dropout_rate=0.0, use_activation=True):
        super().__init__()
        self.use_activation = use_activation
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        dropped = self.dropout(x)
        activated = self.tanh(dropped) if self.use_activation else dropped
        return self.linear(activated)

class RBERT_RobertaForSequenceClassification(nn.Module):
    """Relation classifier on top of a pretrained transformer backbone.

    The backbone's pooled output and the averaged hidden states of the two
    entity spans are each passed through an FCLayer, concatenated, and fed to a
    final linear classifier.

    :param model_name: checkpoint name/path for ``transformers.AutoModel``.
    :param num_classes: number of relation classes (size of the logits).
    :param dr_rate: dropout rate used by all three FCLayers.
    """
    def __init__(self, model_name, num_classes, dr_rate=0.1):
        super(RBERT_RobertaForSequenceClassification, self).__init__()

        config = transformers.AutoConfig.from_pretrained(model_name)
        config.num_labels = config.hidden_size
        self.backbone = transformers.AutoModel.from_pretrained(model_name, config=config)
        self.num_classes = num_classes
        self.dropout_rate = dr_rate

        self.cls_fc_layer = FCLayer(config.hidden_size, config.hidden_size, self.dropout_rate)
        # A single layer is shared by both entity representations.
        self.entity_fc_layer = FCLayer(config.hidden_size, config.hidden_size, self.dropout_rate)
        self.label_classifier = FCLayer(config.hidden_size*3, self.num_classes, self.dropout_rate, use_activation=False)

    def forward(self, input_ids, attention_mask, e1_mask, e2_mask, labels=None):
        """Return unnormalised class logits of shape [batch_size, num_classes].

        :param input_ids: token ids, [batch_size, seq_len].
        :param attention_mask: attention mask, [batch_size, seq_len].
        :param e1_mask: 0/1 mask of the first entity's positions, [batch_size, seq_len].
        :param e2_mask: 0/1 mask of the second entity's positions, [batch_size, seq_len].
        :param labels: accepted for interface compatibility; not used.
        """
        outputs = self.backbone(input_ids=input_ids,
                                attention_mask=attention_mask)

        sequence_output = outputs['last_hidden_state']
        pooled_output = outputs['pooler_output']  # [CLS]

        e1_h = self.entity_average(sequence_output, e1_mask)
        e2_h = self.entity_average(sequence_output, e2_mask)

        pooled_output = self.cls_fc_layer(pooled_output)
        e1_h = self.entity_fc_layer(e1_h)
        e2_h = self.entity_fc_layer(e2_h)

        concat_h = torch.cat([pooled_output, e1_h, e2_h], dim=-1)
        logits = self.label_classifier(concat_h)
        return logits

    def entity_average(self, hidden_output, e_mask):
        """
        Average the hidden state vectors at the positions selected by the mask.
        :param hidden_output: [batch_size, seq_len, dim]
        :param e_mask: [batch_size, seq_len]
                e.g. e_mask[0] == [0, 0, 0, 1, 1, 1, 0, 0, ... 0]
        :return: [batch_size, dim]; a row whose mask is all zeros yields a zero
                 vector instead of NaN.
        """
        e_mask_unsqueeze = e_mask.unsqueeze(1)  # [b, 1, seq_len]
        # Clamp to 1 so an empty entity span does not divide by zero; the NaN
        # would otherwise propagate into the loss.
        length_tensor = (e_mask != 0).sum(dim=1).unsqueeze(1).clamp(min=1)  # [batch_size, 1]

        # [b, 1, seq_len] * [b, seq_len, dim] = [b, 1, dim] -> [b, dim]
        sum_vector = torch.bmm(e_mask_unsqueeze.float(), hidden_output).squeeze(1)
        avg_vector = sum_vector.float() / length_tensor.float()  # broadcasting
        return avg_vector

In [None]:
class LSTMModel(nn.Module):
    """Transformer encoder followed by a 3-layer bidirectional LSTM and a linear classifier.

    :param model_name: checkpoint name/path for ``transformers.AutoModel``.
    :param num_classes: number of output classes.
    :param lstm_hidden_size: hidden size of each LSTM direction.
    """
    def __init__(self, model_name, num_classes=42, lstm_hidden_size=1024):
        super().__init__()
        # Only `transformers` is imported in this notebook, so the class must be
        # reached through the package namespace.
        self.transformer_model = transformers.AutoModel.from_pretrained(model_name, hidden_dropout_prob=0.2)
        self.lstm = nn.LSTM(
            # Follow the backbone's width instead of assuming 1024.
            input_size=self.transformer_model.config.hidden_size,
            hidden_size=lstm_hidden_size,
            num_layers=3,
            dropout=0.5,
            bidirectional=True,
            batch_first=True,
        )
        # Forward and backward final states are concatenated -> 2 * hidden size.
        self.dense_layer = nn.Linear(2 * lstm_hidden_size, num_classes, bias=True)

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape [batch_size, num_classes]."""
        encode_layers = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)[0]
        enc_hiddens, (last_hidden, last_cell) = self.lstm(encode_layers)
        # last_hidden is [num_layers * 2, batch, hidden], ordered layer by layer;
        # the last two entries are the top layer's forward/backward states.
        # Indices 0 and 1 would read the *first* layer and ignore the rest of the stack.
        output_hidden = torch.cat((last_hidden[-2], last_hidden[-1]), dim=1)

        output = self.dense_layer(output_hidden)

        return output

### Define utils

In [None]:
class FocalLoss(nn.Module):
    """Focal loss: cross-entropy scaled per sample by ``(1 - p_t) ** gamma``.

    :param gamma: focusing exponent; 0 reduces to (optionally weighted) cross-entropy.
    :param alpha: optional class weights. A float/int ``a`` becomes ``[a, 1 - a]``
        (two-class case), a list becomes a tensor of per-class weights; anything
        else is used as given.
    :param size_average: return the mean over samples if True, otherwise the sum.
    """
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha, (float, int)):
            self.alpha = torch.Tensor([alpha, 1 - alpha])
        if isinstance(alpha, list):
            self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        """Compute the loss.

        :param input: logits, [N, C] or [N, C, ...] (extra dims are flattened).
        :param target: integer class indices with one entry per flattened sample.
        :return: scalar tensor.
        """
        if input.dim() > 2:
            input = input.view(input.size(0), input.size(1), -1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1, 2)    # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1, input.size(2))   # N,H*W,C => N*H*W,C
        target = target.view(-1, 1)

        # The class axis is always 1 here; naming it avoids the deprecated
        # implicit-dim behaviour of log_softmax.
        logpt = F.log_softmax(input, dim=1)
        logpt = logpt.gather(1, target).view(-1)
        # The modulating factor is treated as a constant: no gradient flows through pt.
        pt = logpt.detach().exp()

        if self.alpha is not None:
            # Keep the class weights on the same device/dtype as the logits.
            if self.alpha.device != input.device or self.alpha.dtype != input.dtype:
                self.alpha = self.alpha.to(device=input.device, dtype=input.dtype)
            at = self.alpha.gather(0, target.view(-1))
            logpt = logpt * at

        loss = -1 * (1 - pt) ** self.gamma * logpt
        if self.size_average:
            return loss.mean()
        return loss.sum()

In [None]:
class AverageMeter(object):
    """Tracks the latest value and the weighted running average of a metric.

    Attributes: ``val`` (last value), ``sum`` (weighted total), ``count``
    (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        # Start over: all statistics back to zero.
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        # `n` is the weight of `val`, e.g. the batch size it was averaged over.
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

### Training

In [None]:
# Train one model per selected fold. Every 100 optimisation steps the model is
# evaluated on the fold's validation split, metrics are logged to wandb, and the
# checkpoint is saved whenever the validation accuracy improves.
for fold in CFG.FOLD :
    train_idx, valid_idx = folds[fold]['train_idx'], folds[fold]['valid_idx']
    train_subset = Subset(train_dataset, train_idx)
    valid_subset = Subset(valid_dataset, valid_idx)
    train_loader = DataLoader(train_subset, batch_size=CFG.batch_size, shuffle=True, num_workers=5)
    # Averaged validation metrics do not depend on sample order, so no shuffling.
    valid_loader = DataLoader(valid_subset, batch_size=CFG.batch_size, shuffle=False, num_workers=5)

    model = RBERT_RobertaForSequenceClassification(CFG.MODEL_NAME, num_classes=42, dr_rate=CFG.classifier_dropout)
    model.to(device)
    optimizer = AdamP(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    # The scheduler is stepped once per batch, so T_max is counted in optimisation steps.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=2250, eta_min=1e-6)
    loss_fn = FocalLoss(gamma=0.5)

    train_loss, train_acc = AverageMeter(), AverageMeter()
    valid_loss, valid_acc = AverageMeter(), AverageMeter()
    best_val_acc = 0
    steps = 0

    run = wandb.init(project='P2', group=CFG.MODEL_NAME, name=CFG.name, tags=CFG.tag, config=CFG)
    t = time.time()
    for epoch in range(CFG.epochs) :
        for item in train_loader :
            input_ids = item['input_ids'].to(device)
            attention_mask = item['attention_mask'].to(device)
            e1_mask = item['e1_mask'].to(device)
            e2_mask = item['e2_mask'].to(device)
            label = item['labels'].to(device)

            # Re-enter train mode every step: the periodic validation below
            # switches the model to eval mode.
            model.train()
            logit = model(input_ids, attention_mask, e1_mask, e2_mask)
            loss = loss_fn(logit, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            pred = logit.argmax(-1)
            # .item() stores plain floats in the meters instead of device tensors.
            acc = ((pred == label).sum().float() / input_ids.size(0)).item()
            train_loss.update(loss.item(), input_ids.size(0))
            train_acc.update(acc, input_ids.size(0))
            wandb.log({'Step' : steps})

            steps += 1
            if steps % 100 == 0 :
                model.eval()
                # Evaluation needs no autograd graph; no_grad saves memory and time.
                with torch.no_grad() :
                    for valid_item in valid_loader :
                        val_input_ids = valid_item['input_ids'].to(device)
                        val_attention_mask = valid_item['attention_mask'].to(device)
                        val_e1_mask = valid_item['e1_mask'].to(device)
                        val_e2_mask = valid_item['e2_mask'].to(device)
                        val_label = valid_item['labels'].to(device)

                        val_logit = model(val_input_ids, val_attention_mask, val_e1_mask, val_e2_mask)
                        val_loss = loss_fn(val_logit, val_label)

                        val_pred = val_logit.argmax(-1)
                        val_acc = ((val_pred == val_label).sum().float() / val_input_ids.size(0)).item()
                        valid_loss.update(val_loss.item(), val_input_ids.size(0))
                        valid_acc.update(val_acc, val_input_ids.size(0))

                print(f'steps:{steps}\t| valid_acc:{valid_acc.avg:.4}\t| valid_loss:{valid_loss.avg:.4}\t| train_acc:{train_acc.avg:.4}\t| train_loss:{train_loss.avg:.4}\t| time:{time.time()-t}')
                wandb.log({
                    "eval/accuracy": valid_acc.avg,
                    "eval/loss": valid_loss.avg,
                    "train/loss": train_loss.avg,
                    "train/learning_rate": scheduler.get_last_lr()[0]
                })

                if valid_acc.avg > best_val_acc :
                    best_val_acc = valid_acc.avg
                    # Keep only the best checkpoint of this fold: drop older ones first.
                    for stale_ckpt in glob.glob(f'custom_data/{CFG.name}/{fold}_*{CFG.name}.pth') :
                        # The file is truncated before removal.
                        # NOTE(review): presumably so a trash/sync-backed filesystem
                        # does not retain the full-size file - confirm.
                        open(stale_ckpt, 'w').close()
                        os.remove(stale_ckpt)
                    torch.save(model.state_dict(), os.path.join(ROOT_PATH, 'custom_data', CFG.name, f'{fold}_{steps}_{best_val_acc:.4}_{CFG.name}.pth'))

                # All four meters cover only the window since the last evaluation.
                valid_acc.reset()
                valid_loss.reset()
                train_acc.reset()
                train_loss.reset()
                t = time.time()

    # Close this fold's wandb run so the next fold starts a fresh one.
    run.finish()

steps:100	| valid_acc:0.5694	| valid_loss:1.631	| train_acc:0.51	| train_loss:2.043	| time:104.67896699905396
steps:200	| valid_acc:0.5989	| valid_loss:1.366	| train_acc:0.526	| train_loss:1.76	| time:104.23421621322632
steps:300	| valid_acc:0.6156	| valid_loss:1.251	| train_acc:0.55	| train_loss:1.537	| time:104.51554346084595
steps:400	| valid_acc:0.6161	| valid_loss:1.139	| train_acc:0.578	| train_loss:1.355	| time:104.36885523796082
steps:500	| valid_acc:0.6206	| valid_loss:1.132	| train_acc:0.612	| train_loss:1.242	| time:104.8938398361206
steps:600	| valid_acc:0.6467	| valid_loss:1.019	| train_acc:0.637	| train_loss:1.064	| time:104.8907961845398
steps:700	| valid_acc:0.6472	| valid_loss:0.9621	| train_acc:0.608	| train_loss:1.222	| time:104.79438209533691
steps:800	| valid_acc:0.6328	| valid_loss:1.005	| train_acc:0.628	| train_loss:1.051	| time:105.23859357833862
steps:900	| valid_acc:0.6711	| valid_loss:0.8921	| train_acc:0.626	| train_loss:1.027	| time:104.58516383171082
step

KeyboardInterrupt: ignored

### Inference and Ensemble

In [None]:
class TestDataset(Dataset) :
    """Unlabelled inference dataset read from a header-less, tab-separated file.

    Column 1 is used as the sentence, columns 2 and 5 as the two entities.
    Items have the same keys as the training dataset except ``labels``, and no
    random masking is applied.

    :param file_dir: path of the TSV file.
    :param tokenizer: tokenizer used for every encoding done by this dataset.
    :param max_length: padded/truncated sequence length; defaults to
        ``CFG.tokenizer_max_length`` when omitted.
    """
    def __init__(self, file_dir, tokenizer, max_length=None) :
        self.dataset = pd.read_csv(file_dir, delimiter='\t', header=None)
        self.sentence = self.dataset[1]
        self.entity_01 = self.dataset[2]
        self.entity_02 = self.dataset[5]
        self.tokenizer = tokenizer
        self.max_length = CFG.tokenizer_max_length if max_length is None else max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx) :
        sentence = self.sentence[idx]
        e1 = self.entity_01[idx]
        e2 = self.entity_02[idx]
        e1_mask, e2_mask = self._get_ent_mask(e1, e2)
        # Use the tokenizer given to this instance, not a notebook-level global.
        item = self.tokenizer(e1+' RELATION '+e2, sentence, max_length=self.max_length,
                              padding='max_length', truncation=True, return_tensors='pt')
        # Drop the batch dimension added by return_tensors='pt'.
        item['input_ids'] = item['input_ids'].squeeze(0)
        item['attention_mask'] = item['attention_mask'].squeeze(0)
        item['e1_mask'] = torch.Tensor(e1_mask)
        item['e2_mask'] = torch.Tensor(e2_mask)
        return item

    def _get_ent_mask(self, e1, e2) :
        """Build 0/1 position masks of length ``self.max_length`` for both entities.

        e1 is marked from position 1; e2 is marked from position ``3 + len(e1)``.
        NOTE(review): these offsets presumably assume one leading special token
        and a two-piece encoding of ' RELATION ' - confirm for the tokenizer in use.
        """
        e1_mask = np.zeros(self.max_length, dtype=int)
        e2_mask = np.zeros(self.max_length, dtype=int)
        e1_len = len(self.tokenizer.encode(e1, add_special_tokens=False))
        e2_len = len(self.tokenizer.encode(e2, add_special_tokens=False))
        e1_mask[1 : 1+e1_len] = 1
        e2_mask[3+e1_len : 3+e1_len+e2_len] = 1
        return e1_mask, e2_mask

In [None]:
# Ensemble inference: average the softmax probabilities of every saved fold
# checkpoint, then write the plain argmax predictions and a variant in which
# class 0 receives an additive bonus.
test_dataset = TestDataset("/opt/ml/input/data/test/test.tsv", tokenizer)
test_loader = DataLoader(test_dataset, 10, shuffle=False)

# Sorted for a reproducible ensemble order.
checkpoint_paths = sorted(glob.glob(f'custom_data/{CFG.name}/*{CFG.name}.pth'))
if not checkpoint_paths :
    # Fail early with a clear message rather than inside the array stacking below.
    raise FileNotFoundError(f'No checkpoints found for experiment {CFG.name!r} under custom_data/{CFG.name}/')

probs_lst = []
for best_model in checkpoint_paths :
    model = RBERT_RobertaForSequenceClassification(CFG.MODEL_NAME, 42)
    # map_location lets checkpoints saved on GPU load on whatever device is active.
    model.load_state_dict(torch.load(best_model, map_location=device))
    model.to(device)
    model.eval()

    fold_logits = []
    # Inference needs no autograd graph.
    with torch.no_grad() :
        for item in test_loader :
            input_ids = item['input_ids'].to(device)
            attention_mask = item['attention_mask'].to(device)
            e1_mask = item['e1_mask'].to(device)
            e2_mask = item['e2_mask'].to(device)

            logit = model(input_ids, attention_mask, e1_mask, e2_mask)
            fold_logits.append(logit.cpu())

    output_probs = F.softmax(torch.cat(fold_logits, dim=0), dim=1).numpy()
    probs_lst.append(output_probs)

# [n_samples, n_classes, n_models] -> mean over the model axis.
models_prob = np.mean(np.stack(probs_lst, axis=2), axis=2)
np.save(os.path.join(ROOT_PATH, 'custom_data', CFG.name, f'Probs_{CFG.name}.npy'), models_prob)

models_pred = np.argmax(models_prob, axis=1)
output = pd.DataFrame(models_pred, columns=['pred'])
output.to_csv(os.path.join(ROOT_PATH, 'custom_data', CFG.name, f'Basic_{CFG.name}.csv'), index=False)

# Add 0.1 to class 0 before the argmax. A separate array keeps `models_prob`
# equal to what was saved above, so the statements below can be re-run.
class_bias = np.zeros(models_prob.shape[1], dtype=models_prob.dtype)
class_bias[0] = 0.1
weighted_prob = models_prob + class_bias
models_pred = np.argmax(weighted_prob, axis=1)
output = pd.DataFrame(models_pred, columns=['pred'])
output.to_csv(os.path.join(ROOT_PATH, 'custom_data', CFG.name, f'AddWeight_{CFG.name}.csv'), index=False)