## storagy check

In [1]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay          99G   59G   35G  63% /
tmpfs            64M     0   64M   0% /dev
tmpfs            89G     0   89G   0% /sys/fs/cgroup
shm             1.0G     0  1.0G   0% /dev/shm
/dev/xvdb1       99G   59G   35G  63% /etc/hosts
tmpfs            89G   12K   89G   1% /proc/driver/nvidia
/dev/xvda1       48G  8.7G   37G  20% /usr/bin/nvidia-smi
udev             89G     0   89G   0% /dev/nvidia0
tmpfs            89G     0   89G   0% /proc/acpi
tmpfs            89G     0   89G   0% /proc/scsi
tmpfs            89G     0   89G   0% /sys/firmware


In [2]:
!du -h --max-depth=1 | sort -hr

13G	.
9.3G	./.cache
2.1G	./model
679M	./kobert
132M	./input
109M	./.vscode-server
5.1M	./.ipynb_checkpoints
3.5M	./.ipython
444K	./.local
84K	./code
32K	./.jupyter
16K	./.config
8.0K	./prediction
8.0K	./.nv
8.0K	./.keras


## Import library

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gluonnlp as nlp
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, tqdm_notebook
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import time
import random
import pickle
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# plt.style.use('seaborn')

import missingno as msno

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## Fix seed

In [5]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    
seed_everything(0)

## Use cuda

In [6]:
device = torch.device('cuda:0')
torch.cuda.is_available()

True

## Hyper-parameter

In [7]:
max_len = 128
batch_size = 32
warmup_ratio = 0.01
num_epochs = 4
max_grad_norm = 1
learning_rate =  5e-4
num_folds = 10
PATH = './model/model_state_dict_init'

## Dataset

In [8]:
class RE_Dataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
def preprocessing_dataset(dataset, label_type):
    label = []
    for i in dataset[8]:
        if i == 'blind':
            label.append(100)
        else:
            label.append(label_type[i])
    out_dataset = pd.DataFrame({'0': dataset[0], 'sentence':dataset[1],'entity_01':dataset[2],'entity_02':dataset[5],'label':label,})
    return out_dataset


In [10]:
def load_data(dataset_dir):
  # load label_type, classes
  with open('/opt/ml/input/data/label_type.pkl', 'rb') as f:
    label_type = pickle.load(f)
  # load dataset
  dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)
  # preprecessing dataset
  dataset = preprocessing_dataset(dataset, label_type)
  
  return dataset


In [11]:
def tokenized_dataset(dataset, tokenizer):
    concat_entity = []
    for e01, e02 in zip(dataset['entity_01'], dataset['entity_02']):
        temp = ''
        temp = e01 + ' </s> ' + e02
        concat_entity.append(temp)
    tokenized_sentences = tokenizer(
        concat_entity,
        list(dataset['sentence']),
        return_tensors="pt",
        padding=True,
        truncation='only_second',
        max_length=128,
        add_special_tokens=True,
    )
    return tokenized_sentences

In [12]:
df_train = load_data('/opt/ml/input/data/train/train.tsv')

In [13]:
df_train = df_train[~df_train['label'].isin((41, 37, 40, 29))]
len(df_train)

8985

## Train

In [14]:
# accuracy 계산
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [15]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, XLMRobertaConfig

MODEL_NAME = "xlm-roberta-large"
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# train, val = train_test_split(df_train, test_size=0.15, random_state=0)

# tokenized_train = tokenized_dataset(train, tokenizer)
# tokenized_val = tokenized_dataset(val, tokenizer)

In [None]:
# train_label = train['label'].values
# val_label = val['label'].values

# train_dataset = RE_Dataset(tokenized_train, train_label)
# val_dataset = RE_Dataset(tokenized_val, val_label)

In [16]:
model_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
model_config.num_labels = 42
model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config).to(device)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

In [17]:
# 모델 저장 (fold마다 모델을 불러오기 위해)
torch.save(model.state_dict(), PATH)

In [18]:
t_dataset = tokenized_dataset(df_train, tokenizer)
t_label = df_train['label'].values
dataset = RE_Dataset(t_dataset, t_label)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

kfold = KFold(n_splits=num_folds, random_state=0, shuffle=True)
criterion = nn.CrossEntropyLoss()
results = {}

for fold, (train_ids, val_ids) in enumerate(kfold.split(dataset)):
    print(f'FOLD {fold}')
    print('='*10)
    
    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_ids)
    
    # Define data loaders for training and testing data in this fold
    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=train_subsampler
    )
    
    val_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=val_subsampler
    )
    
    # 모델을 불러온다. (huggingface로 계속 불러오면 메모리 초과 발생..)
    model.load_state_dict(torch.load(PATH))
    
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    
    t_total = len(train_loader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)

    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
    
    best_acc = 0.0
    
    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs-1}')
        print('-' * 10)

        train_acc = 0.0
        val_acc = 0.0

        since = time.time()

        #################### Train ####################
        train_loss = 0.0

        model.train()
        for batch_id, batch in enumerate(train_loader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            train_acc += calc_accuracy(outputs[1], labels)
            train_loss = loss.data.cpu().numpy()

        print(f"train Loss: {train_loss:.4f} Acc: {train_acc/(batch_id+1):.4f}")

        #################### Validation ####################
        val_loss =0.0

        model.eval()
        for batch_id, batch in enumerate(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_acc += calc_accuracy(outputs[1], labels)
            val_loss = outputs[0].data.cpu().numpy()

        print(f"val Loss: {val_loss:.4f} Acc: {val_acc/(batch_id+1):.4f}")

        #################### model save ####################
        if (val_acc/(batch_id+1)) >= best_acc:
            print(f"epochs_val acc: {val_acc/(batch_id+1):.4f}")
            print(f"epochs_before_best acc: {best_acc:.4f}")
            best_acc = (val_acc/(batch_id+1))
            print(f"epochs_after_best acc: {best_acc:.4f}")
            torch.save(model.state_dict(), f"/opt/ml/model/model_state_dict{fold}.pt")
            
        
        #################### running time check ####################
        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60}m {time_elapsed % 60}s')
        print()
    
    results[fold] = best_acc
    print(f'Best val Acc: {best_acc}')


FOLD 0
Epoch 0/3
----------
train Loss: 1.9572 Acc: 0.4843
val Loss: 1.2585 Acc: 0.4831
epochs_val acc: 0.4831
epochs_before_best acc: 0.0000
epochs_after_best acc: 0.4831
Training complete in 3.0m 41.2884886264801s

Epoch 1/3
----------


In [None]:
# Print fold results
print(f'K-FOLD CROSS VALIDATION RESULTS FOR {num_folds} FOLDS')
print('--------------------------------')
sum = 0.0
for key, value in results.items():
    print(f'Fold {key}: {value} %')
    sum += value

print(f'Average: {sum/len(results.items())} %')

## Inference

In [None]:
df_test = load_data(r"/opt/ml/input/data/test/test.tsv")

In [None]:
test_token_dataset = tokenized_dataset(df_test, tokenizer)
test_label = df_test['label'].values
test_dataset = RE_Dataset(test_token_dataset, test_label)

In [None]:
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    num_workers=5,
    shuffle=False
)

In [None]:
oof_pred = None

for i in range(num_folds):
    model.load_state_dict(torch.load(f"/opt/ml/model/model_state_dict{i}.pt"))
    model.eval()

    all_predictions = []
    for batch_id, (token_ids, valid_length, segment_ids, labels) in enumerate(test_dataloader):
        with torch.no_grad():
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length = valid_length
            labels = labels.long().to(device)
            pred = model(token_ids, valid_length, segment_ids)
#             pred = torch.argmax(outputs, dim=-1)
            all_predictions.extend(pred.cpu().numpy())

        fold_pred = np.array(all_predictions)
        
    if oof_pred is None:
        oof_pred = fold_pred / num_folds
    else:
        oof_pred = oof_pred + (fold_pred / num_folds)

In [None]:
submission = np.argmax(oof_pred, axis=1)
submission = pd.DataFrame(submission, columns=['pred'])
submission.to_csv('/opt/ml/prediction/submission.csv', index=False)

In [None]:
from IPython.display import Audio
Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg", autoplay=True)