# Define paths

In [3]:
""" define train data and test data path """
import os
from glob import glob

# root path
ROOT_PATH = os.path.abspath(".")

# directory paths
ROOT_DATA_DIR = os.path.join(os.path.dirname(ROOT_PATH), 'dataset')
TRAIN_DATA_DIR = os.path.join(ROOT_DATA_DIR, 'train')
TEST_DATA_DIR = os.path.join(ROOT_DATA_DIR, 'test')
PRIDICTION_DIR = os.path.join(os.path.dirname(ROOT_PATH), 'prediction')
BASELINE_DIR = os.path.join(os.path.dirname(ROOT_PATH), 'code')

# files paths
TRAIN_FILE_PATH = glob(os.path.join(TRAIN_DATA_DIR, '*'))[0]
TEST_FILE_PATH = glob(os.path.join(TEST_DATA_DIR, '*'))[0]
SAMPLE_SUBMISSION_PATH = os.path.join(PRIDICTION_DIR, 'sample_submission.csv')
PORORO_TRAIN_PATH = os.path.join(TRAIN_DATA_DIR, 'pororo_train_typed_entity_marker_punct.csv')
PORORO_TEST_PATH = os.path.join(TEST_DATA_DIR, 'pororo_test_typed_entity_marker_punct.csv')

print(TRAIN_FILE_PATH, TEST_FILE_PATH,SAMPLE_SUBMISSION_PATH, PORORO_TRAIN_PATH, PORORO_TEST_PATH)

IndexError: list index out of range

# set config

In [2]:
""" Set configuration as dictionary format """

import wandb
from datetime import datetime
from easydict import EasyDict

# login wandb and get today's date until hour and minute
wandb.login()
today = datetime.now().strftime("%m%d_%H:%M")

# Debug set to true in order to debug high-layer code.
# CFG Configuration
CFG = wandb.config # wandb.config provides functionality of easydict.EasyDict
CFG.DEBUG = False

# Dataset Config as constants
CFG.num_labels = 30
CFG.num_workers = 4
CFG.split_ratio = 0 # not going to use validation/test set
CFG.batch_size = 32

# Train configuration
CFG.user_name = "kyeonj"
CFG.file_base_name = f"{CFG.user_name}_{today}"
# CFG.model_name = "klue/roberta-base" # https://huggingface.co/klue/roberta-base
# CFG.model_name = "monologg/koelectra-base-v3-discriminator" # https://huggingface.co/monologg/koelectra-base-v3-discriminator
# CFG.model_name = "klue/roberta-large"
CFG.model_name = "klue/roberta-large"
CFG.num_folds = 5 # 5 Fold as default
CFG.num_epochs = 3 # 
CFG.max_token_length = 128+10 # refer to EDA where Q3 is 119, there are only 460 sentences out of 32k train set
CFG.stopwords = []
CFG.learning_rate = 5e-5
CFG.weight_decay = 1e-2 # https://paperswithcode.com/method/weight-decay
CFG.input_size = 1024
CFG.output_size = 1024
CFG.num_rnn_layers = 3
CFG.dropout_rate = 0.0

# training steps configurations
CFG.save_steps = 500
CFG.early_stopping_patience = 5
CFG.warmup_steps = 500
CFG.logging_steps = 100
CFG.evaluation_strategy = 'epoch'
CFG.evaluation_steps = 500

# Directory configuration
CFG.result_dir = os.path.join(os.path.dirname(ROOT_PATH), "results")
CFG.saved_model_dir = os.path.join(os.path.dirname(ROOT_PATH), "best_models")
CFG.logging_dir = os.path.join(os.path.dirname(ROOT_PATH), "logs")
CFG.baseline_dir = os.path.join(os.path.dirname(ROOT_PATH), 'code')

# file configuration
CFG.result_file = os.path.join(CFG.result_dir, f"{CFG.file_base_name}.csv")
CFG.saved_model_file = os.path.join(CFG.saved_model_dir, f"{CFG.file_base_name}.pth")
CFG.logging_file = os.path.join(CFG.logging_dir, f"{CFG.file_base_name}.log")
CFG.label_to_num_file = os.path.join(CFG.baseline_dir, 'dict_label_to_num.pkl')
CFG.num_to_label_file = os.path.join(CFG.baseline_dir, "dict_num_to_label.pkl")
CFG.train_file_path = TRAIN_FILE_PATH
CFG.test_file_path = TEST_FILE_PATH
CFG.sample_submission_file_path = SAMPLE_SUBMISSION_PATH

# Other configurations
CFG.load_best_model_at_end = True

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkyeonj[0m (use `wandb login --relogin` to force relogin)


In [4]:
eng_to_kor = {
    'quantity' : 'Î¨ºÎüâ'
    ,'person' : 'ÏÇ¨Îûå'
    ,'term' : 'ÏûÑÍ∏∞'
    ,'o' : 'ÏóÜÏùå'
    ,'event' : 'Ïù¥Î≤§Ìä∏'
    ,'study_field':'Ïä§ÌÑ∞ÎîîÌïÑÎìú'
    ,'material':'Ïû¨Î£å'
    ,'city':'ÎèÑÏãú'
    ,'time':'ÏãúÍ∞Ñ'
    ,'animal':'ÎèôÎ¨º'
    ,'location':'ÏúÑÏπò'
    ,'disease': 'ÏßàÎ≥ë'
    ,'civilization':'Î¨∏Î™Ö'
    ,'occupation':'ÏßÅÏóÖ'
    ,'organization':'Ï°∞ÏßÅ'
    ,'country':'ÎÇòÎùº'
    ,'artifact':'Ïú†Î¨º'
    ,'date':'ÎÇ†Ïßú'
    ,'plant':'ÏãùÎ¨º'
    ,'theory':'Ïù¥Î°†'
}

CFG.special_token_list = []
for v in eng_to_kor.values():
    CFG.special_token_list.append("*"+v+"*")
    CFG.special_token_list.append("^"+v+"^")   

# import

In [3]:
from torch.utils.data import Dataset
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import RobertaConfig, RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer
from sklearn.model_selection import StratifiedKFold

In [4]:
import random

def seed_everything(seed) :
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
seed_everything(42)

# data

In [26]:
import pickle as pickle
import pandas as pd
import torch

def pull_out_dictionary(df_input: pd.DataFrame):
  """ pull out str `{}` values from the pandas dataframe and shape it as a new column"""

  df = df_input.copy()

  # assign subject_entity and object_entity column values type as dictionary
  df['subject_entity'] = df['subject_entity'].apply(lambda x: eval(x))
  df['object_entity'] = df['object_entity'].apply(lambda x: eval(x))

  # parse item inside of subject_entity and object_entity's dictionary values as columns of dataframe
  # word, start_idx, end_idx, type as new columns 
  df = df.assign(
      # subject_entity
      subject_word=lambda x: x['subject_entity'].apply(lambda x: x['word']),
      subject_start_idx=lambda x: x['subject_entity'].apply(lambda x: x['start_idx']),
      subject_end_idx=lambda x: x['subject_entity'].apply(lambda x: x['end_idx']),
      subject_type=lambda x: x['subject_entity'].apply(lambda x: x['type']),
      
      # object_entity
      object_word=lambda x: x['object_entity'].apply(lambda x: x['word']),
      object_start_idx=lambda x: x['object_entity'].apply(lambda x: x['start_idx']),
      object_end_idx=lambda x: x['object_entity'].apply(lambda x: x['end_idx']),
      object_type=lambda x: x['object_entity'].apply(lambda x: x['type']),
  )

  # drop subject_entity and object_entity column
  df = df.drop(['subject_entity', 'object_entity'], axis=1)

  return df

class RE_Dataset(torch.utils.data.Dataset):
  """ Dataset Íµ¨ÏÑ±ÏùÑ ÏúÑÌïú class."""
  def __init__(self, pair_dataset, labels):
    self.pair_dataset = pair_dataset
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: val[idx].clone().detach() for key, val in self.pair_dataset.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

def preprocessing_dataset(dataset):
  """ Ï≤òÏùå Î∂àÎü¨Ïò® csv ÌååÏùºÏùÑ ÏõêÌïòÎäî ÌòïÌÉúÏùò DataFrameÏúºÎ°ú Î≥ÄÍ≤Ω ÏãúÏºúÏ§çÎãàÎã§."""
  
  dataset = pull_out_dictionary(dataset)

  # rename columns subject_word as subject_entity, object_word as object_entity
  dataset = dataset.rename(columns={'subject_word': 'subject_entity', 'object_word': 'object_entity'})

  out_dataset = pd.DataFrame({'id':dataset['id'], 'sentence':dataset['sentence'],'subject_entity':dataset['subject_entity'],'object_entity':dataset['object_entity'],'label':dataset['label'],})
  display(out_dataset.head(2))
  return out_dataset

def load_data(dataset_dir):
  """ csv ÌååÏùºÏùÑ Í≤ΩÎ°úÏóê Îß°Í≤å Î∂àÎü¨ ÏòµÎãàÎã§. """
  pd_dataset = pd.read_csv(dataset_dir)
  dataset = preprocessing_dataset(pd_dataset)
  
  return dataset

def tokenized_dataset(dataset, tokenizer):
  """ tokenizerÏóê Îî∞Îùº sentenceÎ•º tokenizing Ìï©ÎãàÎã§."""
  concat_entity = []
  for e01, e02 in zip(dataset['subject_entity'], dataset['object_entity']):
    temp = ''
    temp = e01 + tokenizer.sep_token + e02
    concat_entity.append(temp)
  tokenized_sentences = tokenizer(
      concat_entity,
      list(dataset['sentence']),
      return_tensors="pt",
      # padding= 'max_length',
      padding= True,
      truncation=True,
      max_length=CFG.max_token_length,
      add_special_tokens=True,
      return_token_type_ids=False,
      )

  # tokenized_sentences = tokenizer(
  #         concat_entity,
  #         list(dataset["sentence"]),
  #         return_tensors="pt",
  #         padding=True,
  #         truncation=True,
  #         max_length=256,
  #         add_special_tokens=True,
  #         # return_token_type_ids=False,
  #     )
  return tokenized_sentences


# score

In [6]:
import sklearn
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def klue_re_micro_f1(preds, labels):
    """KLUE-RE micro f1 (except no_relation)"""
    label_list = [
        "no_relation",
        "org:top_members/employees",
        "org:members",
        "org:product",
        "per:title",
        "org:alternate_names",
        "per:employee_of",
        "org:place_of_headquarters",
        "per:product",
        "org:number_of_employees/members",
        "per:children",
        "per:place_of_residence",
        "per:alternate_names",
        "per:other_family",
        "per:colleagues",
        "per:origin",
        "per:siblings",
        "per:spouse",
        "org:founded",
        "org:political/religious_affiliation",
        "org:member_of",
        "per:parents",
        "org:dissolved",
        "per:schools_attended",
        "per:date_of_death",
        "per:date_of_birth",
        "per:place_of_birth",
        "per:place_of_death",
        "org:founded_by",
        "per:religion",
    ]
    no_relation_label_idx = label_list.index("no_relation")
    label_indices = list(range(len(label_list)))
    label_indices.remove(no_relation_label_idx)
    # print(f'####### F1 {len(labels)} {len(preds)} {len(label_indices)}')
    return sklearn.metrics.f1_score(labels, preds, average="micro", labels=label_indices) * 100.0


def klue_re_auprc(probs, labels):
    """KLUE-RE AUPRC (with no_relation)"""
    labels = np.eye(30)[labels]

    score = np.zeros((30,))
    for c in range(30):
        targets_c = labels.take([c], axis=1).ravel()
        preds_c = probs.take([c], axis=1).ravel()
        precision, recall, _ = sklearn.metrics.precision_recall_curve(targets_c, preds_c)
        score[c] = sklearn.metrics.auc(recall, precision)
    return np.average(score) * 100.0


def compute_metrics(pred):
    """validationÏùÑ ÏúÑÌïú metrics function"""
    # print(f'############## COMPUTE METRICS {len(pred.label_ids)} {len(pred.predictions.argmax(-1))} {len(pred.predictions)}')
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    probs = pred.predictions

    # calculate accuracy using sklearn's function
    f1 = klue_re_micro_f1(preds, labels)
    auprc = klue_re_auprc(probs, labels)
    # acc = accuracy_score(labels, preds)  # Î¶¨ÎçîÎ≥¥Îìú ÌèâÍ∞ÄÏóêÎäî Ìè¨Ìï®ÎêòÏßÄ ÏïäÏäµÎãàÎã§.

    return {
        "micro f1 score": f1,
        "auprc": auprc,
        # "accuracy": acc,
    }


def label_to_num(label):
    num_label = []
    with open("dict_label_to_num.pkl", "rb") as f:
        dict_label_to_num = pickle.load(f)
    for v in label:
        num_label.append(dict_label_to_num[v])

    return num_label

# loss

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

# https://github.com/clcarwin/focal_loss_pytorch/blob/master/focalloss.py
class FocalLoss(nn.Module):
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha,(float,int)): self.alpha = torch.Tensor([alpha,1-alpha])
        if isinstance(alpha,list): self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        if input.dim()>2:
            input = input.view(input.size(0),input.size(1),-1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1,2)    # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1,input.size(2))   # N,H*W,C => N*H*W,C
        target = target.view(-1,1)

        logpt = F.log_softmax(input)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = Variable(logpt.data.exp())

        if self.alpha is not None:
            if self.alpha.type()!=input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0,target.data.view(-1))
            logpt = logpt * Variable(at)

        loss = -1 * (1-pt)**self.gamma * logpt
        if self.size_average: return loss.mean()
        else: return loss.sum()

# custom model

In [15]:
from torch import nn
from transformers import AutoTokenizer, AutoModel, AutoConfig
""" Define Custom Model -> will later allocated to models.py """

class CustomModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model_name = CFG.model_name
        self.num_labels = CFG.num_labels
        self.input_size = CFG.input_size
        self.output_size = CFG.output_size
        self.num_rnn_layers = 3
        self.dropout_rate = 0.1
        self.is_train = True
        self.backbone_model = AutoModel.from_pretrained(self.model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
        # special_tokens_dict = {
        #     'additional_special_tokens': CFG.special_token_list
        # }
        # num_added_tokens = self.tokenizer.add_special_tokens(special_tokens_dict)
        # self.backbone_model.resize_token_embeddings(len(self.tokenizer))
        # add bidrectional gru (multiple) layers in the end

        self.lstm = nn.LSTM(
            # set as BERT model's hidden size, not as an integer: flexible for different models
            input_size=1024,
            hidden_size=self.output_size,
            bidirectional=True,
            batch_first=True,
            num_layers=self.num_rnn_layers,
            dropout=self.dropout_rate
            )

        # classifierÏùÄ Î∞îÍæ∏ÏßÄ ÏïäÍ≥†
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(2*self.output_size,self.output_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(p=self.dropout_rate, inplace=False),
            torch.nn.Linear(self.output_size,self.num_labels)
        )

    def forward(self, input_ids, attention_mask):
        backbone_output = self.backbone_model(input_ids=input_ids, attention_mask=attention_mask)

        # add lstm layer
        _, (hn,cn) = self.lstm(backbone_output[0]) # torch.Size([batch, 132, 1024]) -> hn : torch.Size([6, batch, 768])
        # hn[0] -> left to right, hn[1] -> right to left
        
        # b_n=hn.size()[1] # batch size
        # hn=torch.transpose(hn, 0, 1).reshape(b_n, -1)   # torch.Size([batch, 6*768])
        # hn=torch.transpose(hn, 1, 0).reshape(b_n, -1)   # torch.Size([batch, 6*768])

        # input as fully connected layers
        output_lstm = torch.cat((hn[0], hn[1]), dim=-1) # torch.Size([batch, 2*768])
        output = self.classifier(output_lstm)    #  torch.Size([batch, 30])
        # print(f'output : {output}')
        return output

In [11]:
model = CustomModel()

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

In [None]:
x = torch.randn(6, 32, 768)
x=torch.transpose(x, 32, 0).reshape(32, -1)
x.shape

torch.Size([1, 4608])

In [14]:
# from torchsummary import summary
# summary(model,[(32,132)], [(32,132)])

# train

In [27]:
training_args = TrainingArguments(
    report_to = 'wandb',              
    output_dir= CFG.result_dir,          # output directory
    save_total_limit=5,              # number of total save model.
    save_steps=CFG.save_steps,       # model saving step.
    num_train_epochs=CFG.num_epochs,              # total number of training epochs
    learning_rate=CFG.learning_rate,               # learning_rate
    weight_decay=CFG.weight_decay,
    logging_dir= CFG.logging_dir, 
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    logging_steps=CFG.evaluation_steps,              # log saving step.
    evaluation_strategy='steps', # evaluation strategy to adopt during training
                                # `no`: No evaluation during training.
                                # `steps`: Evaluate every `eval_steps`.
                                # `epoch`: Evaluate every end of epoch.
    eval_steps = CFG.evaluation_steps,            # evaluation step.
    load_best_model_at_end = True,
    group_by_length = True, # dynamic padding
    warmup_steps=300,
    dataloader_num_workers = CFG.num_workers,
    metric_for_best_model = 'f1',
    run_name = 'add_lstm',
)

PyTorch: setting up devices


In [28]:
def makedirs(path) :
    try :
        os.makedirs(path)
    except OSError :
        if not os.path.isdir(path) :
            raise

In [29]:
device = torch.device('cuda:0' if torch.cuda.is_available() and CFG.DEBUG == False else 'cpu')
print(device)

cuda:0


In [30]:
# model = CustomModel()

In [31]:
from transformers import Trainer

loss_fn = FocalLoss(gamma=0.5)
# loss_fn = nn.CrossEntropyLoss()

class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False) :
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        loss = loss_fn(outputs, labels)
        return (loss, outputs) if return_outputs else loss


In [33]:
# dataset = pd.read_csv('/opt/ml/dataset/train/train_typed_entity_marker_punct.csv')
dataset = pd.read_csv(PORORO_TRAIN_PATH)


In [None]:
dataset.info()
len(dataset)

In [34]:
models = []
dataset = dataset[:5001]
stf = StratifiedKFold(n_splits = 5)
for fold, (train_idx, dev_idx) in enumerate(stf.split(dataset, list(dataset['label']))) :
    print('Fold {}'.format(fold + 1))

    model = CustomModel()
    model.to(device)

    # Ï∂îÍ∞ÄÌïú token Í∞úÏàòÎßåÌÅº token embedding size ÎäòÎ†§Ï£ºÍ∏∞
    # model.resize_token_embeddings(model.tokenizer.vocab_size + added_token_num)

    train_dataset = dataset.iloc[train_idx]
    dev_dataset = dataset.iloc[dev_idx]

    train_label = label_to_num(train_dataset['label'].values)
    dev_label = label_to_num(dev_dataset['label'].values)

    tokenized_train = tokenized_dataset(train_dataset, model.tokenizer)
    tokenized_dev = tokenized_dataset(dev_dataset, model.tokenizer)

    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    trainer = MyTrainer(
        model=model,                         # the instantiated ü§ó Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=RE_train_dataset,         # training dataset
        eval_dataset=RE_dev_dataset,             # evaluation dataset
        compute_metrics=compute_metrics,         # define metrics function
    )
   
    trainer.train()
    
    # save model
    makedirs(f'./best_model/lstm/fold_{fold}/')
    model.save_pretrained(f'./best_model/lstm/fold_{fold}/')
    
    # Prevent OOM error
    model.cpu()
    del model
    torch.cuda.empty_cache()

    

Fold 1


loading configuration file https://huggingface.co/klue/roberta-large/resolve/main/config.json from cache at /opt/ml/.cache/huggingface/transformers/571e05a2160c18c93365862223c4dae92bbd1b41464a4bd5f372ad703dba6097.ae5b7f8d8a28a3ff0b1560b4d08c6c3bd80f627288eee2024e02959dd60380d0
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.10.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}

loading wei

Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 6494
  Batch size = 32


ValueError: Found input variables with inconsistent numbers of samples: [6494, 6291]

In [29]:
models = []
stf = StratifiedKFold(n_splits = 5)
for fold, (train_idx, dev_idx) in enumerate(stf.split(dataset, list(dataset['label']))) :
    print('Fold {}'.format(fold + 1))

    model = CustomModel()
    model.to(device)

    # Ï∂îÍ∞ÄÌïú token Í∞úÏàòÎßåÌÅº token embedding size ÎäòÎ†§Ï£ºÍ∏∞
    # model.resize_token_embeddings(model.tokenizer.vocab_size + added_token_num)

    train_dataset = dataset.iloc[train_idx]
    dev_dataset = dataset.iloc[dev_idx]

    train_label = label_to_num(train_dataset['label'].values)
    dev_label = label_to_num(dev_dataset['label'].values)

    tokenized_train = tokenized_dataset(train_dataset, model.tokenizer)
    tokenized_dev = tokenized_dataset(dev_dataset, model.tokenizer)

    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    trainer = MyTrainer(
        model=model,                         # the instantiated ü§ó Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=RE_train_dataset,         # training dataset
        eval_dataset=RE_dev_dataset,             # evaluation dataset
        compute_metrics=compute_metrics,         # define metrics function
    )
   
    trainer.train()
    
    # save model
    makedirs(f'./best_model/lstm/fold_{fold}/')
    model.save_pretrained(f'./best_model/lstm/fold_{fold}/')
    
    # Prevent OOM error
    model.cpu()
    del model
    torch.cuda.empty_cache()

    

Fold 1


loading configuration file https://huggingface.co/klue/roberta-large/resolve/main/config.json from cache at /opt/ml/.cache/huggingface/transformers/571e05a2160c18c93365862223c4dae92bbd1b41464a4bd5f372ad703dba6097.ae5b7f8d8a28a3ff0b1560b4d08c6c3bd80f627288eee2024e02959dd60380d0
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.10.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}

loading wei

Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 6494
  Batch size = 1


ValueError: Found input variables with inconsistent numbers of samples: [6494, 0]