In [1]:
import os
import random
import numpy as np
from tqdm import tqdm
import torch
import logging
from sklearn.metrics import accuracy_score

from transformers import TrainingArguments, Trainer, TrainerCallback
from transformers.trainer_pt_utils import _get_learning_rate
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer

from transformers import RobertaTokenizer
from transformers import RobertaForMaskedLM
from transformers import RobertaConfig

# os.environ["TOKENIZERS_PARALLELISM"] = "true"

## Seed 고정

In [2]:
def seed_everything(seed:int = 1004):
    """Seed the Python, NumPy and PyTorch RNGs and configure cuDNN flags.

    Args:
        seed: Value passed to every seeding call and written (as a string)
            to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # One seed for every generator: stdlib random, NumPy, torch (CPU),
    # the current GPU, and all GPUs.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # benchmark=True would let cuDNN pick the algorithm best suited to the GPU;
    # it is switched off here and deterministic mode is switched on.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed every RNG with 42 (this overrides the function's default of 1004).
seed_everything(42)

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced again in the visible cells.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## Config

In [3]:
# Checkpoint identifier passed to every `from_pretrained` call in this notebook.
MODEL_NAME = 'FacebookAI/roberta-base'

In [4]:
# Load the configuration that ships with MODEL_NAME and print it for inspection.
config = AutoConfig.from_pretrained(MODEL_NAME)  

print(config)

RobertaConfig {
  "_name_or_path": "FacebookAI/roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.42.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # reuse the already-trained tokenizer of the checkpoint; no tokenizer training happens here

In [6]:
# Display the tokenizer's repr (class, vocabulary size, special tokens).
tokenizer

RobertaTokenizerFast(name_or_path='FacebookAI/roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [7]:
# Round-trip check: tokenize a sample sentence, then encode it and decode the ids back to text.
tokenizer("Hello I'm World!"), tokenizer.decode(tokenizer.encode("Hello I'm World!"))

({'input_ids': [0, 31414, 38, 437, 623, 328, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]},
 "<s>Hello I'm World!</s>")

In [8]:
# Maximum sequence length reported by the tokenizer.
tokenizer.model_max_length

512

In [9]:
# Load the pretrained masked-LM model for MODEL_NAME with the config loaded earlier,
# then print its module tree.
model = RobertaForMaskedLM.from_pretrained(MODEL_NAME, config=config)

print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [10]:
# Total number of parameters in the model.
print(model.num_parameters())

124697433


In [11]:
from torchinfo import summary

# Per-layer parameter breakdown of the model.
summary(model)

Layer (type:depth-idx)                                       Param #
RobertaForMaskedLM                                           --
├─RobertaModel: 1-1                                          --
│    └─RobertaEmbeddings: 2-1                                --
│    │    └─Embedding: 3-1                                   38,603,520
│    │    └─Embedding: 3-2                                   394,752
│    │    └─Embedding: 3-3                                   768
│    │    └─LayerNorm: 3-4                                   1,536
│    │    └─Dropout: 3-5                                     --
│    └─RobertaEncoder: 2-2                                   --
│    │    └─ModuleList: 3-6                                  85,054,464
├─RobertaLMHead: 1-2                                         --
│    └─Linear: 2-3                                           590,592
│    └─LayerNorm: 2-4                                        1,536
│    └─Linear: 2-5                                           38,65

## 데이터 준비

In [12]:
import pandas as pd

# Earlier data source, kept for reference:
# pandora = pd.read_csv("/home/user/10TB/Data/PANDORA/Sampling/pandora_randset1(10).csv", index_col = 0)

# Read the corpus CSV, using its first column as the index.
# NOTE(review): absolute, machine-specific path; the notebook will not run elsewhere
# without editing it. The execution counter also skips from 12 to 14 after this cell,
# so a cell that was run here has since been removed; confirm nothing depends on it.
pandora = pd.read_csv("/mnt/HDD8TB/Data/Pandora/CLIP_sim_pandora.csv", index_col=0)

In [14]:
class RoBERTa_Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes raw text lazily, one sample per lookup.

    Args:
        data: The (already preprocessed) texts to tokenize.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length", truncation=True)``.
        max_length: Length every sample is padded or truncated to. Defaults to
            512, the value that used to be hard-coded in ``__getitem__``.
    """

    def __init__(self, data:list, tokenizer, max_length:int = 512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Tokenize and return the sample at position ``idx``."""
        # return_tensors is deliberately not passed: the tokenizer's default
        # output is returned unchanged and batching is left to the caller.
        return self.tokenizer(
            self.data[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,  # anything beyond max_length is dropped
        )

    def __len__(self):
        """Number of samples."""
        return len(self.data)

In [15]:
# Wrap the 'body' column as a dataset; each text is tokenized when its item is accessed.
# NOTE(review): assumes every 'body' value is a string; confirm there are no missing values.
tokenized_whole_dataset = RoBERTa_Dataset(pandora['body'].to_list(), tokenizer)

In [16]:
# Look at one tokenized sample.
tokenized_whole_dataset[970]

{'input_ids': [0, 5975, 167, 32, 1256, 3280, 782, 98, 14, 18, 45, 98, 24941, 15467, 114, 402, 723, 62, 5, 24393, 9, 782, 21, 576, 66, 13, 481, 101, 1265, 3130, 1265, 74, 28, 269, 5373, 235, 116, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [17]:
# Dataset size, one tokenized sample, and that sample decoded back to text.
print(len(tokenized_whole_dataset))
print(tokenized_whole_dataset[970])
print(tokenizer.decode(tokenized_whole_dataset[970]['input_ids']))

1490000
{'input_ids': [0, 5975, 167, 32, 1256, 3280, 782, 98, 14, 18, 45, 98, 24941, 15467, 114, 402, 723, 62, 5, 24393, 9, 782, 21, 576, 66, 13, 481, 101, 1265, 3130, 1265, 74, 28, 269, 5373, 235, 116, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

## Data collator

In [18]:
# NOTE(review): DataCollatorForWholeWordMask is imported but not used in the visible cells.
from transformers import DataCollatorForWholeWordMask, DataCollatorForLanguageModeling

# Collator for masked language modeling: mlm=True enables masking,
# mlm_probability=0.15 sets the masking rate, and batches are returned as PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15, return_tensors='pt'
)

In [19]:
# Smoke test of the collator on a single sample: show the raw sample,
# the collated batch, and the (masked) input decoded back to text.
print(tokenized_whole_dataset[970])

col_test = data_collator.torch_call([tokenized_whole_dataset[970]])
print(col_test)

tokenizer.decode(col_test['input_ids'][0].tolist())

{'input_ids': [0, 5975, 167, 32, 1256, 3280, 782, 98, 14, 18, 45, 98, 24941, 15467, 114, 402, 723, 62, 5, 24393, 9, 782, 21, 576, 66, 13, 481, 101, 1265, 3130, 1265, 74, 28, 269, 5373, 235, 116, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

"<s>Now those are pretty basic needs so that's not<mask> unreasonable Imagine if something higher up the hierarchy of needs was given<mask> for free like education Free education would be really crazy right?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [20]:
class CustomCallback(TrainerCallback):
    """Callback that prints the trainer's running masked-token accuracy on logging steps.

    Args:
        trainer: The trainer whose ``total_correct_rfp`` / ``total_element_rfp``
            counters are read. Optional so that ``CustomCallback()`` keeps working;
            without a trainer the accuracy is reported as NaN.
    """

    def __init__(self, trainer=None):
        super().__init__()
        # Previously never assigned, so every access to self._trainer raised AttributeError.
        self._trainer = trainer

    def on_step_end(self, args, state, control, **kwargs):
        """Print the running accuracy when ``control.should_log`` is set.

        Returns:
            A deep copy of ``control`` on logging steps, otherwise ``None``.
        """
        # Local import: `copy` is not imported at the top of the notebook,
        # which made the original deepcopy call raise NameError.
        import copy

        if control.should_log:
            print("Logging")
            control_copy = copy.deepcopy(control)

            # Trainer exposes no `progress_bar.average`; read the counters that
            # the trainer maintains instead (missing counters count as 0).
            seen = getattr(self._trainer, "total_element_rfp", 0)
            hits = getattr(self._trainer, "total_correct_rfp", 0)
            acc = hits / seen * 100 if seen else float("nan")

            print(f"Step {state.global_step}: training accuracy = {acc}")
            #self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            return control_copy

In [21]:
class CustomTrainer(Trainer):
    """Trainer that also tracks running masked-token accuracy inside ``compute_loss``.

    Accuracy is accumulated over every call to ``compute_loss`` and printed
    (then reset) whenever ``state.global_step`` is a multiple of
    ``ACC_REPORT_STEPS`` or equals ``state.max_steps``.
    """

    # Step interval between accuracy reports (previously a literal 50000 in compute_loss).
    ACC_REPORT_STEPS = 50000

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.total_element_rfp = 0  # label positions seen since the last report (label != -100)
        self.total_correct_rfp = 0  # of those, positions where argmax(logits) == label
        self.count = 0
        self.others = 0

    def _update_masked_accuracy(self, logits, labels):
        """Add one batch to the running counters and return the running accuracy in percent."""
        # argmax runs on the logits' own device, so only small integer/bool
        # tensors are reduced; the full logits tensor is no longer copied to the CPU.
        labels = labels.detach().to(logits.device)
        scored = labels.ne(-100)  # positions labelled -100 are excluded from the accuracy
        hits = logits.detach().argmax(dim=-1).eq(labels) & scored

        self.total_correct_rfp += hits.sum().item()
        self.total_element_rfp += scored.sum().item()

        # Right after a reset, a batch without any scored position leaves the
        # denominator at 0; report NaN instead of raising ZeroDivisionError.
        if self.total_element_rfp == 0:
            return float("nan")
        return self.total_correct_rfp / self.total_element_rfp * 100

    def _report_masked_accuracy(self, acc):
        """Print the running accuracy with the current step / LR, then reset the counters."""
        print("Global Step: ", self.state.global_step)
        print("Max Steps: ", self.state.max_steps)
        print("Num Train Epochs: ", self.state.num_train_epochs)
        print("acc: ", acc)
        print("LR: ", self._get_learning_rate())
        print("================================")
        print("# total correct:", self.total_correct_rfp)
        print("# total element:", self.total_element_rfp)
        self.count = 0
        self.total_element_rfp = 0
        self.total_correct_rfp = 0

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.
        Subclass and override for custom behavior.

        Args:
            model: Model called as ``model(**inputs)``.
            inputs: Batch mapping; ``labels`` is popped from it when a label smoother is configured.
            return_outputs: When true, return ``(loss, outputs)`` instead of just the loss.
            num_items_in_batch: Accepted so that callers which pass this keyword do not
                fail with a TypeError; it is not used here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)

        # Accuracy bookkeeping. Use the popped labels when the label smoother took
        # them out of `inputs`; otherwise accuracy would be skipped silently.
        if labels is not None:
            targets = labels
        elif "labels" in inputs:
            targets = inputs["labels"]
        else:
            targets = None

        if targets is not None:
            acc = self._update_masked_accuracy(outputs.logits, targets)

            # NOTE(review): if compute_loss runs more than once per global step
            # (e.g. gradient accumulation), this fires for each of those calls.
            if (self.state.global_step % self.ACC_REPORT_STEPS) == 0 or self.state.global_step == self.state.max_steps:
                self._report_masked_accuracy(acc)

        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

In [22]:
# Training configuration.
# NOTE(review): output_dir / logging_dir are absolute, machine-specific paths.
training_args = TrainingArguments(
    output_dir="/mnt/HDD8TB/PersonalityAI/RoBERTa_pandora_align_MLMmodel",
    logging_dir= "/mnt/HDD8TB/PersonalityAI/RoBERTa_pandora_align_MLMlog",
    num_train_epochs=20,
    learning_rate = 1e-4,
   # max_steps=1000,
    per_device_train_batch_size=16,
#    gradient_accumulation_steps = 16,
#    per_device_eval_batch_size = 16,
#    eval_accumulation_steps = 32,
    logging_strategy = "steps",
    save_strategy = "steps",
    lr_scheduler_type = "linear",
#    dataloader_num_workers = 16,
    warmup_steps = 10000,
    weight_decay=0.01,
#    warmup_steps = 1643
#    evaluation_strategy = "steps", # need a eval_dataset
#    eval_steps = 10,
    # Checkpoints and loss logs are both written every 50000 steps, the same
    # interval CustomTrainer uses for its accuracy printout.
    save_steps=50000,
    logging_steps=50000,
#    save_total_limit=10,
)

# No eval_dataset is passed: only the training set is used.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_whole_dataset,
)

In [23]:
#trainer.add_callback(CustomCallback(trainer))
# Resume training from the saved step-700000 checkpoint rather than starting from scratch.
trainer.train(
    resume_from_checkpoint="/mnt/HDD8TB/PersonalityAI/RoBERTa_pandora_align_MLMmodel/checkpoint-700000"
)

There were missing keys in the checkpoint model loaded: ['lm_head.decoder.weight', 'lm_head.decoder.bias'].


Global Step:  700000
Max Steps:  931260
Num Train Epochs:  20
acc:  52.40963855421686
LR:  2.5102576905542412e-05
# total correct: 87
# total element: 166


Step,Training Loss
750000,2.3837
800000,2.3439
850000,2.3049
900000,2.2712




Global Step:  750000
Max Steps:  931260
Num Train Epochs:  20
acc:  54.735512984291915
LR:  1.967522740594403e-05
# total correct: 4951779
# total element: 9046739




Global Step:  800000
Max Steps:  931260
Num Train Epochs:  20
acc:  55.30915804476807
LR:  1.4247877906345659e-05
# total correct: 5003564
# total element: 9046538




Global Step:  850000
Max Steps:  931260
Num Train Epochs:  20
acc:  55.8467078771502
LR:  8.820528406747283e-06
# total correct: 5052751
# total element: 9047536




Global Step:  900000
Max Steps:  931260
Num Train Epochs:  20
acc:  56.37727528904266
LR:  3.393178907148905e-06
# total correct: 5100659
# total element: 9047367


TrainOutput(global_step=931260, training_loss=0.5751079938067779, metrics={'train_runtime': 529458.0684, 'train_samples_per_second': 56.284, 'train_steps_per_second': 1.759, 'total_flos': 7.8453108062208e+18, 'train_loss': 0.5751079938067779, 'epoch': 20.0})

In [24]:
# Persist the final model. The trainer was built without a tokenizer, so
# save_model() alone writes no tokenizer files; save the tokenizer into the same
# directory so the result can be reloaded from that path alone.
final_model_dir = "/mnt/HDD8TB/PersonalityAI/RoBERTa_pandora_align_MLMmodel_final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)