In [1]:
import copy
import logging
import os
import random

import numpy as np
import torch
from sklearn.metrics import accuracy_score
from tqdm import tqdm

from transformers import TrainingArguments, Trainer, TrainerCallback
from transformers import AutoConfig, BigBirdTokenizer, BigBirdForMaskedLM, BigBirdForPreTraining

# os.environ["TOKENIZERS_PARALLELISM"] = "true"

2023-05-10 13:19:53.841525: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-10 13:19:53.956697: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-10 13:19:54.480517: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.6/lib64:
2023-05-10 13:19:54.480574: W tensorflow/compiler/xla/strea

## Seed 고정

In [2]:
def seed_everything(seed:int = 1004):
    """Seed every random number generator used by this notebook.

    Covers Python's ``random``, NumPy, the ``PYTHONHASHSEED`` environment
    variable and PyTorch (CPU, current GPU and all GPUs), then switches cuDNN
    to deterministic mode with auto-tuning disabled.

    Args:
        seed: Seed value applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,      # current GPU
        torch.cuda.manual_seed_all,  # every GPU
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    # When True, cuDNN picks whichever algorithm suits the GPU best.
    torch.backends.cudnn.benchmark = False

# Fix every RNG seed before any model or data object is created.
seed_everything(42)

# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

cuda


## Config

In [3]:
# Checkpoint identifier passed to every from_pretrained() call below (config, tokenizer, model).
MODEL_NAME = 'google/bigbird-roberta-base'

In [4]:
# Load the checkpoint's configuration for inspection; in the visible cells it is only printed.
config = AutoConfig.from_pretrained(MODEL_NAME)  

print(config)

BigBirdConfig {
  "_name_or_path": "google/bigbird-roberta-base",
  "architectures": [
    "BigBirdForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 4096,
  "model_type": "big_bird",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_random_blocks": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rescale_embeddings": false,
  "sep_token_id": 66,
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_bias": true,
  "use_cache": true,
  "vocab_size": 50358
}



In [5]:
tokenizer = BigBirdTokenizer.from_pretrained(MODEL_NAME)  # reuse the tokenizer already trained for this checkpoint

In [6]:
tokenizer

PreTrainedTokenizer(name_or_path='google/bigbird-roberta-base', vocab_size=50358, model_max_len=4096, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [7]:
tokenizer("Hello I'm World!"), tokenizer.decode(tokenizer.encode("Hello I'm World!"))

({'input_ids': [65, 18536, 415, 1202, 2260, 101, 66], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]},
 "[CLS] Hello I'm World![SEP]")

In [8]:
tokenizer.model_max_length

4096

In [9]:
# Load the checkpoint into the masked-LM architecture. The load-time message below reports
# that the checkpoint's cls.seq_relationship.* weights are not used by this class.
model = BigBirdForMaskedLM.from_pretrained(MODEL_NAME)

print(model)

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BigBirdForMaskedLM(
  (bert): BigBirdModel(
    (embeddings): BigBirdEmbeddings(
      (word_embeddings): Embedding(50358, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BigBirdEncoder(
      (layer): ModuleList(
        (0): BigBirdLayer(
          (attention): BigBirdAttention(
            (self): BigBirdBlockSparseAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): BigBirdSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (drop

In [10]:
print(model.num_parameters())

128111286


In [11]:
from torchinfo import summary

summary(model)

Layer (type:depth-idx)                                            Param #
BigBirdForMaskedLM                                                --
├─BigBirdModel: 1-1                                               --
│    └─BigBirdEmbeddings: 2-1                                     --
│    │    └─Embedding: 3-1                                        38,674,944
│    │    └─Embedding: 3-2                                        3,145,728
│    │    └─Embedding: 3-3                                        1,536
│    │    └─LayerNorm: 3-4                                        1,536
│    │    └─Dropout: 3-5                                          --
│    └─BigBirdEncoder: 2-2                                        --
│    │    └─ModuleList: 3-6                                       85,054,464
│    └─Linear: 2-3                                                590,592
│    └─Tanh: 2-4                                                  --
├─BigBirdOnlyMLMHead: 1-2                                       

## 데이터 준비

In [12]:
import pandas as pd

# Read the preprocessed PANDORA CSV; the first CSV column becomes the DataFrame index.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
pandora = pd.read_csv("/home/user/10TB/PANDORA/pandora_preprocessed.csv", index_col = 0)

In [14]:
type(pandora['body'])

pandas.core.series.Series

In [15]:
class bigbird_Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes raw text lazily, one sample per access.

    Nothing is tokenized up front: each ``__getitem__`` call runs the tokenizer
    on a single text, asking it to truncate and pad to ``max_length`` tokens.

    Args:
        data: Sequence of preprocessed text strings, indexable by integer.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length", truncation=True)``.
        max_length: Token length requested for every sample. Defaults to 1024,
            the value that was previously hard-coded.
    """

    def __init__(self, data:list, tokenizer, max_length:int = 1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Tokenize and return the sample at position ``idx``.

        Returns whatever the tokenizer returns; no tensors are requested here,
        so conversion to tensors is left to the caller (e.g. a data collator).
        """
        text = self.data[idx]
        tokens = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,  # drop everything beyond max_length
        )
        return tokens

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

In [16]:
# Wrap the 'body' column (converted to a plain Python list) in the dataset defined above.
tokenized_whole_dataset = bigbird_Dataset(pandora['body'].to_list(), tokenizer)

In [17]:
tokenized_whole_dataset[970]

{'input_ids': [65, 9499, 9061, 876, 3689, 571, 11911, 988, 415, 867, 911, 446, 922, 2507, 523, 415, 1565, 1908, 387, 456, 618, 10392, 385, 408, 388, 3739, 452, 2607, 439, 7767, 27010, 16710, 576, 774, 4854, 4307, 774, 569, 7611, 385, 567, 944, 27010, 419, 33703, 6128, 385, 358, 410, 1476, 2149, 1565, 378, 6339, 611, 391, 12199, 441, 44942, 2242, 446, 3606, 363, 7705, 388, 363, 2424, 10490, 131, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [18]:
# Dataset size, one tokenized sample, and that sample decoded back to text.
print(len(tokenized_whole_dataset))
sample_970 = tokenized_whole_dataset[970]
print(sample_970)
print(tokenizer.decode(sample_970['input_ids']))

14899311
{'input_ids': [65, 9499, 9061, 876, 3689, 571, 11911, 988, 415, 867, 911, 446, 922, 2507, 523, 415, 1565, 1908, 387, 456, 618, 10392, 385, 408, 388, 3739, 452, 2607, 439, 7767, 27010, 16710, 576, 774, 4854, 4307, 774, 569, 7611, 385, 567, 944, 27010, 419, 33703, 6128, 385, 358, 410, 1476, 2149, 1565, 378, 6339, 611, 391, 12199, 441, 44942, 2242, 446, 3606, 363, 7705, 388, 363, 2424, 10490, 131, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Data collator

In [19]:
from transformers import DataCollatorForWholeWordMask, DataCollatorForLanguageModeling

# Collator configured for masked-language modelling (mlm=True) with mlm_probability=0.15,
# returning PyTorch tensors. DataCollatorForWholeWordMask is imported but not used in the
# visible cells.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15, return_tensors='pt'
)

In [20]:
# Smoke test: show one tokenized sample, collate it as a batch of one, and decode
# the collated input ids back to text.
raw_sample = tokenized_whole_dataset[970]
print(raw_sample)

col_test = data_collator.torch_call([raw_sample])
print(col_test)

collated_ids = col_test['input_ids'][0].tolist()
tokenizer.decode(collated_ids)

{'input_ids': [65, 9499, 9061, 876, 3689, 571, 11911, 988, 415, 867, 911, 446, 922, 2507, 523, 415, 1565, 1908, 387, 456, 618, 10392, 385, 408, 388, 3739, 452, 2607, 439, 7767, 27010, 16710, 576, 774, 4854, 4307, 774, 569, 7611, 385, 567, 944, 27010, 419, 33703, 6128, 385, 358, 410, 1476, 2149, 1565, 378, 6339, 611, 391, 12199, 441, 44942, 2242, 446, 3606, 363, 7705, 388, 363, 2424, 10490, 131, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

"[CLS] Ha relax We aren't arguing But I see where you're coming from I always thought of as more[MASK] cuisine be in touch with everyone's feelings Leslie cares[MASK] she definitely knows she has shit to do And Leslie is NEVER planned to a T She almost always fucks up and wings it Lol Do you remember the hole in the ground arc?[SEP]<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [21]:
class CustomCallback(TrainerCallback):
    """Print the trainer's running masked-token accuracy whenever a log is due.

    Args:
        trainer: Object exposing the running counters ``total_correct_rfp`` and
            ``total_element_rfp``. May be omitted, in which case the accuracy
            is reported as unavailable.
    """

    def __init__(self, trainer=None):
        super().__init__()
        # Stored so on_step_end can read the accuracy counters kept on the trainer.
        self._trainer = trainer

    def on_step_end(self, args, state, control, **kwargs):
        """Report accuracy at the end of a step if ``control.should_log`` is set.

        Returns:
            A deep copy of ``control`` when a log is due, otherwise ``None``.
        """
        if not control.should_log:
            return None

        print("Logging")
        # Snapshot the control flags before doing anything else with the trainer.
        control_copy = copy.deepcopy(control)

        # Read the counters defensively: the trainer may be missing or may not track them.
        scored = getattr(self._trainer, "total_element_rfp", 0)
        correct = getattr(self._trainer, "total_correct_rfp", 0)
        accuracy = correct / scored * 100 if scored else None
        print(f"Step {state.global_step}: training accuracy = {accuracy}")
        return control_copy

In [22]:
from datasets import load_metric
metric = load_metric('accuracy')

def cal_acc(eval_pred):
    """Compute accuracy from ``(predictions, labels)``, ignoring ``-100`` labels.

    The predicted class is the argmax over the last axis of the logits, so both
    ``(batch, classes)`` and ``(batch, seq, vocab)`` inputs are handled. Label
    positions equal to ``-100`` are excluded from the score.

    Args:
        eval_pred: Pair ``(predictions, labels)``. If ``predictions`` is a
            tuple, its first element is taken as the logits.

    Returns:
        ``{"accuracy": float}`` with a value in ``[0, 1]``; ``0.0`` when no
        label position is scored.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # The class/vocabulary axis is always last; axis=1 would pick the sequence axis for 3-D logits.
    predicted_ids = np.argmax(np.asarray(predictions), axis=-1)
    labels = np.asarray(labels)

    scored = labels != -100
    num_scored = int(scored.sum())
    if num_scored == 0:
        return {"accuracy": 0.0}

    num_correct = int((predicted_ids[scored] == labels[scored]).sum())
    return {"accuracy": num_correct / num_scored}

  metric = load_metric('accuracy')


In [23]:
class CustomTrainer(Trainer):
    """Trainer that keeps a running masked-token accuracy alongside the loss.

    Every batch passed through ``compute_loss`` adds to two counters: the
    number of scored label positions (label != -100) and how many of those the
    model predicted correctly. Whenever ``state.global_step`` is a multiple of
    ``state.max_steps // state.num_train_epochs`` the accuracy is printed and
    the counters are reset.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.total_element_rfp = 0  # scored label positions since the last report
        self.total_correct_rfp = 0  # correctly predicted positions since the last report
        self.count = 0
        self.others = 0
        # Last global step that was reported, so a step is reported at most once.
        self._last_reported_step = None

    def _update_accuracy(self, logits, labels):
        """Add one batch's correct/scored token counts to the running totals."""
        labels = labels.detach().to(logits.device)
        scored = labels.ne(-100)
        num_scored = int(scored.sum().item())
        if num_scored == 0:
            return
        # Select scored positions first, then argmax on the logits' own device,
        # so only two scalars are ever copied to the host.
        predicted = logits.detach()[scored].argmax(dim=-1)
        self.total_correct_rfp += int(predicted.eq(labels[scored]).sum().item())
        self.total_element_rfp += num_scored

    def _report_epoch_accuracy(self):
        """Print and reset the running accuracy at an epoch-boundary step."""
        max_steps = self.state.max_steps
        num_epochs = self.state.num_train_epochs
        if not max_steps or not num_epochs:
            return
        steps_per_epoch = max_steps // num_epochs
        if steps_per_epoch <= 0:
            return

        step = self.state.global_step
        if step % steps_per_epoch != 0 or step == self._last_reported_step:
            return
        if self.total_element_rfp == 0:
            # Nothing scored yet; avoid dividing by zero.
            return

        acc = self.total_correct_rfp / self.total_element_rfp * 100
        print("========{} epoch========".format(step / steps_per_epoch))
        print("Global Step: ", step)
        print("Max Steps: ", max_steps)
        print("Num Train Epochs: ", num_epochs)
        print("acc: ", acc)
        print("================================")
        print("# total correct:", self.total_correct_rfp)
        print("# total element:", self.total_element_rfp)

        self._last_reported_step = step
        self.count = 0
        self.total_element_rfp = 0
        self.total_correct_rfp = 0

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.
        Subclass and override for custom behavior.

        In addition to the loss, this override updates the running accuracy
        counters from the batch's logits and labels.

        Args:
            model: Model called as ``model(**inputs)``.
            inputs: Batch mapping; ``"labels"`` is used for accuracy when present.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: Accepted so callers that pass this keyword do
                not fail; it is not used here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # Keep a handle on the labels before the label-smoothing branch may pop them.
        accuracy_labels = inputs["labels"] if "labels" in inputs else None

        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)

        if accuracy_labels is not None:
            logits = getattr(outputs, "logits", None)
            if logits is not None:
                self._update_accuracy(logits, accuracy_labels)
                self._report_epoch_accuracy()

        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

In [24]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="/home/user/10TB/personalityAI/BigBird_MLMmodel",
    logging_dir="/home/user/10TB/personalityAI/BigBird_MLMlog",
    # Optimisation schedule.
    num_train_epochs=5,
    per_device_train_batch_size=4,
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Log and checkpoint once per epoch.
    logging_strategy="epoch",
    save_strategy="epoch",
    # Data loading.
    dataloader_num_workers=16,
    # Disabled options kept for reference:
    #   max_steps=1000, gradient_accumulation_steps=16,
    #   per_device_eval_batch_size=16, eval_accumulation_steps=32,
    #   warmup_steps=1643, evaluation_strategy="steps" (needs an eval_dataset),
    #   eval_steps=10, save_steps=1000, save_total_limit=10
)

# No eval_dataset is passed, so only the training set is supplied here.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_whole_dataset,
    data_collator=data_collator,
    compute_metrics=cal_acc,
)

In [None]:
#trainer.add_callback(CustomCallback(trainer))
trainer.train()

***** Running training *****
  Num examples = 14899311
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4656035
  Number of trainable parameters = 128111286


Global Step:  0
Max Steps:  4656035
Num Train Epochs:  5
acc:  50.0
# total correct: 47
# total element: 94


Step,Training Loss


In [None]:
trainer.save_model("/home/user/10TB/personalityAI/BigBird_final")

In [None]:
from transformers import pipeline

# Load the model from the directory written by trainer.save_model(...) in the previous
# cell. The earlier path pointed at a different project's directory that this notebook
# never writes to.
fill_mask = pipeline(
    "fill-mask",
    model="/home/user/10TB/personalityAI/BigBird_final",
    tokenizer= tokenizer
)

In [None]:
# NOTE(review): this probe sentence is Korean, while the tokenizer and model in this
# notebook come from google/bigbird-roberta-base (presumably an English checkpoint) —
# confirm this is the intended test input.
# The backslash sits inside the string literal, so the next line's leading spaces
# become part of the text passed to the pipeline.
fill_mask("[MASK] 관리업은 [MASK] 관리․감독이나 \
          증권거래 활동 및 기타 금융기관의 활동을 규제 또는 감독하는 [MASK]을 말한다.")