In [1]:
import os
import random
import numpy as np
from tqdm import tqdm
import torch
from torch import nn, Tensor
import pdb
from typing import Optional, Tuple, Union
import logging
import math
import pandas as pd
from sklearn.metrics import accuracy_score

from transformers import TrainingArguments, Trainer, TrainerCallback, DefaultDataCollator
from transformers import AutoConfig, AlbertTokenizer, AlbertModel, AlbertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput

# os.environ["TOKENIZERS_PARALLELISM"] = "true"


from accelerate import Accelerator, skip_first_batches
from accelerate import __version__ as accelerate_version
from accelerate.utils import DistributedDataParallelKwargs, GradientAccumulationPlugin

2024-02-20 11:27:40.110007: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-20 11:27:40.221076: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-20 11:27:40.788471: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.6/lib64:
2024-02-20 11:27:40.788529: W tensorflow/compiler/xla/strea

## Seed 고정

In [2]:
"""
def seed_everything(seed:int = 1004):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # current gpu seed
    torch.cuda.manual_seed_all(seed) # All gpu seed
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = False  # True로 하면 gpu에 적합한 알고리즘을 선택함.

seed_everything(1004)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
"""

'\ndef seed_everything(seed:int = 1004):\n    random.seed(seed)\n    np.random.seed(seed)\n    os.environ["PYTHONHASHSEED"] = str(seed)\n    torch.manual_seed(seed)\n    torch.cuda.manual_seed(seed)  # current gpu seed\n    torch.cuda.manual_seed_all(seed) # All gpu seed\n    torch.backends.cudnn.deterministic = True  \n    torch.backends.cudnn.benchmark = False  # True로 하면 gpu에 적합한 알고리즘을 선택함.\n\nseed_everything(1004)\n\ndevice = torch.device(\'cuda\' if torch.cuda.is_available() else \'cpu\')\nprint(device)\n'

## Config

In [3]:
MODEL_NAME = 'albert-base-v2'
MODEL_PATH = '/home/user/10TB/personalityAI/Albert_cate_final_20'


# '/home/user/10TB/personalityAI/Albert_CLIP-like_final_20'
# '/home/user/10TB/personalityAI/ALBERT_rand3_final'
# '/home/user/10TB/personalityAI/ALBERT_suzuki_MLMfinal'
# '/home/user/10TB/personalityAI/Albert_rand1_final_20'
# '/home/user/10TB/personalityAI/Albert_rand2_final_20'

In [4]:
config = AutoConfig.from_pretrained(MODEL_NAME)  

print(config)

AlbertConfig {
  "_name_or_path": "albert-base-v2",
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.29.2",
  "type_vocab_size": 2,
  "vocab_size": 30000
}



In [5]:
tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)  # 토크나이저는 학습되어 있는 것으로, BERTTOKENIZE

In [6]:
tokenizer

AlbertTokenizer(name_or_path='albert-base-v2', vocab_size=30000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '<unk>', 'sep_token': '[SEP]', 'pad_token': '<pad>', 'cls_token': '[CLS]', 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)

In [7]:
tokenizer("Hello I'm World!"), tokenizer.decode(tokenizer.encode("Hello I'm World!"))

({'input_ids': [2, 10975, 31, 22, 79, 126, 187, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]},
 "[CLS] hello i'm world![SEP]")

In [8]:
tokenizer.model_max_length

512

In [9]:
class AlbertForCustomClassification(AlbertPreTrainedModel):
    def __init__(self, config, num_labels):
        super().__init__(config)
        self.num_labels = num_labels
        self.config = config
        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        loss_fct = nn.L1Loss()
        
        if labels is not None:
            # loss_fct = loss_fct(logits_O.view(-1, self.num_labels), labels[0])
            loss = loss_fct(logits.squeeze(), labels.squeeze())

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

In [10]:
model = AlbertForCustomClassification.from_pretrained(MODEL_PATH, 5)

print(model)

Some weights of the model checkpoint at /home/user/10TB/personalityAI/Albert_cate_final_20 were not used when initializing AlbertForCustomClassification: ['predictions.dense.bias', 'predictions.decoder.weight', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForCustomClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForCustomClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForCustomClassification were not initialized from the model checkpoint at /home/user/10TB/personalityAI/Albert_cate_final_20 and are new

AlbertForCustomClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, b

In [11]:
print(model.num_parameters())

11687429


In [12]:
from torchinfo import summary

summary(model)

Layer (type:depth-idx)                                       Param #
AlbertForCustomClassification                                --
├─AlbertModel: 1-1                                           --
│    └─AlbertEmbeddings: 2-1                                 --
│    │    └─Embedding: 3-1                                   3,840,000
│    │    └─Embedding: 3-2                                   65,536
│    │    └─Embedding: 3-3                                   256
│    │    └─LayerNorm: 3-4                                   256
│    │    └─Dropout: 3-5                                     --
│    └─AlbertTransformer: 2-2                                --
│    │    └─Linear: 3-6                                      99,072
│    │    └─ModuleList: 3-7                                  7,087,872
│    └─Linear: 2-3                                           590,592
│    └─Tanh: 2-4                                             --
├─Dropout: 1-2                                               --
├─Line

## 데이터 준비

In [13]:
import pandas as pd

FI_train = pd.read_csv("/home/user/10TB/Data/FI/FI_train_pp.csv", index_col = 0)
FI_val = pd.read_csv("/home/user/10TB/Data/FI/FI_val_pp.csv", index_col = 0)
FI_test = pd.read_csv("/home/user/10TB/Data/FI/FI_test_pp.csv", index_col = 0)

In [None]:
FI_train['label'] = FI_train[['openness', 'conscientiousness', 
                              'extraversion', 'agreeableness', 
                              'neuroticism']].apply(
    lambda row: ' '.join(row.values.astype('str')), axis=1)

FI_val['label'] = FI_val[['openness', 'conscientiousness', 
                              'extraversion', 'agreeableness', 
                              'neuroticism']].apply(
    lambda row: ' '.join(row.values.astype('str')), axis=1)

FI_test['label'] = FI_test[['openness', 'conscientiousness', 
                              'extraversion', 'agreeableness', 
                              'neuroticism']].apply(
    lambda row: ' '.join(row.values.astype('str')), axis=1)

FI_train

In [35]:
FI_train['Script'][2]

'I actually got quite a few sets of black pens this year because I bought one pack I think I bought two packs actually that I really liked and then I found Some people at my work had these really cool pens that I liked a lot and I liked how they wrote'

In [15]:
class bigbird_Dataset(torch.utils.data.Dataset):
    def __init__(self, data, label, tokenizer):  # 전처리된 데이터 셋이 들어옴
        self.data = data
        self.tokenizer = tokenizer
        self.label = label

    def __getitem__(self, idx):
        # gradient 계산에 영향을 주지 않게 clone().detach() 실행
        
        text = self.data[idx]
        tokens = self.tokenizer(text, 
                                return_tensors="pt",  # pytorch.Tensor로 리턴
                                max_length=512, 
                                padding="max_length",  
                                truncation=True,  # max_length 넘어가면 버림)
                               )
        # tokens['label'] = torch.LongTensor([self.label[idx]])
        tokens['label'] = [float(i) for i in self.label[idx].split()]
        
        return tokens

    def __len__(self):  # 샘플 수
        return len(self.data)
    
    """
    class BERTDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, label):  # 전처리된 데이터 셋이 들어옴
        self.dataset = dataset
        self.label = label

    def __getitem__(self, idx):
        # gradient 계산에 영향을 주지 않게 clone().detach() 실행
        
        item = {key: val[idx].clone().detach() for key, val in self.dataset.items()}
        item['label'] = torch.tensor(self.label[idx])
        
        return item

    def __len__(self):  # 샘플 수
        return len(self.label)
    """

In [16]:
train_sen = FI_train['Script'].tolist()
train_label = FI_train['label'].tolist()

val_sen = FI_val['Script'].tolist()
val_label = FI_val['label'].tolist()

test_sen = FI_test['Script'].tolist()
test_label = FI_test['label'].tolist()

In [17]:
train_dataset = bigbird_Dataset(train_sen, train_label, tokenizer)
val_dataset = bigbird_Dataset(val_sen, val_label, tokenizer)
test_dataset = bigbird_Dataset(test_sen, test_label, tokenizer)

In [18]:
print(train_dataset.__len__())
print(train_dataset.__getitem__(97))
#print(train_dataset.__getitem__(970)['input_ids'].numpy())
# print(tokenizer.decode(train_dataset.__getitem__(970)['input_ids'].numpy()[0]))

5902
{'input_ids': tensor([[    2,   184,   175,    31,   257,    32,   101,    51, 25766,    53,
            18,  1030,   273,   652,  2038,    14,  1045, 25766,  6610,    30,
            50,   101,   652,   902,    31,  2066,    80,  1576,    26,   101,
            81,   974,    17,    31,   825,    16,  1077,   273,   125,   253,
           284,  2593,    17,    28,   951,   101,   712,    20,  1417,   902,
          1576,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,    

In [21]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import multilabel_confusion_matrix
from transformers import EvalPrediction
from sklearn.metrics import mean_absolute_error
import torch

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    
    MAE = 1 - mean_absolute_error(labels, preds)
    
    
    return {
        '1 - MAE' : MAE
    }

In [22]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.

        Subclass and override for custom behavior.
        """
        
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        
        outputs = model(**inputs)
        
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

In [23]:
training_args = TrainingArguments(
    output_dir="/home/user/10TB/COLING2024/10FI_cate/model/cate_2_ALBERT_FT_1-head_mae",
    logging_dir= "/home/user/10TB/COLING2024/10FI_cate/model/cate_2_ALBERT_FT_1-head_mae_log",
    num_train_epochs=20,
    learning_rate = 3e-5,
   # max_steps=1000,
    per_device_train_batch_size=32,
#    gradient_accumulation_steps = 16,
    per_device_eval_batch_size = 32,
#    eval_accumulation_steps = 32,
    logging_strategy = "epoch",
    save_strategy = "epoch",
    lr_scheduler_type = "linear",
    dataloader_num_workers = 12,
#    warmup_ratio = 0.1,
    weight_decay=0.01,
    evaluation_strategy='epoch',
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=DefaultDataCollator(return_tensors = "pt"),
    compute_metrics=compute_metrics,
)

In [24]:
#trainer.add_callback(CustomCallback(trainer))
trainer.train()



Epoch,Training Loss,Validation Loss,1 - mae
1,0.447,0.386521,0.61361
2,0.3209,0.210157,0.789829
3,0.2319,0.220748,0.779166
4,0.1531,0.132232,0.867743
5,0.1243,0.119014,0.880954
6,0.1197,0.117116,0.882865
7,0.1147,0.115766,0.884227
8,0.112,0.114417,0.885582
9,0.1076,0.114482,0.885497
10,0.1063,0.114243,0.885746




TrainOutput(global_step=100, training_loss=0.13549407988786696, metrics={'train_runtime': 110.9672, 'train_samples_per_second': 106.338, 'train_steps_per_second': 0.901, 'total_flos': 282081228595200.0, 'train_loss': 0.13549407988786696, 'epoch': 20.0})

## Test

In [19]:
model = AlbertForCustomClassification.from_pretrained("/home/user/10TB/COLING2024/10FI_cate/model/cate_2_ALBERT_FT_1-head_mae/checkpoint-50", 5)

In [20]:
tokens = train_dataset.__getitem__(97)
print(tokens)

{'input_ids': tensor([[    2,   184,   175,    31,   257,    32,   101,    51, 25766,    53,
            18,  1030,   273,   652,  2038,    14,  1045, 25766,  6610,    30,
            50,   101,   652,   902,    31,  2066,    80,  1576,    26,   101,
            81,   974,    17,    31,   825,    16,  1077,   273,   125,   253,
           284,  2593,    17,    28,   951,   101,   712,    20,  1417,   902,
          1576,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [36]:
model.num_parameters()

11687429

In [26]:
def test_compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    
    labels = labels.T
    preds = preds.T
    
    MAE = []
    
    for i in range(5):
        MAE.append(1 - mean_absolute_error(labels[i], preds[i]))
    
    
    return {
        '1 - MAE' : MAE
    }

In [31]:
eval_trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DefaultDataCollator(return_tensors = "pt"),
    compute_metrics=test_compute_metrics,
)

In [29]:
output = eval_trainer.predict(test_dataset)

prediction_csv = pd.DataFrame(output.predictions, columns=['O', 'C', 'E', 'A', 'N'])
label_csv = pd.DataFrame(output.label_ids, columns=['O', 'C', 'E', 'A', 'N'])



In [30]:
prediction_csv.to_csv("/home/user/10TB/COLING2024/10FI_cate/result_csv/cate_2_predictions.csv")
label_csv.to_csv("/home/user/10TB/COLING2024/10FI_cate/result_csv/cate_2_labels.csv")

In [32]:
eval_trainer.evaluate(test_dataset)



Trainer is attempting to log a value of "[0.8857609927654266, 0.8822235092520714, 0.8846857324242592, 0.9023622646927834, 0.8771406263113022]" of type <class 'list'> for key "eval/1 - MAE" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.11383680254220963,
 'eval_1 - MAE': [0.8857609927654266,
  0.8822235092520714,
  0.8846857324242592,
  0.9023622646927834,
  0.8771406263113022],
 'eval_runtime': 1.2604,
 'eval_samples_per_second': 156.294,
 'eval_steps_per_second': 1.587}