In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, PreTrainedTokenizerFast
import pandas as pd
from torch.utils.data import Dataset
from nltk.translate.bleu_score import corpus_bleu
from sklearn.model_selection import train_test_split
import torch
import os

In [2]:
os.environ['WANDB_WATCH'] = 'false'
os.environ['WANDB_SILENT']="true"

In [3]:
# gpt2 base를 사용합니다.
model_name = "skt/kogpt2-base-v2"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name, bos_token='<s>', 
                                                    eos_token='</s>', unk_token='<unk>',pad_token='<pad>', mask_token='<mask>')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [4]:
# 새로운 스페셜 토큰 생성
keyword_start_marker = '<k>'
keyword_end_marker = '</k>'
tokenizer.add_special_tokens({
    "additional_special_tokens": [
        keyword_start_marker,
        keyword_end_marker
    ]
})

2

In [5]:
# 스페셜 토큰이 새롭게 생긴 tokenizer를 저장합니다.
tokenizer.save_pretrained('./model')

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/tokenizer.json')

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 스페셜 토큰만큼 모델 리사이즈
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(tokenizer.vocab_size + 2)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51202, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [7]:
# keyword가 담겨있는 데이터를 로드
poem_df = pd.read_csv('/opt/ml/data/final_poem.csv')

In [8]:
# train, valid split
train_poem, valid_poem = train_test_split(poem_df,test_size = 0.1, random_state=42)
train_poem = train_poem.reset_index()
valid_poem = valid_poem.reset_index()

In [9]:
def add_emo_tag(df):
    # <e> k1, k2, k3 </e> poem
    text = df['text']
    emo_tag = df['key_word']
    tagged_text = keyword_start_marker + emo_tag + keyword_end_marker + text
    return tagged_text

In [10]:
# 키워드 추출이 안돼서(명사 추출이 안돼서) keyword가 None인 경우가 존재
# 그 경우 train, valid data에서 제외
train_data = []
valid_data = []

for i in range(len(train_poem)):
    try:
        train_data.append(add_emo_tag(train_poem.iloc[i]))
    except:
        continue

for i in range(len(valid_poem)):
    try:
        valid_data.append(add_emo_tag(valid_poem.iloc[i]))
    except:
        continue

In [11]:
train_data = tokenizer(train_data, padding= True, return_tensors='pt')
valid_data = tokenizer(valid_data, padding= True, return_tensors='pt')

In [12]:
# input_ids == labels
# 생성모델이므로...

class PoemDataset(Dataset):
    def __init__(self,data) -> None:
        super().__init__()
        self.data = data
    def __getitem__(self, index):
        item = {k : v[index] for k, v in self.data.items()}
        item['labels'] = self.data.input_ids[index]
        return item
    def __len__(self):
        return len(self.data.input_ids)

In [13]:
train_dataset = PoemDataset(train_data)
valid_dataset = PoemDataset(valid_data)

In [14]:
# 시의 bleu를 측정하는 것이 과연 옳은가?
# loss 측정 하는게 시의 전체적인 확률 분포를 확인 하는 측면에서 더 맞는 접근이 아닌가?

def compute_metrics(pred):
  """ validation을 위한 metrics function """
  labels = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
  preds = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)


  # labels -> [sen1, sen2, sen3 ...]
  # list_of_references -> [[sen1],[sen2],[sen3]...]
  list_of_references = []
  for i in range(len(labels)):
    list_of_references.append([labels[i]])
      
  # calculate blue4
  blue4 = corpus_bleu(list_of_references= list_of_references, hypotheses = preds)

  return {
      'blue4': blue4
  }

In [15]:
training_args = TrainingArguments(
    #predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps = 4,
    overwrite_output_dir=True,
    fp16=True,
    #load_best_model_at_end=True,
    output_dir='./model',
    logging_steps=100,
    save_steps=100,
    eval_steps=100,
    num_train_epochs=10,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    #compute_metrics = compute_metrics,
)
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using amp fp16 backend
***** Running training *****
  Num examples = 26352
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 4110
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss
100,1.1721,1.10051
200,0.7385,1.087253
300,0.7402,1.077531
400,0.7439,1.07191
500,0.7122,1.073024
600,0.6883,1.067213
700,0.7144,1.063572
800,0.7062,1.060801
900,0.6923,1.064001
1000,0.6717,1.061196


***** Running Evaluation *****
  Num examples = 2926
  Batch size = 16
Saving model checkpoint to ./model/checkpoint-100
Configuration saved in ./model/checkpoint-100/config.json
Model weights saved in ./model/checkpoint-100/pytorch_model.bin
tokenizer config file saved in ./model/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./model/checkpoint-100/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2926
  Batch size = 16
Saving model checkpoint to ./model/checkpoint-200
Configuration saved in ./model/checkpoint-200/config.json
Model weights saved in ./model/checkpoint-200/pytorch_model.bin
tokenizer config file saved in ./model/checkpoint-200/tokenizer_config.json
Special tokens file saved in ./model/checkpoint-200/special_tokens_map.json
Deleting older checkpoint [model/checkpoint-100] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2926
  Batch size = 16
Saving model checkpoint to ./model/checkpoint-300
Configurati

TrainOutput(global_step=4110, training_loss=0.6465582943890796, metrics={'train_runtime': 6408.382, 'train_samples_per_second': 41.121, 'train_steps_per_second': 0.641, 'total_flos': 6.7229623296e+16, 'train_loss': 0.6465582943890796, 'epoch': 10.0})

In [17]:
model.save_pretrained('./model')

Configuration saved in ./model/config.json
Model weights saved in ./model/pytorch_model.bin
