Merge pull request #16 from boostcampaitech5/feat/#2-compute_metrics
Feat/#2 compute metrics
jihye-moon authored Jul 16, 2023
2 parents c6a8789 + 9bcbee2 commit 42285f0
Showing 5 changed files with 95 additions and 0 deletions.
1 change: 1 addition & 0 deletions finetuning/config.yaml
@@ -21,6 +21,7 @@ weight_decay: 0.
warmup_ratio: 0.1
lr_scheduler_type: linear
resume_from_checkpoint: None
use_compute_metrics: False

# lora hyperparams
lora_r: 8
5 changes: 5 additions & 0 deletions finetuning/train.py
@@ -15,6 +15,7 @@
from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast, BitsAndBytesConfig
from utils.data_loader import *
from utils.arguments import TrainArguments
from utils.compute_metrics import *


def train(train_args: TrainArguments):
@@ -122,6 +123,7 @@ def train(train_args: TrainArguments):
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
        compute_metrics=train_compute_metrics if train_args.use_compute_metrics else None,
    )
    model.config.use_cache = False

@@ -135,6 +137,9 @@

    with torch.autocast("cuda"):
        trainer.train(resume_from_checkpoint=train_args.resume_from_checkpoint)

    evaluation_result = trainer.evaluate(eval_dataset=val_data)
    print(evaluation_result)

    model.save_pretrained(train_args.finetune_dir)

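For reference (illustrative, not part of this diff): when the compute_metrics callback is attached, Trainer prefixes the keys it returns with "eval_" and merges them with its own loss and timing entries, so the print above is expected to show a dictionary along these lines.

# Illustrative sketch only; exact keys depend on the Trainer configuration.
evaluation_result = trainer.evaluate(eval_dataset=val_data)
# e.g. {'eval_loss': ..., 'eval_perplexity': ..., 'eval_runtime': ..., 'eval_samples_per_second': ...}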
1 change: 1 addition & 0 deletions finetuning/utils/arguments.py
@@ -24,6 +24,7 @@ class TrainArguments(BaseModel):
    warmup_ratio: float = 0.1
    lr_scheduler_type: str = 'linear'
    resume_from_checkpoint: str = None
    use_compute_metrics: bool = False
    # lora hyperparams
    lora_r: int = 32
    lora_alpha: int = 64
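For context, a minimal sketch of how the new use_compute_metrics flag is assumed to travel from config.yaml into train(): the YAML is read and unpacked into the pydantic TrainArguments model. The loading code below is illustrative and not part of this PR; the repository's actual entry point may differ.

# Minimal sketch, assuming config.yaml is read with PyYAML and unpacked into
# TrainArguments (illustrative only; this loader is not shown in the diff).
import yaml

from train import train
from utils.arguments import TrainArguments

with open("finetuning/config.yaml") as f:
    cfg = yaml.safe_load(f)

train_args = TrainArguments(**cfg)  # use_compute_metrics defaults to False
train(train_args)                   # the flag decides whether compute_metrics is passed to Trainer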
86 changes: 86 additions & 0 deletions finetuning/utils/compute_metrics.py
@@ -0,0 +1,86 @@
from datasets import load_metric
from transformers import GPTNeoXTokenizerFast, GPTNeoXForCausalLM
import numpy as np
from rouge import Rouge
from statistics import geometric_mean
import torch
from tqdm import tqdm


def train_compute_metrics(pred):

    model = GPTNeoXForCausalLM.from_pretrained('nlpai-lab/kullm-polyglot-12.8b-v2')

    logits = torch.tensor(pred.predictions.argmax(-1).flatten(), dtype=torch.int64)
    logits = logits.unsqueeze(0)  # torch.Size([1, 35200])

    max_length = model.config.max_position_embeddings  # 2048
    stride = 1024
    seq_len = logits.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may differ from the stride on the last loop
        input_ids = logits[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # The loss is computed with cross entropy, averaged over all valid labels.
            # The model shifts the labels one position to the left internally, so the loss is computed over (trg_len - 1) labels.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)
        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())

    return {'perplexity': ppl}


def test_compute_metrics(pred):
    tokenizer = GPTNeoXTokenizerFast.from_pretrained('nlpai-lab/kullm-polyglot-12.8b-v2')

    # Load the metrics to use.
    metric_bleu = load_metric("sacrebleu")
    metric_meteor = load_metric("meteor")
    metric_rouge = Rouge(metrics=["rouge-1", "rouge-2", "rouge-3", "rouge-4", "rouge-5", "rouge-l"])
    metric_bertscore = load_metric("bertscore")

    # Split the pred produced during training into preds (model generations) and labels (ground-truth data).
    preds = pred.predictions.argmax(-1)
    labels = pred.label_ids
    labels = np.where(pred.label_ids != -100, labels, tokenizer.pad_token_id)

    scores = {
        'sacre_bleu': [],
        'meteor': [],
        'rouge_l_f1': [],
        'bert_score_f1': [],
    }

    for i in range(len(preds)):
        decoded_preds = tokenizer.decode(preds[i], skip_special_tokens=True)
        decoded_labels = tokenizer.decode(labels[i], skip_special_tokens=True)
        if "### 응답:" in decoded_preds:
            decoded_preds = decoded_preds.split('### 응답:\n')[1][:-1]

        bleu_score = metric_bleu.compute(predictions=[decoded_preds], references=[[decoded_labels]])["score"]
        meteor_score = metric_meteor.compute(predictions=[decoded_preds], references=[decoded_labels])["meteor"]
        rouge_scores = metric_rouge.get_scores(decoded_preds, decoded_labels, avg=True)["rouge-l"]['f']
        bert_score = metric_bertscore.compute(predictions=[decoded_preds], references=[decoded_labels], lang='ko')["f1"][0]

        scores['sacre_bleu'].append(bleu_score / 100)
        scores['meteor'].append(meteor_score)
        scores['rouge_l_f1'].append(rouge_scores)
        scores['bert_score_f1'].append(bert_score)

    scores = {k: geometric_mean(v) for k, v in scores.items()}

    return {k: round(v, 5) for k, v in scores.items()}
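
As a usage sketch (illustrative, not part of this diff): the Trainer hands each of these callbacks an EvalPrediction whose .predictions holds the model logits and whose .label_ids holds the target token ids. The same object can be built by hand to exercise the functions; the shapes, vocabulary size, and dummy data below are assumptions.

# Illustrative sketch only: how the callbacks above receive their input.
# Shapes and the vocabulary size are dummies; note that train_compute_metrics
# also downloads the 12.8B model, so a real call is expensive.
import numpy as np
from transformers import EvalPrediction

dummy_logits = np.random.rand(2, 16, 30000)          # (batch, seq_len, vocab_size)
dummy_labels = np.random.randint(0, 30000, (2, 16))  # Trainer uses -100 for ignored positions
eval_pred = EvalPrediction(predictions=dummy_logits, label_ids=dummy_labels)

print(train_compute_metrics(eval_pred))  # -> {'perplexity': tensor(...)}
print(test_compute_metrics(eval_pred))   # -> geometric means of BLEU/METEOR/ROUGE-L/BERTScore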
2 changes: 2 additions & 0 deletions finetuning/utils/data_loader.py
@@ -11,6 +11,8 @@ def load_and_preprocess_data(train_args: TrainArguments, tokenizer: GPTNeoXToken
    else:
        data = load_dataset(train_args.data_path)

    print(data)

    prompter = Prompter(template_name = train_args.prompt_template_name,
                        verbose = False)

