
Commit 42285f0

Authored Jul 16, 2023
Merge pull request #16 from boostcampaitech5/feat/#2-compute_metrics
Feat/#2 compute metrics
2 parents c6a8789 + 9bcbee2 commit 42285f0

File tree: 5 files changed (+95, -0 lines changed)
Diff for: finetuning/config.yaml (+1)

@@ -21,6 +21,7 @@ weight_decay: 0.
 warmup_ratio: 0.1
 lr_scheduler_type: linear
 resume_from_checkpoint: None
+use_compute_metrics: False
 
 # lora hyperparams
 lora_r: 8

Diff for: finetuning/train.py (+5)

@@ -15,6 +15,7 @@
 from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast, BitsAndBytesConfig
 from utils.data_loader import *
 from utils.arguments import TrainArguments
+from utils.compute_metrics import *
 
 
 def train(train_args: TrainArguments):
@@ -122,6 +123,7 @@ def train(train_args: TrainArguments):
         data_collator=transformers.DataCollatorForSeq2Seq(
             tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
         ),
+        compute_metrics=train_compute_metrics if train_args.use_compute_metrics else None,
     )
     model.config.use_cache = False
 
@@ -135,6 +137,9 @@ def train(train_args: TrainArguments):
 
     with torch.autocast("cuda"):
         trainer.train(resume_from_checkpoint=train_args.resume_from_checkpoint)
+
+    evaluation_result = trainer.evaluate(eval_dataset=val_data)
+    print(evaluation_result)
 
     model.save_pretrained(train_args.finetune_dir)
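With this change, `compute_metrics` is only passed to the `Trainer` when `use_compute_metrics` is enabled in the config; otherwise it stays `None` and evaluation skips metric computation. For reference, a minimal sketch of the hook's contract, assuming nothing beyond the public `transformers` API (the toy metric and random inputs below are illustrative, not part of this repo):

import numpy as np
from transformers import EvalPrediction

def toy_compute_metrics(pred: EvalPrediction):
    # The Trainer hands compute_metrics an EvalPrediction holding
    # pred.predictions (model outputs) and pred.label_ids (labels).
    preds = pred.predictions.argmax(-1)
    return {"token_accuracy": float((preds == pred.label_ids).mean())}

# Returned keys are logged with an "eval_" prefix, e.g. eval_token_accuracy.
fake = EvalPrediction(predictions=np.random.rand(2, 4, 10),
                      label_ids=np.zeros((2, 4), dtype=np.int64))
print(toy_compute_metrics(fake))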

Diff for: finetuning/utils/arguments.py (+1)

@@ -24,6 +24,7 @@ class TrainArguments(BaseModel):
     warmup_ratio: float = 0.1
     lr_scheduler_type: str = 'linear'
     resume_from_checkpoint: str = None
+    use_compute_metrics: bool = False
     # lora hyperparams
     lora_r: int = 32
     lora_alpha: int = 64
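Because `TrainArguments` is a pydantic `BaseModel`, the new `use_compute_metrics: bool = False` field mirrors the key added to `config.yaml` and is validated at load time. A hypothetical sketch of that mapping, assuming the YAML is read with PyYAML (the repo's actual config-loading code is not shown in this diff):

import yaml
from utils.arguments import TrainArguments

# Unset keys fall back to the pydantic defaults, so use_compute_metrics
# stays False unless config.yaml enables it.
with open("finetuning/config.yaml") as f:
    train_args = TrainArguments(**yaml.safe_load(f))

print(train_args.use_compute_metrics)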

Diff for: finetuning/utils/compute_metrics.py (+86)

@@ -0,0 +1,86 @@
+from datasets import load_metric
+from transformers import GPTNeoXTokenizerFast, GPTNeoXForCausalLM
+import numpy as np
+from rouge import Rouge
+from statistics import geometric_mean
+import torch
+from tqdm import tqdm
+
+
+def train_compute_metrics(pred):
+
+    model = GPTNeoXForCausalLM.from_pretrained('nlpai-lab/kullm-polyglot-12.8b-v2')
+
+    logits = torch.tensor(pred.predictions.argmax(-1).flatten(), dtype=torch.int64)
+    logits = logits.unsqueeze(0)  # torch.Size([1, 35200])
+
+    max_length = model.config.max_position_embeddings  # 2048
+    stride = 1024
+    seq_len = logits.size(1)
+
+    nlls = []
+    prev_end_loc = 0
+    for begin_loc in tqdm(range(0, seq_len, stride)):
+        end_loc = min(begin_loc + max_length, seq_len)
+        trg_len = end_loc - prev_end_loc  # may differ from the stride on the last loop
+        input_ids = logits[:, begin_loc:end_loc]
+        target_ids = input_ids.clone()
+        target_ids[:, :-trg_len] = -100
+
+        with torch.no_grad():
+            outputs = model(input_ids, labels=target_ids)
+
+            # The loss is cross entropy averaged over all valid labels.
+            # Since the model shifts labels one position left internally, loss is computed over (target length - 1) labels.
+            neg_log_likelihood = outputs.loss
+
+        nlls.append(neg_log_likelihood)
+        prev_end_loc = end_loc
+        if end_loc == seq_len:
+            break
+
+    ppl = torch.exp(torch.stack(nlls).mean())
+
+    return {'perplexity': ppl}
+
+
+def test_compute_metrics(pred):
+    tokenizer = GPTNeoXTokenizerFast.from_pretrained('nlpai-lab/kullm-polyglot-12.8b-v2')
+
+    # Load the metrics to use.
+    metric_bleu = load_metric("sacrebleu")
+    metric_meteor = load_metric("meteor")
+    metric_rouge = Rouge(metrics=["rouge-1", "rouge-2", "rouge-3", "rouge-4", "rouge-5", "rouge-l"])
+    metric_bertscore = load_metric("bertscore")
+
+    # Split pred from training into preds (model generations) and labels (ground-truth data).
+    preds = pred.predictions.argmax(-1)
+    labels = pred.label_ids
+    labels = np.where(pred.label_ids != -100, labels, tokenizer.pad_token_id)
+
+    scores = {
+        'sacre_bleu': [],
+        'meteor': [],
+        'rouge_l_f1': [],
+        'bert_score_f1': [],
+    }
+
+    for i in range(len(preds)):
+        decoded_preds = tokenizer.decode(preds[i], skip_special_tokens=True)
+        decoded_labels = tokenizer.decode(labels[i], skip_special_tokens=True)
+        if "### 응답:" in decoded_preds:
+            decoded_preds = decoded_preds.split('### 응답:\n')[1][:-1]
+
+        bleu_score = metric_bleu.compute(predictions=[decoded_preds], references=[[decoded_labels]])["score"]
+        meteor_score = metric_meteor.compute(predictions=[decoded_preds], references=[decoded_labels])["meteor"]
+        rouge_scores = metric_rouge.get_scores(decoded_preds, decoded_labels, avg=True)["rouge-l"]['f']
+        bert_score = metric_bertscore.compute(predictions=[decoded_preds], references=[decoded_labels], lang='ko')["f1"][0]
+
+        scores['sacre_bleu'].append(bleu_score / 100)
+        scores['meteor'].append(meteor_score)
+        scores['rouge_l_f1'].append(rouge_scores)
+        scores['bert_score_f1'].append(bert_score)
+
+    scores = {k: geometric_mean(v) for k, v in scores.items()}
+
+    return {k: round(v, 5) for k, v in scores.items()}
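`train_compute_metrics` follows the standard sliding-window perplexity recipe for fixed-length causal LMs: it slides a 2048-token window over the sequence with a stride of 1024 and masks already-scored positions with -100 so each token's loss is counted only once. A standalone sketch of that recipe, factored out of the function above (the function name and signature here are illustrative, not part of the repo):

import torch

def sliding_window_ppl(model, input_ids, max_length=2048, stride=1024):
    seq_len = input_ids.size(1)
    nlls, prev_end_loc = [], 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # fresh tokens scored this step
        window = input_ids[:, begin_loc:end_loc]
        targets = window.clone()
        targets[:, :-trg_len] = -100  # ignore tokens scored in a previous window
        with torch.no_grad():
            nlls.append(model(window, labels=targets).loss)
        prev_end_loc = end_loc
        if end_loc == seq_len:
            break
    return torch.exp(torch.stack(nlls).mean())

One portability note: `datasets.load_metric` is deprecated and has been removed from recent `datasets` releases; on current installs the equivalent loaders live in the `evaluate` package (e.g. `evaluate.load("sacrebleu")`), which should be a drop-in replacement for the three `load_metric` calls above.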

Diff for: finetuning/utils/data_loader.py (+2)

@@ -11,6 +11,8 @@ def load_and_preprocess_data(train_args: TrainArguments, tokenizer: GPTNeoXToken
     else:
         data = load_dataset(train_args.data_path)
 
+    print(data)
+
     prompter = Prompter(template_name = train_args.prompt_template_name,
                         verbose = False)
