# 1. Set Environments

In [1]:
import wandb
import os
# Route every W&B run started from this notebook to one project.
os.environ["WANDB_PROJECT"] = "Machin Translator_01"

# Authenticate with W&B; as the cell's last expression, the result is displayed.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maeolian83[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
from huggingface_hub import login
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Log in to the Hugging Face Hub with the token stored in HF_TOKEN.
login(token=os.environ["HF_TOKEN"])

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/aeolian83/.cache/huggingface/token
Login successful


# 2. Set Datasets

In [2]:
from datasets import Dataset, DatasetDict
import pickle
import os
import time


# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that come from a trusted source.
with open('./data/train_data_300.pkl', 'rb') as train_file:
    train_data = pickle.load(train_file)

with open('./data/validation_data_28.pkl', 'rb') as validation_file:
    test_data = pickle.load(validation_file)

# A bare `len(...)` in the middle of a cell is never displayed, so report the
# sizes explicitly.
print(f"train examples: {len(train_data)}, test examples: {len(test_data)}")

train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

# Bundle the "train" and "test" datasets into one DatasetDict.
# The "test" split is built from the validation pickle loaded above.
dataset_dict = DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })

# 3. Checkpoints Listing

In [None]:
# Directory whose sub-directories are scanned below and treated as checkpoints.
checkpoint_dir = "../checkpoint/translate_machine_llama3ko_intsuct_origindata300_02"

# Extract the trailing number from a directory name.
def extract_number(directory_name):
    """Return the integer that follows the last '-' in ``directory_name``.

    If the name contains no '-', the whole name is parsed as the integer.

    Raises:
        ValueError: if that trailing piece is not a valid integer literal.
    """
    _, _, suffix = directory_name.rpartition('-')
    return int(suffix)


# Collect the checkpoint sub-directories and order them by their numeric suffix.
checkpoints = []
with os.scandir(checkpoint_dir) as entries:
    for entry in entries:
        if not entry.is_dir():
            continue
        try:
            extract_number(entry.name)
        except ValueError:
            # Skip directories without a numeric suffix (for example a logging
            # folder); otherwise the sort below would raise ValueError.
            continue
        checkpoints.append(entry.name)

# Sort in numeric (not lexicographic) order.
checkpoints.sort(key=extract_number)

# 4. Evaluate

### (1) Function

In [None]:
import evaluate
import re
import pandas as pd
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# sacreBLEU metric object; evaluate_score() calls metric.compute() per example.
metric = evaluate.load("sacrebleu")

# Identifier of the base model loaded further down in this cell.
model_id = "beomi/Llama-3-KoEn-8B-Instruct-preview"
# NOTE(review): not referenced in the visible code; the model load below passes
# device_map='auto' instead. Confirm which one is intended.
device_map = {"": 0}
# Passed as cache_dir when the base model is loaded.
cache_model_dir="/mnt/t7/.cache/huggingface/models"

def make_prompt(text):
    """Build the three-line translation prompt for one source sentence.

    Args:
        text: the sentence to translate; it is interpolated after "### Input: ".

    Returns:
        The prompt string: instruction line, input line and a trailing
        "### Translated:" line with no final newline.
    """
    prompt_lines = (
        "Translate input sentence to Korean",
        f"### Input: {text}",
        "### Translated:",
    )
    return "\n".join(prompt_lines)


def clean_text(text):
    """Remove every parenthesised span, parentheses included, from ``text``.

    Matching is non-greedy, so each "(" is paired with the nearest following
    ")". Surrounding whitespace is left untouched.
    """
    parenthesised = re.compile(r'\(.*?\)')
    return parenthesised.sub('', text)


def counter_terms(terms, text):
    """Count case-insensitive, non-overlapping occurrences of each term in ``text``.

    Args:
        terms: either a ", "-separated string of terms or an iterable of term
            strings (list, tuple, ...). ``None`` is treated as "no terms".
        text: the string to search.

    Returns:
        The total number of occurrences summed over all non-empty terms.
    """
    if terms is None:
        return 0
    if isinstance(terms, str):
        terms = terms.split(", ")

    haystack = text.lower()
    # str.count("") returns len(text) + 1, so an empty term (for example from an
    # empty or trailing-separator term string) would inflate the total; skip it.
    return sum(haystack.count(term.lower()) for term in terms if term)


def generate(inputs, tokenizer, model):
    """Translate a batch of source sentences with ``model``.

    Args:
        inputs: iterable of source sentences; each is wrapped by make_prompt().
        tokenizer: tokenizer used to encode the prompts and decode the output.
            It must be able to pad, because the batch is encoded with
            ``padding=True``.
        model: model exposing ``.device`` and ``.generate()``.

    Returns:
        A list with one decoded continuation per input, in input order.
    """
    examples = [make_prompt(text) for text in inputs]
    if not examples:
        # Nothing to tokenize; avoid calling the tokenizer on an empty batch.
        return []

    example_batch = tokenizer(examples, return_tensors="pt", padding=True).to(model.device)
    # Every row is padded to this common width. Slicing by each prompt's own
    # unpadded length would leave the tail of shorter, left-padded prompts in
    # the decoded text.
    # Assumes generate() returns the prompt tokens followed by the new tokens,
    # as decoder-only models do.
    prompt_length = example_batch["input_ids"].shape[1]

    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**example_batch, max_new_tokens=512, pad_token_id=tokenizer.pad_token_id)

    return [
        tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)
        for sequence in output_tokens
    ]


def evaluate_score(datasets, outputs):
    """Score model outputs against references, one example at a time.

    Args:
        datasets: mapping with parallel sequences under the keys "english"
            (source sentences), "korean" (reference translations) and "terms"
            (terms to look for, in any form counter_terms() accepts).
        outputs: model translations, one per source sentence.

    Returns:
        A list of ``(bleu, weight, weighted_bleu)`` tuples, one per example.
        ``bleu`` is the sacreBLEU score of the output against the reference,
        both with parenthesised spans removed. ``weight`` is the fraction of
        source term occurrences that also appear in the raw output, capped at
        1.0. ``weighted_bleu`` is their product.

    Raises:
        ValueError: if the number of outputs differs from the number of
            references.
    """
    inputs = datasets["english"]
    labels = datasets["korean"]
    terms = datasets["terms"]

    if len(labels) != len(outputs):
        # zip() below would silently drop the unmatched examples.
        raise ValueError(
            f"Number of outputs ({len(outputs)}) does not match number of references ({len(labels)})"
        )

    results = []

    for source, output, label, term in zip(inputs, outputs, labels, terms):
        label_clean = clean_text(label)
        prediction_clean = clean_text(output)
        result = metric.compute(predictions=[prediction_clean], references=[label_clean])

        # Terms are counted in the raw output, before parentheses are stripped.
        input_count = counter_terms(term, source)
        predic_count = counter_terms(term, output)

        if input_count == 0:
            # No term occurs in the source, so there is nothing to penalise;
            # this also avoids dividing by zero.
            weight = 1.0
        else:
            weight = min(1.0, predic_count / input_count)

        results.append((result['score'], weight, weight * result['score']))

    return results


# Load the base model once with 8-bit quantization; the evaluation loop reuses
# this object for every checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
    cache_dir=cache_model_dir,
)

### (2) Evaluate Model

In [None]:
# Record the start time.
start_time = time.time()

# Tunables for this run: examples per generation batch, and which part of the
# sorted checkpoint list to evaluate.
EVAL_BATCH_SIZE = 6
CHECKPOINT_SLICE = slice(10, 20)

bleu_scores = {}

for checkpoint in checkpoints[CHECKPOINT_SLICE]:
    print('#' * 50)
    print(checkpoint)
    print('#' * 50)

    peft_model_id = os.path.join(checkpoint_dir, checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
    # Left padding keeps every prompt adjacent to its generated continuation.
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        # Batched encoding with padding=True needs a pad token; fall back to EOS.
        tokenizer.pad_token = tokenizer.eos_token
    # Match the embedding matrix to the checkpoint's tokenizer size.
    model.resize_token_embeddings(len(tokenizer))

    # NOTE(review): every checkpoint's adapter is loaded onto the same base
    # model object; confirm adapters from earlier iterations do not linger.
    loaded_model = PeftModel.from_pretrained(model=model, model_id=peft_model_id)

    results = []
    test_split = dataset_dict["test"]
    for i in range(0, len(test_split), EVAL_BATCH_SIZE):
        dataset = test_split[i:i + EVAL_BATCH_SIZE]

        outputs = generate(dataset["english"], tokenizer, loaded_model)

        results += evaluate_score(dataset, outputs)

    if results:
        bleus, weights, weighted_bleus = zip(*results)
        score = {
            "bleu": np.mean(bleus),
            "weight": np.mean(weights),
            "weighted_bleus": np.mean(weighted_bleus)
            }
        bleu_scores[checkpoint] = score
    else:
        # zip(*results) cannot be unpacked when there are no results.
        print(f"No evaluation results for {checkpoint}; skipping.")

    # Drop per-checkpoint objects and release cached GPU memory before the
    # next checkpoint is loaded.
    del peft_model_id, tokenizer, loaded_model
    torch.cuda.empty_cache()

df = pd.DataFrame.from_dict(bleu_scores, orient='index')

# Record the end time.
end_time = time.time()

print(f"Execution Time: {end_time - start_time} seconds")
print(df)