# 1. Setup and Initialization

## 1.1 Importing Necessary Packages

In [1]:
# System
import wandb
import os
import json
import gc

# Environment
from dotenv import load_dotenv
from huggingface_hub import login

# LLM packages
import numpy as np
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import MBartForConditionalGeneration, MBart50Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, set_seed
import numpy as np


# Set Seed
train_seed = 2002
set_seed(train_seed)

# 캐시 디렉토리 설정
DATA_CACHE_DIR = "/mnt/t7/.cache/huggingface/datasets"
MODEL_CACHE_DIR = "/mnt/t7/.cache/huggingface/models"

## 1.2 Logging into Hugging Face Hub and Weights & Biases

In [2]:
os.environ["WANDB_PROJECT"]="Graduate Project mbart"
wandb.login()

load_dotenv("/mnt/t7/dnn/llm_practicing/.env")
login(token= os.environ["HF_TOKEN"])

[34m[1mwandb[0m: Currently logged in as: [33maeolian83[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# 2. Loading and Preparing the Dataset

In [3]:
dataset_name = "aeolian83/PTT_wit_Latex_1"

dataset_dict = load_dataset(dataset_name, cache_dir=DATA_CACHE_DIR)

In [4]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['term', 'english', 'korean'],
        num_rows: 1432
    })
})

In [5]:
shuffled_train = dataset_dict["train"].shuffle(seed=42)

dataset_dict = DatasetDict({
    "train": shuffled_train,
    # 다른 split도 있으면 추가 (예: "validation": dataset["validation"])
})

In [6]:
checkpoint_dir="/mnt/t7/dnn/paper_translator2/test/checkpoint/mbart-en-ko"

In [7]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50Tokenizer.from_pretrained(model_name, src_lang="en_XX", tgt_lang="ko_KR")
model = MBartForConditionalGeneration.from_pretrained(model_name)

In [8]:
def preprocess_function(examples):
    inputs = [ex for ex in examples['english']]  # Source language texts
    targets = [ex for ex in examples['korean']]  # Target language texts
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['term', 'english', 'korean', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1432
    })
})

In [10]:
training_args = Seq2SeqTrainingArguments(
    output_dir=checkpoint_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=4, 
    gradient_accumulation_steps=4, 
    weight_decay=0.01,
    num_train_epochs=5,
    logging_steps = 20,
    save_steps = 40,
    predict_with_generate=True,
    bf16=True,
)

In [11]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Seq2SeqTrainer(


Step,Training Loss
20,6.6679
40,4.9588
60,3.3362
80,1.7994
100,0.6498
120,0.2679
140,0.1824
160,0.1766
180,0.1604
200,0.1215




TrainOutput(global_step=450, training_loss=0.8658599135610793, metrics={'train_runtime': 716.3935, 'train_samples_per_second': 9.995, 'train_steps_per_second': 0.628, 'total_flos': 7758324569210880.0, 'train_loss': 0.8658599135610793, 'epoch': 5.0})

In [12]:
hub_model_id = "aeolian83/mbart-en-ko-ptt-latex" 

In [13]:
path = "/mnt/t7/dnn/paper_translator2/test/model/m-bart-ptt"

In [14]:
model.save_pretrained(path)
tokenizer.save_pretrained(path)

('/mnt/t7/dnn/paper_translator2/test/model/m-bart-ptt/tokenizer_config.json',
 '/mnt/t7/dnn/paper_translator2/test/model/m-bart-ptt/special_tokens_map.json',
 '/mnt/t7/dnn/paper_translator2/test/model/m-bart-ptt/sentencepiece.bpe.model',
 '/mnt/t7/dnn/paper_translator2/test/model/m-bart-ptt/added_tokens.json')

In [15]:
model.push_to_hub(hub_model_id)
tokenizer.push_to_hub(hub_model_id)

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aeolian83/mbart-en-ko-ptt-latex/commit/9156023af5409077d31a8e40a173bd432a61c45a', commit_message='Upload tokenizer', commit_description='', oid='9156023af5409077d31a8e40a173bd432a61c45a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/aeolian83/mbart-en-ko-ptt-latex', endpoint='https://huggingface.co', repo_type='model', repo_id='aeolian83/mbart-en-ko-ptt-latex'), pr_revision=None, pr_num=None)