# 1. Check Environment

In [1]:
# List the conda environments on this machine; the output marks one entry with `*`.
!conda env list

# conda environments:
#
VAdepthENV               /home/013907062/.conda/envs/VAdepthENV
cmpe249                  /home/013907062/.conda/envs/cmpe249
env_onmttf               /home/013907062/.conda/envs/env_onmttf
koen_base                /home/013907062/.conda/envs/koen_base
newDepth                 /home/013907062/.conda/envs/newDepth
test                     /home/013907062/.conda/envs/test
wmt_infer             *  /home/013907062/.conda/envs/wmt_infer
base                     /opt/ohpc/pub/apps/anaconda/3.9
stylegan2                /opt/ohpc/pub/apps/anaconda/3.9/envs/stylegan2



In [2]:
# Print the installed huggingface_hub version and install location as an environment sanity check.
!pip show huggingface_hub

Name: huggingface-hub
Version: 0.17.1
Summary: Client library to download and publish models, datasets and other repos on the huggingface.co hub
Home-page: https://github.com/huggingface/huggingface_hub
Author: Hugging Face, Inc.
Author-email: julien@huggingface.co
License: Apache
Location: /home/013907062/.conda/envs/wmt_infer/lib/python3.10/site-packages
Requires: filelock, fsspec, packaging, pyyaml, requests, tqdm, typing-extensions
Required-by: accelerate, autonlp, datasets, transformers


In [None]:
# Package installs kept for reference only: the commands sit inside a string
# literal, so running this cell installs nothing.
'''
!pip install transformers
!pip install datasets
!pip install sacrebleu
!pip install sentencepiece
!pip install accelerate -U
'''

In [None]:
# Upgrade commands kept for reference only: they sit inside a string literal,
# so running this cell upgrades nothing.
'''
!pip install transformers -U
!pip install huggingface_hub -U
'''

In [None]:
# Upgrade command kept for reference only: it sits inside a string literal,
# so running this cell upgrades nothing.
'''
!pip install --upgrade accelerate
'''

In [2]:
import pandas as pd
import numpy as np
import multiprocessing
from easydict import EasyDict
import yaml
from datasets import load_dataset, load_metric, Dataset

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

  from .autonotebook import tqdm as notebook_tqdm


# 2. Train EN to KO

## 2.1 Train

In [None]:
import os

# Expose only GPU 0 to this process ("0,1" would expose two GPUs).
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [6]:
# Parse the EN->KO experiment settings; only the top-level "CFG" mapping is used.
with open("config_enko.yaml") as cfg_file:
    SAVED_CFG = yaml.load(cfg_file, Loader=yaml.FullLoader)
CFG = EasyDict(SAVED_CFG["CFG"])
print("[LOG1] config yaml done")

# sacreBLEU scorer consumed by compute_metrics during evaluation.
metric = load_metric("sacrebleu")
print("[LOG2] metric done")

# Train/validation CSV splits and the tokenizer of the checkpoint named in the config.
dset = load_dataset(
    "csv",
    data_files={'train': 'idioms__train.csv', 'valid': 'idioms__eval.csv'},
)
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
print("[LOG3] load_dataset and tokenizer loading done")

In [23]:
def preprocess_function(examples):
    """Tokenize one batch of sentence pairs for seq2seq fine-tuning.

    Args:
        examples: a batch as passed by ``dset.map(..., batched=True)``; it is
            indexed with the ``CFG.src_language`` and ``CFG.tgt_language``
            column names.

    Returns:
        The tokenizer output for the source texts, with the target token ids
        stored under ``"labels"``. Both sides are truncated to
        ``CFG.max_token_length`` tokens.
    """
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: the targets are encoded as labels in the same call and their
    # input ids are returned under the "labels" key.
    return tokenizer(
        examples[CFG.src_language],
        text_target=examples[CFG.tgt_language],
        max_length=CFG.max_token_length,
        truncation=True,
    )


# Tokenize every split, using one worker process per CPU core reported by the OS.
worker_count = multiprocessing.cpu_count()
tokenized_datasets = dset.map(
    preprocess_function,
    batched=True,
    num_proc=worker_count,
)
print("[LOG] tokenized_datasets done")

# Pre-trained checkpoint that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(CFG.model_name)

# Run name for logging: "<checkpoint>-finetuned-<src>-to-<tgt>", where
# <checkpoint> is the part of the model id after the last "/".
str_model_name = CFG.model_name.rsplit("/", 1)[-1]
run_name = f"{str_model_name}-finetuned-{CFG.src_language}-to-{CFG.tgt_language}"
print("[LOG] run_name", run_name)

# Every hyper-parameter comes from CFG; run_name stays the first positional argument.
# eval_steps is deliberately not passed (it was commented out: eval_steps=CFG.eval_steps).
training_options = dict(
    learning_rate=CFG.learning_rate,
    weight_decay=CFG.weight_decay,
    per_device_train_batch_size=CFG.train_batch_size,
    per_device_eval_batch_size=CFG.valid_batch_size,
    evaluation_strategy=CFG.evaluation_strategy,
    save_steps=CFG.save_steps,
    num_train_epochs=CFG.num_epochs,
    save_total_limit=CFG.num_checkpoints,
    predict_with_generate=True,  # evaluate on generated text so BLEU can be computed
    fp16=CFG.fp16,
    gradient_accumulation_steps=CFG.gradient_accumulation_steps,
    logging_steps=CFG.logging_steps,
)
training_args = Seq2SeqTrainingArguments(run_name, **training_options)

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and wrap each label in a one-element list.

    Returns:
        ``(preds, labels)``: a list of stripped prediction strings and a list
        of ``[stripped_label]`` lists, in the input order.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Score generated translations with the sacreBLEU metric and report their mean length.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"``
        (mean number of non-pad tokens per prediction), rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 cannot be decoded, so map it to the pad token. This was only done
    # for labels; doing it for predictions too avoids a decode failure should
    # gathered predictions be padded with -100 (as some transformers versions do).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Strip whitespace and wrap each reference in a list, as the metric expects.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    # Generated length = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Collator that batches the tokenized examples for this model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Wire model, arguments, data splits, collator and metric function into the trainer.
trainer_options = dict(
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(model, training_args, **trainer_options)
print("[LOG YJ] Trainer Ready DONE!")

Map (num_proc=56): 100%|███████████████████████████████████████████████████████████████████| 2341/2341 [00:00<00:00, 2878.00 examples/s]
Map (num_proc=56): 100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 162.66 examples/s]


[LOG] tokenized_datasets done
[LOG] run_name KE-T5-En2Ko-Base-finetuned-en-to-ko
[LOG YJ] Trainer Ready DONE!


In [24]:
# Fine-tune, run one more evaluation pass, then write the model to
# CFG.save_path (the evaluation section reloads it from that path).
trainer.train()
trainer.evaluate()  # return value is not captured here
trainer.save_model(CFG.save_path)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
0,No log,2.549204,12.02,14.15
2,No log,2.410879,10.6257,14.17
2,No log,2.340548,11.8415,14.08
4,No log,2.313292,12.2363,13.99
4,No log,2.321245,12.6134,13.99
6,No log,2.33664,14.838,13.88
6,No log,2.335986,13.994,13.92
8,No log,2.345043,13.6547,13.86
8,2.043600,2.355313,14.4929,14.01
9,2.043600,2.361259,14.5033,13.92


## 2.2 Evaluation

In [25]:
import pandas as pd

# Read config.yaml file first so the source column can be taken from it.
# NOTE(review): yaml.FullLoader is fine for a local, trusted config; prefer
# yaml.safe_load if this file could ever come from an untrusted source.
with open("config_enko.yaml") as infile:
    SAVED_CFG = yaml.load(infile, Loader=yaml.FullLoader)
CFG = EasyDict(SAVED_CFG["CFG"])

df_test = pd.read_csv("idioms__test.csv")
display(df_test.head())
# Use the configured source-language column instead of a hard-coded 'en', so
# the sentences always match the direction the model was trained on.
src_text = df_test[CFG.src_language].values.tolist()

model_name = CFG.inference_model_name
result_path = CFG.save_path

# Load the fine-tuned tokenizer and model from the directory written by trainer.save_model.
tokenizer = AutoTokenizer.from_pretrained(result_path)
model = AutoModelForSeq2SeqLM.from_pretrained(result_path)

Unnamed: 0,en,ko
0,"Once upon a time, there were three beautiful b...","옛날 옛적에, 세 마리의 예쁜 나비가 있었어요."
1,I felt like I have millions butterflies in my ...,너무 긴장 한 것 같았어요.
2,The deal was completely open and above board.,거래는 완전히 공개되었고 명백했습니다.
3,I left the keys on the board on your porch.,현관 게시판에 열쇠를 두고 왔어요.
4,clean your toys in the living room. no buts!,어서 거실에 있는 장난감 정리하세요. 토 달지 말고!


In [26]:
# Encode the whole test set as one padded batch. truncation=True is needed for
# max_length to apply: with padding=True alone the tokenizer does not cut
# over-long inputs, so they would reach the model untruncated (training used
# truncation=True with the same limit).
encoded_inputs = tokenizer(
    src_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=CFG.max_token_length,
)
# Generation settings (beam search, repetition controls) all come from CFG.
translated = model.generate(
    **encoded_inputs,
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)
#print([tokenizer.decode(t, skip_special_tokens=True) for t in translated])



In [27]:
# Decode every generated sequence to text, dropping special tokens.
output = tokenizer.batch_decode(translated, skip_special_tokens=True)

# Attach the translations as a new column and export the frame (index included).
# NOTE(review): assumes one generated sequence per test row — confirm
# CFG.num_return_sequences is 1 before relying on this.
df_test['predictions'] = output
df_test.to_csv("results_enko.csv")

In [28]:
## ===================================SIMPLE CHECK=========================================== ##

# Hand-picked English idiom sentences for a quick qualitative look at the model.
src_text = [
    "The man had egg on him today as well as yesterday.",
    "I am peachy",
    "He started new business one year ago. As I know it, he has made a lot of dough.",
    "There's something odd about him, but I can't quite put my finger on it.",
    "She didn’t know what was causing the problem, but she finally put her finger on it.",
    "I will play it by ear.",
    "I've got butterflies in my stomach.",
    "The crowd went bananas when the concert began.",
    "I used to get butterflies in my stomach before the tests.",
    "Things quickly went south when my phone got hacked.",
]

# truncation=True makes max_length effective; with padding=True alone the
# tokenizer would not cut over-long inputs.
encoded_inputs = tokenizer(
    src_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=CFG.max_token_length,
)
translated = model.generate(
    **encoded_inputs,
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)

print([tokenizer.decode(t, skip_special_tokens=True) for t in translated])



['그 남자는 오늘 뿐만 아니라 어제도 그를 괴롭히고 있었습니다.', '나는 기분이 좋아.', '그는 1년 전에 새로운 사업을 시작했습니다. 제가 알기로는 그는 많은 돈을 벌었습니다.', '그에게 이상한 점이 있지만, 딱히 뭐라고 말을 못하겠어.', '그녀는 무엇이 문제를 일으키는지 몰랐지만, 마침내 그 원인을 찾아냈다.', '제가 유동적으로 조정할 것입니다.', '가슴이 두근두근합니다.', '콘서트가 시작했을 때 군중은 열광했습니다.', '시험 전에 너무 긴장해서 긴장을 많이 했어요.', '제 전화기가 해킹당했을 때 상황이 빠르게 악화되었습니다.']


# 3. Train KO to EN

## 3.1 Train

In [3]:
import os

# Expose only GPU 0 to this process ("0,1" would expose two GPUs).
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [4]:
# Parse the KO->EN experiment settings; only the top-level "CFG" mapping is used.
with open("config_koen.yaml") as cfg_file:
    SAVED_CFG = yaml.load(cfg_file, Loader=yaml.FullLoader)
CFG = EasyDict(SAVED_CFG["CFG"])
print("[LOG1] config yaml done")

# sacreBLEU scorer consumed by compute_metrics during evaluation.
metric = load_metric("sacrebleu")
print("[LOG2] metric done")

# Train/validation CSV splits and the tokenizer of the checkpoint named in the config.
dset = load_dataset(
    "csv",
    data_files={'train': 'idioms__train.csv', 'valid': 'idioms__eval.csv'},
)
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
print("[LOG3] load_dataset and tokenizer loading done")

[LOG1] config yaml done


  metric = load_metric("sacrebleu")
Using the latest cached version of the module from /home/013907062/.cache/huggingface/modules/datasets_modules/metrics/sacrebleu/556ba16a9634185dd1ea68395e0e474d6ee4de7e123fa701d577c6461f06032b (last modified on Thu Sep 14 16:32:47 2023) since it couldn't be found locally at sacrebleu, or remotely on the Hugging Face Hub.


[LOG2] metric done
[LOG3] load_dataset and tokenizer loading done


In [6]:
# Display which checkpoint the loaded config points at.
CFG.model_name

'QuoQA-NLP/KE-T5-Ko2En-Base'

In [7]:
def preprocess_function(examples):
    """Tokenize one batch of sentence pairs for seq2seq fine-tuning.

    Args:
        examples: a batch as passed by ``dset.map(..., batched=True)``; it is
            indexed with the ``CFG.src_language`` and ``CFG.tgt_language``
            column names.

    Returns:
        The tokenizer output for the source texts, with the target token ids
        stored under ``"labels"``. Both sides are truncated to
        ``CFG.max_token_length`` tokens.
    """
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: the targets are encoded as labels in the same call and their
    # input ids are returned under the "labels" key.
    return tokenizer(
        examples[CFG.src_language],
        text_target=examples[CFG.tgt_language],
        max_length=CFG.max_token_length,
        truncation=True,
    )


# Tokenize every split, using one worker process per CPU core reported by the OS.
worker_count = multiprocessing.cpu_count()
tokenized_datasets = dset.map(
    preprocess_function,
    batched=True,
    num_proc=worker_count,
)
print("[LOG] tokenized_datasets done")

# Pre-trained checkpoint that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(CFG.model_name)

# Run name for logging: "<checkpoint>-finetuned-<src>-to-<tgt>", where
# <checkpoint> is the part of the model id after the last "/".
str_model_name = CFG.model_name.rsplit("/", 1)[-1]
run_name = f"{str_model_name}-finetuned-{CFG.src_language}-to-{CFG.tgt_language}"
print("[LOG] run_name", run_name)

# Every hyper-parameter comes from CFG; run_name stays the first positional argument.
# eval_steps is deliberately not passed (it was commented out: eval_steps=CFG.eval_steps).
training_options = dict(
    learning_rate=CFG.learning_rate,
    weight_decay=CFG.weight_decay,
    per_device_train_batch_size=CFG.train_batch_size,
    per_device_eval_batch_size=CFG.valid_batch_size,
    evaluation_strategy=CFG.evaluation_strategy,
    save_steps=CFG.save_steps,
    num_train_epochs=CFG.num_epochs,
    save_total_limit=CFG.num_checkpoints,
    predict_with_generate=True,  # evaluate on generated text so BLEU can be computed
    fp16=CFG.fp16,
    gradient_accumulation_steps=CFG.gradient_accumulation_steps,
    logging_steps=CFG.logging_steps,
)
training_args = Seq2SeqTrainingArguments(run_name, **training_options)

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and wrap each label in a one-element list.

    Returns:
        ``(preds, labels)``: a list of stripped prediction strings and a list
        of ``[stripped_label]`` lists, in the input order.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Score generated translations with the sacreBLEU metric and report their mean length.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"``
        (mean number of non-pad tokens per prediction), rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 cannot be decoded, so map it to the pad token. This was only done
    # for labels; doing it for predictions too avoids a decode failure should
    # gathered predictions be padded with -100 (as some transformers versions do).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Strip whitespace and wrap each reference in a list, as the metric expects.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    # Generated length = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Collator that batches the tokenized examples for this model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Wire model, arguments, data splits, collator and metric function into the trainer.
trainer_options = dict(
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(model, training_args, **trainer_options)
print("[LOG YJ] Trainer Ready DONE!")

Map (num_proc=56): 100%|███████████████████████████████████████████████████████████████████| 2341/2341 [00:00<00:00, 4079.21 examples/s]
Map (num_proc=56): 100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 204.17 examples/s]


[LOG] tokenized_datasets done
[LOG] run_name KE-T5-Ko2En-Base-finetuned-ko-to-en
[LOG YJ] Trainer Ready DONE!


In [8]:
# Fine-tune, run one more evaluation pass, then write the model to
# CFG.save_path (the evaluation section reloads it from that path).
trainer.train()
trainer.evaluate()  # return value is not captured here
trainer.save_model(CFG.save_path)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
0,No log,2.06961,20.6879,16.02
2,No log,1.929752,21.5927,15.91
2,No log,1.861832,23.5125,15.82
4,No log,1.814788,25.4053,15.72
4,No log,1.794654,26.5152,15.73
6,No log,1.785549,27.2129,15.59
6,No log,1.798384,28.2109,15.6
8,No log,1.790666,27.8897,15.53
8,1.603400,1.797298,28.187,15.53
9,1.603400,1.798962,28.3746,15.54


## 3.2 Evaluation

In [9]:
import pandas as pd

# Read config.yaml file first so the source column can be taken from it.
# NOTE(review): yaml.FullLoader is fine for a local, trusted config; prefer
# yaml.safe_load if this file could ever come from an untrusted source.
with open("config_koen.yaml") as infile:
    SAVED_CFG = yaml.load(infile, Loader=yaml.FullLoader)
CFG = EasyDict(SAVED_CFG["CFG"])

df_test = pd.read_csv("idioms__test.csv")
display(df_test.head())
# Use the configured source-language column instead of a hard-coded 'ko', so
# the sentences always match the direction the model was trained on.
src_text = df_test[CFG.src_language].values.tolist()

print("[LOG] CFG" , CFG)

model_name = CFG.inference_model_name
result_path = CFG.save_path

# Load the fine-tuned tokenizer and model from the directory written by trainer.save_model.
tokenizer = AutoTokenizer.from_pretrained(result_path)
model = AutoModelForSeq2SeqLM.from_pretrained(result_path)

Unnamed: 0,en,ko
0,"Once upon a time, there were three beautiful b...","옛날 옛적에, 세 마리의 예쁜 나비가 있었어요."
1,I felt like I have millions butterflies in my ...,너무 긴장 한 것 같았어요.
2,The deal was completely open and above board.,거래는 완전히 공개되었고 명백했습니다.
3,I left the keys on the board on your porch.,현관 게시판에 열쇠를 두고 왔어요.
4,clean your toys in the living room. no buts!,어서 거실에 있는 장난감 정리하세요. 토 달지 말고!


[LOG] CFG {'DEBUG': False, 'train_batch_size': 64, 'valid_batch_size': 128, 'num_epochs': 10, 'num_checkpoints': 3, 'max_token_length': 512, 'stopwords': [], 'learning_rate': 0.0005, 'weight_decay': 0.01, 'adam_beta_1': 0.9, 'adam_beta_2': 0.98, 'epsilon': 1e-09, 'fp16': False, 'gradient_accumulation_steps': 2, 'save_steps': 150, 'logging_steps': 150, 'evaluation_strategy': 'epoch', 'inference_model_name': 'QuoQA-NLP/KE-T5-Ko2En-Base', 'no_inference_sentences': 100, 'num_beams': 5, 'repetition_penalty': 1.3, 'no_repeat_ngram_size': 3, 'num_return_sequences': 1, 'src_language': 'ko', 'tgt_language': 'en', 'model_name': 'QuoQA-NLP/KE-T5-Ko2En-Base', 'num_inference_sample': 120, 'dropout': 0.1, 'ROOT_PATH': '.', 'save_path': './results_Ko2En'}


In [10]:
# Display the first source sentence to confirm the expected column was loaded.
src_text[0]

'옛날 옛적에, 세 마리의 예쁜 나비가 있었어요.'

In [11]:
# Encode the whole test set as one batch. Pad only to the longest sentence in
# the batch: padding='max_length' stretched every sentence to
# CFG.max_token_length tokens, which makes generation needlessly expensive.
# truncation=True makes max_length an actual upper bound, matching how the
# training data was tokenized.
encoded_inputs = tokenizer(
    src_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=CFG.max_token_length,
)
# Generation settings (beam search, repetition controls) all come from CFG.
translated = model.generate(
    **encoded_inputs,
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)
#print([tokenizer.decode(t, skip_special_tokens=True) for t in translated])

In [12]:
# Decode every generated sequence to text, dropping special tokens.
output = tokenizer.batch_decode(translated, skip_special_tokens=True)

# Attach the translations as a new column and export the frame (index included).
# NOTE(review): assumes one generated sequence per test row — confirm
# CFG.num_return_sequences is 1 before relying on this.
df_test['predictionsEn'] = output
df_test.to_csv("results_koen.csv")

In [13]:
## ===================================SIMPLE CHECK=========================================== ##

# Hand-picked Korean idiom sentences for a quick qualitative look at the model.
src_text = [
    '유유상종입니다.',
    '토 달지 말고 얼른 청소해!',
    '내 코가 석자라 도와 줄 수가 없네요',
    '진퇴양란이다.',
    '쥐구멍에도 볕 들 날 있다고, 우리 열심히 해 봅시다.',
    '영철이 완전 개천에서 용난 케이스야.',
    '식은 죽 먹기다.',
]

# truncation=True makes max_length effective; with padding=True alone the
# tokenizer would not cut over-long inputs.
encoded_inputs = tokenizer(
    src_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=CFG.max_token_length,
)
translated = model.generate(
    **encoded_inputs,
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)

print([tokenizer.decode(t, skip_special_tokens=True) for t in translated])



['Birds of a feather flock together.', "Don't cry over it, just clean it up!", "I have my own fish to fry, so I can't help you.", "It's between the devil and the deep blue sea.", "Every dog has his day, so let's try hard.", 'Yeongchul is a case of rags to riches.', "It's a piece of cake."]
