In [1]:
!conda env list

# conda environments:
#
VAdepthENV               /home/013907062/.conda/envs/VAdepthENV
cmpe249                  /home/013907062/.conda/envs/cmpe249
env_onmttf               /home/013907062/.conda/envs/env_onmttf
hawaii                   /home/013907062/.conda/envs/hawaii
hawaii_hf                /home/013907062/.conda/envs/hawaii_hf
koen_base                /home/013907062/.conda/envs/koen_base
newDepth                 /home/013907062/.conda/envs/newDepth
seamless                 /home/013907062/.conda/envs/seamless
test                     /home/013907062/.conda/envs/test
wmt_infer             *  /home/013907062/.conda/envs/wmt_infer
base                     /opt/ohpc/pub/apps/anaconda/3.9
stylegan2                /opt/ohpc/pub/apps/anaconda/3.9/envs/stylegan2



In [2]:
import pandas as pd
import numpy as np
import multiprocessing
from easydict import EasyDict
import yaml
from datasets import load_dataset, load_metric, Dataset

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

  from .autonotebook import tqdm as notebook_tqdm


# DATA

In [80]:
# dataset
raw_train_1 = pd.read_csv("data_train_1.csv")
display(raw_train_1.head())
raw_train_1.info()
raw_train_1[raw_train_1['ko'].isnull()].index.tolist()

Unnamed: 0,Idioms,en,ko
0,No buts.,You're grounded. No buts.,외출금지니까 토 달지마.
1,,Finish your meal. No buts.,잔말말고 다 먹어.
2,,Do your homework. No buts.,어서가서 숙제해. 예외는 없는 거야.
3,,No buts!,토 달지 말고!
4,,Clean your room. No buts.,청소해. 토 달지 말고.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2444 entries, 0 to 2443
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Idioms  36 non-null     object
 1   en      2444 non-null   object
 2   ko      2444 non-null   object
dtypes: object(3)
memory usage: 57.4+ KB


[]

In [81]:
raw_train_2 = pd.read_csv("data_train_2.csv")
display(raw_train_2.head())
raw_train_2.info()
#raw_train_2[raw_train_2['ko'].isnull()].index.tolist()

Unnamed: 0,Idioms,en,ko
0,A bad break,"Tony has lost his job, just when he needed the...",Tony는 집을 이사하기 위해 현금이 필요할 때 직장을 잃었습니다. 그것은 운이 나...
1,,Tanya’s leg was crushed when the rock fell on ...,Tanya의 다리는 바위가 떨어지자 부서졌습니다. 의사는 정말 운이 나빴다고 말했습니다.
2,,Some may say that I have been given a bad brea...,누군가는 내가 삶에서 불운을 받아왔다고 말할지도 모릅니다.
3,A bad hair day,"Do we talk about a bad hair day? In my case, I...",일진이 사나운 날에 대해서 얘기해 볼까요? 저 같은 경우는 버스를 놓쳐서 사장님이 ...
4,,It's a bad hair day!,오늘 정말 일진이 안 좋네!


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 791 entries, 0 to 790
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Idioms  196 non-null    object
 1   en      791 non-null    object
 2   ko      791 non-null    object
dtypes: object(3)
memory usage: 18.7+ KB


In [82]:
raw_train = pd.concat([raw_train_1, raw_train_2])
display(raw_train.head())
raw_train.info()

Unnamed: 0,Idioms,en,ko
0,No buts.,You're grounded. No buts.,외출금지니까 토 달지마.
1,,Finish your meal. No buts.,잔말말고 다 먹어.
2,,Do your homework. No buts.,어서가서 숙제해. 예외는 없는 거야.
3,,No buts!,토 달지 말고!
4,,Clean your room. No buts.,청소해. 토 달지 말고.


<class 'pandas.core.frame.DataFrame'>
Index: 3235 entries, 0 to 790
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Idioms  232 non-null    object
 1   en      3235 non-null   object
 2   ko      3235 non-null   object
dtypes: object(3)
memory usage: 101.1+ KB


In [83]:
raw_train.to_csv("a_data_train.csv", index=False)
column_list = ['en', 'ko']
raw_train.to_csv('data_train.csv', columns = column_list,index=False)

In [39]:
idioms_train_en = raw_train['en'].tolist()
idioms_train_ko = raw_train['ko'].tolist()
print( len(idioms_train_en) , len(idioms_train_ko) )

3235 3235


In [29]:
raw_test = pd.read_csv("data_test.csv")
display(raw_test.head())
raw_test.info()
#raw_test[raw_test['ko'].isnull()].index.tolist()

Unnamed: 0,en,ko
0,"Once upon a time, there were three beautiful b...","옛날 옛적에, 세 마리의 예쁜 나비가 있었어요."
1,I felt like I have millions butterflies in my ...,너무 긴장 한 것 같았어요.
2,The deal was completely open and above board.,거래는 완전히 공개되었고 명백했습니다.
3,I left the keys on the board on your porch.,현관 게시판에 열쇠를 두고 왔어요.
4,clean your toys in the living room. no buts!,어서 거실에 있는 장난감 정리하세요. 토 달지 말고!


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   en      277 non-null    object
 1   ko      277 non-null    object
dtypes: object(2)
memory usage: 4.5+ KB


In [31]:
column_list = ['en', 'ko']
raw_test[:5].to_csv('data_test_5.csv', columns = column_list,index=False)

In [29]:
idioms_test_en = raw_test['en'].tolist()
idioms_test_ko = raw_test['ko'].tolist()
print( len(idioms_test_en) , len(idioms_test_ko) )

277 277


# BASELINE

In [45]:
import pandas as pd

# EN =======> KO

df_test = pd.read_csv("data_test.csv")
display(df_test.head())
src_text = df_test['en'].values.tolist()

with open("config_enko.yaml") as infile:
    SAVED_CFG = yaml.load(infile, Loader=yaml.FullLoader)
    CFG = EasyDict(SAVED_CFG["CFG"])

#EN KO
model_name = "QuoQA-NLP/KE-T5-En2Ko-Base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

translated = model.generate(
    **tokenizer(src_text, return_tensors="pt", padding='max_length', max_length=CFG.max_token_length,),
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)

output = []
for t in translated:
    output.append(tokenizer.decode(t, skip_special_tokens=True))
    
df_test['Results_baseline_enko'] = output

# KO =======> EN

src_text = df_test['ko'].values.tolist()

'''
with open("config_koen.yaml") as infile:
    SAVED_CFG = yaml.load(infile, Loader=yaml.FullLoader)
    CFG = EasyDict(SAVED_CFG["CFG"])
'''

model_name = "QuoQA-NLP/KE-T5-Ko2En-Base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

translated = model.generate(
    **tokenizer(src_text, return_tensors="pt", padding='max_length', max_length=CFG.max_token_length,),
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)

output = []
for t in translated:
    output.append(tokenizer.decode(t, skip_special_tokens=True))
    
df_test['Results_baseline_koen'] = output

## save file
df_test.to_csv("Results_baseline.csv")

Unnamed: 0,en,ko
0,"Once upon a time, there were three beautiful b...","옛날 옛적에, 세 마리의 예쁜 나비가 있었어요."
1,I felt like I have millions butterflies in my ...,너무 긴장 한 것 같았어요.
2,The deal was completely open and above board.,거래는 완전히 공개되었고 명백했습니다.
3,I left the keys on the board on your porch.,현관 게시판에 열쇠를 두고 왔어요.
4,clean your toys in the living room. no buts!,어서 거실에 있는 장난감 정리하세요. 토 달지 말고!


In [57]:
df_base = pd.read_csv("Results_baseline.csv")
preds_enko = df_base["Results_baseline_enko"].tolist()
preds_koen = df_base["Results_baseline_koen"].tolist()
labels_ko = np.expand_dims(df_base["ko"], axis=1).tolist()
labels_en = np.expand_dims(df_base["en"], axis=1).tolist()
display(df_base.head())

Unnamed: 0.1,Unnamed: 0,en,ko,Results_baseline_enko,Results_baseline_koen
0,0,"Once upon a time, there were three beautiful b...","옛날 옛적에, 세 마리의 예쁜 나비가 있었어요.",옛날에는 세 마리의 아름다운 나비가 있었다.,"In the old days, there were three pretty butte..."
1,1,I felt like I have millions butterflies in my ...,너무 긴장 한 것 같았어요.,나는 위에 수백만 마리의 나비가 있는 것 같았어.,I felt so nervous.
2,2,The deal was completely open and above board.,거래는 완전히 공개되었고 명백했습니다.,그 거래는 완전히 개방되었고 이사회 위에 있었다.,The transaction was completely disclosed and c...
3,3,I left the keys on the board on your porch.,현관 게시판에 열쇠를 두고 왔어요.,나는 당신의 porch에 키를 보드에 두고 왔어요.,I left the key on the front door bulletin board.
4,4,clean your toys in the living room. no buts!,어서 거실에 있는 장난감 정리하세요. 토 달지 말고!,거실에 있는 장난감을 깨끗하게 청소하세요! 안녕!,"Let's clean up the toys in the living room, do..."


In [60]:
df_base = pd.read_csv("Results_baseline.csv")
preds_enko = df_base["Results_baseline_enko"].tolist()
preds_koen = df_base["Results_baseline_koen"].tolist()
labels_ko = np.expand_dims(df_base["ko"], axis=1).tolist()
labels_en = np.expand_dims(df_base["en"], axis=1).tolist()
display(df_base.head())

import sacrebleu
score = sacrebleu.corpus_bleu(preds_enko, labels_ko, tokenize="none", lowercase=True)
print(score)
score = sacrebleu.corpus_bleu(preds_koen, labels_en, tokenize="none", lowercase=True)
print(score)

BLEU = 19.30 66.7/20.0/12.5/8.3 (BP = 1.000 ratio = 1.000 hyp_len = 6 ref_len = 6)
BLEU = 26.08 77.8/50.0/14.3/8.3 (BP = 1.000 ratio = 1.000 hyp_len = 9 ref_len = 9)


In [75]:
df_base = pd.read_csv("Results_baseline.csv")
preds_enko = df_base["Results_baseline_enko"]
preds_koen = df_base["Results_baseline_koen"]
labels_ko = np.expand_dims(df_base["ko"], axis=1)
labels_en = np.expand_dims(df_base["en"], axis=1)
display(df_base.head())

score = metric.compute(predictions=preds_enko, references=labels_ko)
print(score)
score = metric.compute(predictions=preds_koen, references=labels_en)
print(score)

Unnamed: 0.1,Unnamed: 0,en,ko,Results_baseline_enko,Results_baseline_koen
0,0,"Once upon a time, there were three beautiful b...","옛날 옛적에, 세 마리의 예쁜 나비가 있었어요.",옛날에는 세 마리의 아름다운 나비가 있었다.,"In the old days, there were three pretty butte..."
1,1,I felt like I have millions butterflies in my ...,너무 긴장 한 것 같았어요.,나는 위에 수백만 마리의 나비가 있는 것 같았어.,I felt so nervous.
2,2,The deal was completely open and above board.,거래는 완전히 공개되었고 명백했습니다.,그 거래는 완전히 개방되었고 이사회 위에 있었다.,The transaction was completely disclosed and c...
3,3,I left the keys on the board on your porch.,현관 게시판에 열쇠를 두고 왔어요.,나는 당신의 porch에 키를 보드에 두고 왔어요.,I left the key on the front door bulletin board.
4,4,clean your toys in the living room. no buts!,어서 거실에 있는 장난감 정리하세요. 토 달지 말고!,거실에 있는 장난감을 깨끗하게 청소하세요! 안녕!,"Let's clean up the toys in the living room, do..."


{'score': 9.367361940458151, 'counts': [972, 299, 134, 55], 'totals': [2733, 2456, 2179, 1902], 'precisions': [35.56531284302964, 12.174267100977199, 6.1496099128040385, 2.891692954784437], 'bp': 1.0, 'sys_len': 2733, 'ref_len': 2696}
{'score': 17.91036792930159, 'counts': [1831, 809, 418, 219], 'totals': [3724, 3447, 3170, 2893], 'precisions': [49.16756176154672, 23.46968378299971, 13.186119873817034, 7.569996543380574], 'bp': 0.9721982944214319, 'sys_len': 3724, 'ref_len': 3829}


# TRAIN EN - KO

In [None]:
!python trainenko.py

# TRAIN KO-EN

In [None]:
!python trainkoen.py

# TEST STUFF

In [71]:
sample_set = load_dataset("csv", data_files={'test': 'data_test_5.csv'})
tokenizer = AutoTokenizer.from_pretrained("QuoQA-NLP/KE-T5-En2Ko-Base", local_files_only=True)

with open("config_enko.yaml") as infile:
    SAVED_CFG = yaml.load(infile, Loader=yaml.FullLoader)
    CFG = EasyDict(SAVED_CFG["CFG"])
    
def preprocess_function(examples):
    inputs = examples[CFG.src_language]
    targets = examples[CFG.tgt_language]
    model_inputs = tokenizer(inputs, max_length=CFG.max_token_length, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=CFG.max_token_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

if tokenizer:
    tokenized_datasets = sample_set.map(
        preprocess_function,
        batched=True,
        num_proc=1,
        remove_columns=sample_set["test"].column_names,
    )#The default batch size is 1000, but you can adjust it with the batch_size argument
    tokenized_datasets.set_format("torch")
    
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5
    })
})

In [43]:
model = AutoModelForSeq2SeqLM.from_pretrained(CFG.model_name, local_files_only=True)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

batch = data_collator([tokenized_datasets["test"][i] for i in range(1, 3)])
print(batch.keys()) #(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])


In [64]:
def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels

In [67]:
from torch.utils.data import DataLoader
from tqdm import tqdm

eval_dataloader = DataLoader(tokenized_datasets["test"], collate_fn=data_collator, batch_size=1)
model.eval()
for batch in tqdm(eval_dataloader):
    with torch.no_grad():
        #batch = {k: v.to(device) for k, v in batch.items()}
        generated_tokens = model.generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=128,
        )
    labels = batch["labels"]
    decoded_preds, decoded_labels = postprocess(generated_tokens, labels)
    metric.add_batch(predictions=decoded_preds, references=decoded_labels)

results = metric.compute()
print(f"BLEU score: {results['score']:.2f}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.14s/it]

BLEU score: 6.56



