In [2]:
# Show the conda environments available on this machine (shell escape; the
# active environment is the one starred in the output).
!conda env list

# conda environments:
#
VAdepthENV               /home/013907062/.conda/envs/VAdepthENV
env_onmttf               /home/013907062/.conda/envs/env_onmttf
koen_base                /home/013907062/.conda/envs/koen_base
newDepth                 /home/013907062/.conda/envs/newDepth
test                     /home/013907062/.conda/envs/test
wmt_infer             *  /home/013907062/.conda/envs/wmt_infer
base                     /opt/ohpc/pub/apps/anaconda/3.9
stylegan2                /opt/ohpc/pub/apps/anaconda/3.9/envs/stylegan2



In [3]:
from transformers import AutoTokenizer, MarianMTModel, AutoTokenizer, AutoModelForSeq2SeqLM
from easydict import EasyDict
import yaml

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
import pandas as pd

# Read the idiom test set and preview its first rows.
df_test = pd.read_csv("idioms__test.csv")
display(df_test.head())

# Plain Python list holding the "en" column (the English source sentences).
df_test_en_list = df_test["en"].tolist()

Unnamed: 0,en,ko
0,"Once upon a time, there were three beautiful b...","옛날 옛적에, 세 마리의 예쁜 나비가 있었어요."
1,I felt like I have millions butterflies in my ...,너무 긴장 한 것 같았어요.
2,The deal was completely open and above board.,거래는 완전히 공개되었고 명백했습니다.
3,I left the keys on the board on your porch.,현관 게시판에 열쇠를 두고 왔어요.
4,clean your toys in the living room. no buts!,어서 거실에 있는 장난감 정리하세요. 토 달지 말고!


In [24]:
# Load the settings stored under the top-level "CFG" key of config.yaml and
# expose them with attribute access (CFG.num_beams, CFG.max_token_length, ...).
# NOTE(review): FullLoader is acceptable for a trusted, local config file;
# prefer yaml.safe_load if this file could ever come from an untrusted source.
with open("config.yaml") as config_file:
    SAVED_CFG = yaml.load(config_file, Loader=yaml.FullLoader)
CFG = EasyDict(SAVED_CFG["CFG"])

# Sentences to translate: the English column extracted from the test CSV.
src_text = df_test_en_list

# Tokenizer and model weights are both read from the same local directory.
result_path = "./results/"
tokenizer = AutoTokenizer.from_pretrained(result_path)
model = AutoModelForSeq2SeqLM.from_pretrained(result_path)

In [25]:
# Tokenize every source sentence as one padded batch.
# truncation=True is required for max_length to take effect: with
# padding=True alone, max_length is ignored and sentences longer than
# CFG.max_token_length would reach the model untruncated.
encoded_inputs = tokenizer(
    src_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=CFG.max_token_length,
)

# Generate translations with the decoding settings read from the config.
# The token ids are decoded to text in a later cell.
translated = model.generate(
    **encoded_inputs,
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)



In [28]:
# Turn each generated id sequence back into text, dropping special tokens.
output = [tokenizer.decode(seq, skip_special_tokens=True) for seq in translated]

# Attach the predictions to the test frame and persist everything.
# NOTE(review): the column assignment needs exactly one generated sequence per
# row, i.e. presumably CFG.num_return_sequences == 1 — confirm in config.yaml.
df_test['predictions'] = output
df_test.to_csv("results.csv")

## Test: spot-check idiom translations

In [14]:
# Hand-written English sentences with idiomatic expressions, used for a quick
# qualitative check of the currently loaded model.
src_text = [
    "The man had egg on him today as well as yesterday.",
    "I am peachy",
    "He started new business one year ago. As I know it, he has made a lot of dough.",
    "There's something odd about him, but I can't quite put my finger on it.",
    "She didn’t know what was causing the problem, but she finally put her finger on it.",
]

In [15]:
# NOTE(review): this cell uses whatever `model`, `tokenizer`, `CFG` and
# `src_text` are currently defined in the kernel; it loads none of them itself.

# Tokenize the batch. truncation=True is required for max_length to take
# effect: with padding=True alone, max_length is ignored and over-long
# sentences would not be cut to CFG.max_token_length.
encoded_inputs = tokenizer(
    src_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=CFG.max_token_length,
)

# Generate translations with the decoding settings read from the config.
translated = model.generate(
    **encoded_inputs,
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)

# Decode every generated sequence to text (special tokens removed) and show them.
print(tokenizer.batch_decode(translated, skip_special_tokens=True))

['그 남자는 어제뿐만 아니라 오늘에도 그에게 의심했습니다.', '나는 기분이 좋아.', '그는 1년 전에 새로운 사업을 시작했습니다. 제가 알고 있는 바에 따르면, 그는 많은 돈을 벌었습니다.', '그에게 이상한 점이 있지만, 저는 그 점에 대해 잘 알 수 없습니다.', '그녀는 무엇이 문제를 일으키는지 몰랐지만, 마침내 그 문제에 손을 들었다.']


In [16]:
# Second batch of hand-written English idiom sentences for a qualitative check.
# NOTE(review): this cell uses whatever `model`, `tokenizer` and `CFG` are
# currently defined in the kernel; it loads none of them itself.
src_text = [
    "I will play it by ear.",
    "I've got butterflies in my stomach.",
    "The crowd went bananas when the concert began.",
    "I used to get butterflies in my stomach before the tests.",
    "Things quickly went south when my phone got hacked.",
]

# Tokenize the batch. truncation=True is required for max_length to take
# effect: with padding=True alone, max_length is ignored and over-long
# sentences would not be cut to CFG.max_token_length.
encoded_inputs = tokenizer(
    src_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=CFG.max_token_length,
)

# Generate translations with the decoding settings read from the config.
translated = model.generate(
    **encoded_inputs,
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)

# Decode every generated sequence to text (special tokens removed) and show them.
print(tokenizer.batch_decode(translated, skip_special_tokens=True))

['나는 그것을 알아서 할 거야.', '저는 가슴이 두근두근합니다.', '콘서트가 시작되자 관중들은 열광했습니다.', '나는 시험 전에 기분이 좋아졌습니다.', '내 전화기가 해킹됐을 때 상황이 빠르게 악화되었습니다.']


# Comparison

## En → Ko

In [29]:
# Load the En->Ko settings stored under the top-level "CFG" key.
# NOTE(review): FullLoader is acceptable for a trusted, local config file;
# prefer yaml.safe_load if this file could ever come from an untrusted source.
with open("config_enko.yaml") as infile:
    SAVED_CFG = yaml.load(infile, Loader=yaml.FullLoader)
    CFG = EasyDict(SAVED_CFG["CFG"])

# First model of the comparison, referenced by model id rather than by a
# local path.
model_name = "QuoQA-NLP/KE-T5-En2Ko-Base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# English idiom sentences used as the shared comparison input.
src_text = [
    "I will play it by ear.",
    "I've got butterflies in my stomach.",
    "The crowd went bananas when the concert began.",
    "I used to get butterflies in my stomach before the tests.",
    "Things quickly went south when my phone got hacked.",
]

# Tokenize the batch. truncation=True is required for max_length to take
# effect: with padding=True alone, max_length is ignored and over-long
# sentences would not be cut to CFG.max_token_length.
encoded_inputs = tokenizer(
    src_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=CFG.max_token_length,
)

# Generate translations with the decoding settings read from the config.
translated = model.generate(
    **encoded_inputs,
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)

# Decode every generated sequence to text (special tokens removed) and show them.
print(tokenizer.batch_decode(translated, skip_special_tokens=True))



['나는 그것을 귀로 연주할 것입니다.', '저는 배에 나비가 생겼어요.', '콘서트가 시작되자 관중들은 바나나를 먹었다.', '나는 시험 전에 배에서 나비를 당하기도 했어요.', '내 핸드폰이 해킹을 당하자 상황이 빠르게 남갔다.']


In [30]:
# Local checkpoint directory (the name suggests a fine-tuned En->Ko model at
# checkpoint 150 — confirm).
# NOTE(review): CFG is not loaded in this cell; it relies on the config left
# in the kernel by an earlier cell.
model_name = "./KE-T5-En2Ko-Base-finetuned-en-to-ko/checkpoint-150/"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# English idiom sentences used as the shared comparison input.
src_text = [
    "I will play it by ear.",
    "I've got butterflies in my stomach.",
    "The crowd went bananas when the concert began.",
    "I used to get butterflies in my stomach before the tests.",
    "Things quickly went south when my phone got hacked.",
]

# Tokenize the batch. truncation=True is required for max_length to take
# effect: with padding=True alone, max_length is ignored and over-long
# sentences would not be cut to CFG.max_token_length.
encoded_inputs = tokenizer(
    src_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=CFG.max_token_length,
)

# Generate translations with the decoding settings read from the config.
translated = model.generate(
    **encoded_inputs,
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)

# Decode every generated sequence to text (special tokens removed) and show them.
print(tokenizer.batch_decode(translated, skip_special_tokens=True))

['제가 유동적으로 조정할 것입니다.', '가슴이 두근두근합니다.', '콘서트가 시작되었을 때 군중은 열광했습니다.', '시험 전에 너무 떨렸어.', '제 전화기가 해킹당했을 때 상황이 빠르게 악화되었습니다.']


In [33]:
# Local model directory (the name suggests the En->Ko run trained for 10
# epochs — confirm).
# NOTE(review): CFG is not loaded in this cell; it relies on the config left
# in the kernel by an earlier cell.
model_name = "./results_enko_e10/"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Same idiom sentences as the other comparison cells, plus a final sentence
# that uses "went south" in its literal sense (presumably as a control).
src_text = [
    "I will play it by ear.",
    "I've got butterflies in my stomach.",
    "The crowd went bananas when the concert began.",
    "I used to get butterflies in my stomach before the tests.",
    "Things quickly went south when my phone got hacked.",
    "we went south to meet family.",
]

# Tokenize the batch. truncation=True is required for max_length to take
# effect: with padding=True alone, max_length is ignored and over-long
# sentences would not be cut to CFG.max_token_length.
encoded_inputs = tokenizer(
    src_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=CFG.max_token_length,
)

# Generate translations with the decoding settings read from the config.
translated = model.generate(
    **encoded_inputs,
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)

# Decode every generated sequence to text (special tokens removed) and show them.
print(tokenizer.batch_decode(translated, skip_special_tokens=True))

['제가 유동적으로 조정할 것입니다.', '가슴이 두근두근합니다.', '콘서트가 시작했을 때 군중은 열광했습니다.', '시험 전에 너무 긴장해서 긴장을 많이 했어요.', '제 전화기가 해킹당했을 때 상황이 빠르게 악화되었습니다.', '우리는 가족을 만나기 위해 남쪽으로 갔어.']


## Ko → En

In [6]:
# Load the Ko->En settings stored under the top-level "CFG" key of
# config_koen.yaml.
# NOTE(review): FullLoader is acceptable for a trusted, local config file;
# prefer yaml.safe_load if this file could ever come from an untrusted source.
with open("config_koen.yaml") as infile:
    SAVED_CFG = yaml.load(infile, Loader=yaml.FullLoader)
    CFG = EasyDict(SAVED_CFG["CFG"])

# First Ko->En model of the comparison, referenced by model id rather than by
# a local path.
model_name = "QuoQA-NLP/KE-T5-Ko2En-Base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Korean sentences built around idioms and proverbs.
# NOTE(review): '진퇴양란' is a common misspelling of the idiom '진퇴양난';
# it is left unchanged so the input stays identical across the compared models.
src_text = [
    '유유상종입니다.',
    '토 달지 말고 얼른 청소해!',
    '내 코가 석자라 도와 줄 수가 없네요',
    '진퇴양란이다.',
    '쥐구멍에도 볕 들 날 있다고, 우리 열심히 해 봅시다.',
    '영철이 완전 개천에서 용난 케이스야.',
    '식은 죽 먹기다.',
]

# Tokenize the batch. truncation=True is required for max_length to take
# effect: with padding=True alone, max_length is ignored and over-long
# sentences would not be cut to CFG.max_token_length.
encoded_inputs = tokenizer(
    src_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=CFG.max_token_length,
)

# Generate translations with the decoding settings read from the config.
translated = model.generate(
    **encoded_inputs,
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)

# Decode every generated sequence to text (special tokens removed) and show them.
print(tokenizer.batch_decode(translated, skip_special_tokens=True))



["It's Yuyusangjong.", "Don't fill it up and clean it up quickly!", "I can't help you because my nose is a stone.", "It's a dysphagia.", "There is a sun in the mouse hole, let's try hard.", "It's a case where Yeongchul is in a full stream.", 'Food is eating porridge.']


In [7]:
# Local Ko->En model directory.
# NOTE(review): CFG is not loaded in this cell; it relies on the config left
# in the kernel by an earlier cell.
model_name = "./results_Ko2En"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Korean sentences built around idioms and proverbs.
# NOTE(review): '진퇴양란' is a common misspelling of the idiom '진퇴양난';
# it is left unchanged so the input stays identical across the compared models.
src_text = [
    '유유상종입니다.',
    '토 달지 말고 얼른 청소해!',
    '내 코가 석자라 도와 줄 수가 없네요',
    '진퇴양란이다.',
    '쥐구멍에도 볕 들 날 있다고, 우리 열심히 해 봅시다.',
    '영철이 완전 개천에서 용난 케이스야.',
    '식은 죽 먹기다.',
]

# Tokenize the batch. truncation=True is required for max_length to take
# effect: with padding=True alone, max_length is ignored and over-long
# sentences would not be cut to CFG.max_token_length.
encoded_inputs = tokenizer(
    src_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=CFG.max_token_length,
)

# Generate translations with the decoding settings read from the config.
translated = model.generate(
    **encoded_inputs,
    max_length=CFG.max_token_length,
    num_beams=CFG.num_beams,
    repetition_penalty=CFG.repetition_penalty,
    no_repeat_ngram_size=CFG.no_repeat_ngram_size,
    num_return_sequences=CFG.num_return_sequences,
)

# Decode every generated sequence to text (special tokens removed) and show them.
print(tokenizer.batch_decode(translated, skip_special_tokens=True))

['Birds of a feather flock together.', "Don't cry over it, just clean it up!", "I have my own fish to fry, so I can't help you.", "It's between the devil and the deep blue sea.", "Every dog has his day, so let's try hard.", 'Yeongchul is a case of rags to riches.', "It's a piece of cake."]
