In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu
import random
from copy import deepcopy
from tqdm.auto import trange

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
train_path = "en_kr_data/train/ko2en_training_csv/ko2en_medical_1_training.csv"
val_path = "en_kr_data/val/ko2en_validation_csv/ko2en_medical_2_validation.csv" 
train_df = pd.read_csv(train_path, sep=',')
val_test_df = pd.read_csv(val_path, sep=',')
val_df, test_df = val_test_df[:len(val_test_df)//2], val_test_df[len(val_test_df)//2:]
print(len(train_df), len(val_df), len(test_df))

#sort sentences by length
#prune sentences over 20 words
#train set small
#get scores for model

200000 12500 12500


In [None]:
#kor_Hang

name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(name)
nllb = AutoModelForSeq2SeqLM.from_pretrained(name)

In [None]:
tokenizer.src_lang = "kor_Hang"
inputs = tokenizer(text="다만 하반기에도 이같은 기조가 이어질 지에 대해서는 의견이 분분하다.", return_tensors="pt")
translated_tokens = nllb.generate(
    **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"]
)
print(tokenizer.decode(translated_tokens[0], skip_special_tokens=True))



However, there is a difference of opinion as to whether this trend will continue in the second half.


In [None]:
tokenizer.src_lang = "kor_Hang"
inputs = tokenizer(text=train_df['한국어'][0], return_tensors="pt")
translated_tokens = nllb.generate(
    **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"]
)
print(tokenizer.decode(translated_tokens[0], skip_special_tokens=True))
print(train_df['영어'][0])



Although the wound is smaller than the surgical procedure and the risk of infection is lower, there is a limit to the complete removal of the placental endometrial components with a needle.
Although the risk of infection is lower as the wound is smaller than during the surgical method, but there is a limit to completely removing the thick and sticky endometriosis composition with a needle.


In [None]:
batch_size = 16
test_df_small = test_df#[:1000]
#batches = [test_df.iloc[i:i + batch_size] for i in range(0, len(test_df), batch_size)]
batches = [test_df_small.iloc[i:i + batch_size] for i in range(0, len(test_df_small), batch_size)]

english_translations = []

for df_batch in tqdm(batches):
    inputs = tokenizer(text=df_batch['한국어'].tolist(), return_tensors="pt", padding=True, truncation=True)
    translated_tokens = nllb.generate(
        **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"]
    )
    translated_sentences = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    
    #print(translated_sentences)
    english_translations += translated_sentences

df_result = pd.concat(batches, ignore_index=True)
#print(df_result)
df_result.head()

100%|██████████| 782/782 [1:37:58<00:00,  7.52s/it]  


Unnamed: 0,sid,분야,한국어,영어,한국어_어절수,영어_단어수,길이_분류,난이도,수행기관
0,212501,의료/보건,"김시몬 신천지 대변인은 지난 23일 유튜브 등 사회관계망서비스를 통해 ""코로나 19...",Shincheonji spokesman Kim Simon said through s...,24,42,4,상,에버트란
1,212502,의료/보건,종근당이 종합구충제 '젤콤'으로 온 가족 기생충을 한 번에 잡는다.,Chong Kun Dang catches a family parasite at on...,9,14,1,하,에버트란
2,212503,의료/보건,"정 본부장은 ""중국에 다녀오신 분들, 주로 의료계나 시설 종사자분들께는 업무 배제 ...","Director Jung said, ""We are making requests fo...",15,23,3,하,에버트란
3,212504,의료/보건,"법원이 간호사를 상대로 한 의사의 부적절한 발언을 성희롱으로 보고, 해당 의사는 물...",The court considered the doctor's inappropriat...,18,28,3,중,에버트란
4,212505,의료/보건,"피험자들의 연령과 인구통계학적 요소, 건강 관련 행동 등을 모두 고려해도 결과는 달...",The results did not change even if the subject...,13,18,2,하,에버트란


In [None]:
english_translations_split = [t.split() for t in english_translations]

english_sentences = test_df['영어'].tolist()#[:1000]
english_sentences_split = [[s.split()] for s in english_sentences]

print(corpus_bleu(english_sentences_split, english_translations_split))

0.1867908859829627


In [22]:
with open('nllb_baseline.txt', 'w') as of:
    for t in english_translations:
        of.write(t+'\n')

In [None]:
from transformers.optimization import Adafactor
from transformers import get_constant_schedule_with_warmup
#nllb.cuda();
optimizer = Adafactor(
    [p for p in nllb.parameters() if p.requires_grad],
    scale_parameter=False,
    relative_step=False,
    lr=1e-4,
    clip_threshold=1.0,
    weight_decay=1e-3,
)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=100)

In [None]:
data_train = train_df.copy()
data_train = data_train.filter(['한국어', '영어'], axis=1)


In [None]:
LANGS = [('영어', 'en_Latn'), ('한국어', 'kor_Hang')]

def get_batch_pairs(batch_size, data=data_train[:1000]):
    (l1, long1), (l2, long2) = random.sample(LANGS, 2)
    x, y = [], []
    for _ in range(batch_size):
        item = data.iloc[random.randint(0, len(data)-1)]
        x.append(item[l1])
        y.append(item[l2])
    return x, y, long1, long2

print(get_batch_pairs(1, data_train[:1000]))

(['또 이와 관련, 양국 관계자들이 모여 공동선언문을 발표할 예정이다.'], ['And regarding this, officials from the two countries will also gather to issue a joint declaration.'], 'kor_Hang', 'en_Latn')


In [None]:
batch_size = 16 
max_length = 60 
training_steps = 2000 
losses = [] 
MODEL_SAVE_PATH = '/models'

In [None]:
nllb_new = deepcopy(nllb)
nllb_new.train()
x, y, loss = None, None, None

tq = trange(len(losses), training_steps)
for i in tq:
    x, y, lang1, lang2 = get_batch_pairs(batch_size)
    try:
        tokenizer.src_lang = lang1
        x = tokenizer(x, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(nllb_new.device)
        tokenizer.src_lang = lang2
        y = tokenizer(y, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(nllb_new.device)
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

        loss = nllb_new(**x, labels=y.input_ids).loss
        loss.backward()
        losses.append(loss.item())

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    except: 
        optimizer.zero_grad(set_to_none=True)
        x, y, loss = None, None, None
        print('error', max(len(s) for s in x + y), e)
        continue

    if i % 1000 == 0:
        print(i, np.mean(losses[-1000:]))

    if i % 1000 == 0 and i > 0:
        nllb_new.save_pretrained(MODEL_SAVE_PATH)
        tokenizer.save_pretrained(MODEL_SAVE_PATH)

  0%|          | 1/2000 [00:15<8:32:07, 15.37s/it]

0 2.1635966300964355


 10%|▉         | 190/2000 [1:13:34<17:24:52, 34.64s/it]