# Corpus Handling

## Before running this script, make sure the dataset submodule is initialized:
```git submodule update --init```


### *The original repository of the dataset is https://github.com/jungyeul/korean-parallel-corpora*

In [None]:
import tarfile
# Path to the gzipped tar archive (the ".dev" file of korean-english-news-v1)
# located under ./dataset.
tar_name = "./dataset/korean-english-news-v1/korean-english-park.dev.tar.gz"

In [None]:
# Unpack the archive into the current working directory. The context manager
# closes the archive handle even if extraction raises.
with tarfile.open(tar_name, "r:gz") as tar:
    tar.extractall()

# Load Corpus and Preprocess

In [None]:
import numpy as np
from tqdm import tqdm
from collections import defaultdict

import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from konlpy.tag import Mecab

# Shared Mecab instance; preprocess_kor calls mecab.morphs() on it.
mecab = Mecab()


In [None]:
# Read both sides of the parallel corpus, one line per list element.
# An explicit UTF-8 encoding is used because the Korean side is not ASCII and
# the platform default encoding may not decode it; the `with` blocks close the
# file handles deterministically.
with open("korean-english-park.dev.en", "r", encoding="utf-8") as f_en:
    corpus_en = f_en.read().splitlines()
with open("korean-english-park.dev.ko", "r", encoding="utf-8") as f_ko:
    corpus_ko = f_ko.read().splitlines()

In [None]:
def preprocess_kor(kl):
    """Clean a Korean sentence and split it into morphemes.

    Args:
        kl : korean sentence
    Return:
        list of morphemes returned by mecab.morphs for the cleaned sentence
    """
    cleaned = kl.lower().strip()
    # Collapse every run of double quotes and/or spaces into a single space.
    cleaned = re.sub(r'[" "]+', " ", cleaned)
    # Everything except Hangul syllables, digits and ?.!,¿ becomes a space.
    cleaned = re.sub(r"[^가-힣0-9?.!,¿]+", " ", cleaned)
    return mecab.morphs(cleaned.strip())

In [None]:
def preprocess_eng(el):
    """Clean an English sentence and split it into word tokens.

    Args:
        el : english sentence
    Return:
        list of tokens from word_tokenize, followed by a trailing "<EOS>"
    """
    cleaned = el.lower().strip()
    # Collapse every run of double quotes and/or spaces into a single space.
    cleaned = re.sub(r'[" "]+', " ", cleaned)
    # Everything except ASCII letters, digits and ?.!,¿ becomes a space.
    cleaned = re.sub(r"[^a-zA-Z0-9?.!,¿]+", " ", cleaned)
    words = word_tokenize(cleaned.strip())
    # Mark the end of the target sentence.
    return words + ["<EOS>"]

In [None]:
%%time
# Tokenize every line of both corpora (one token list per line).
tokens_eng = [preprocess_eng(line) for line in corpus_en]
tokens_kor = [preprocess_kor(line) for line in corpus_ko]

In [None]:
# Display the first tokenized English / Korean pair.
tokens_eng[0] , tokens_kor[0]

# Make Vocabulary Dictionary

In [None]:
# Frequency of every Korean morpheme across the corpus.
wordcnt_ko = defaultdict(int)
for sentence in tqdm(tokens_kor):
    for morpheme in sentence:
        wordcnt_ko[morpheme] += 1

In [None]:
# Frequency of every English token across the corpus.
wordcnt_en = defaultdict(int)
for sentence in tqdm(tokens_eng):
    for word in sentence:
        wordcnt_en[word] += 1

In [None]:
# id -> morpheme table. Corpus morphemes start at id 3; ids 0-2 are reserved
# for the special tokens.
dict_kor = dict(enumerate(wordcnt_ko.keys(), start=3))
dict_kor.update({0: "<PAD>", 1: "<UNK>", 2: "<BOS>"})

In [None]:
# morpheme -> id table (inverse of dict_kor).
dict_kor_inv = dict(zip(dict_kor.values(), dict_kor.keys()))

In [None]:
# Display the id assigned to the <BOS> token.
dict_kor_inv["<BOS>"]

In [None]:
# id -> word table for English; ids 0-3 are reserved for the special tokens.
# preprocess_eng appends "<EOS>" to every sentence, so "<EOS>" is also a key
# of wordcnt_en. It is skipped here (together with the other specials) so that
# every token gets exactly one id, instead of "<EOS>" holding id 3 plus a
# second id that tokens2indexes_eng never produces.
special_eng = ["<PAD>", "<UNK>", "<BOS>", "<EOS>"]
dict_eng = dict(enumerate(special_eng))
dict_eng.update(
    enumerate(
        (word for word in wordcnt_en if word not in special_eng),
        start=len(special_eng),
    )
)
# word -> id table (inverse of dict_eng).
dict_eng_inv = {v: k for k, v in dict_eng.items()}

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Distribution of Korean sentence lengths (in morphemes).
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the installed version.
sns.distplot([len(sentence) for sentence in tokens_kor])

In [None]:
# Distribution of English sentence lengths (in tokens, including <EOS>).
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the installed version.
sns.distplot([len(sentence) for sentence in tokens_eng])

In [None]:
# Longest sentence on each side, measured in tokens.
maxlen_kor = max(len(sentence) for sentence in tokens_kor)
# The target language needs one additional slot for <BOS>.
maxlen_eng = 1 + max(len(sentence) for sentence in tokens_eng)
maxlen_kor, maxlen_eng

# Transform Vocabulary to Index

In [None]:
def tokens2indexes_kor(tokens, maxlen=maxlen_kor):
    """Convert a Korean morpheme list into a fixed-length int32 id vector.

    Args:
        tokens : list of morphemes
        maxlen : length of the returned vector; extra tokens are dropped
    Return:
        np.int32 array of length maxlen; unfilled positions stay 0 and
        morphemes missing from dict_kor_inv map to the "<UNK>" id
    """
    ret = np.zeros([maxlen], dtype=np.int32)
    # zip stops at whichever is shorter: maxlen positions or the token list.
    for pos, tok in zip(range(maxlen), tokens):
        ret[pos] = dict_kor_inv[tok] if tok in dict_kor_inv else dict_kor_inv["<UNK>"]
    return ret

In [None]:
# Stack the per-sentence id vectors into one 2-D array.
array_kor = np.array([tokens2indexes_kor(sentence) for sentence in tokens_kor])

In [None]:
def tokens2indexes_eng(tokens, maxlen=maxlen_eng):
    """Convert an English token list into a fixed-length int32 id vector.

    Args:
        tokens : list of word tokens
        maxlen : length of the returned vector; extra tokens are dropped
    Return:
        np.int32 array of length maxlen; unfilled positions stay 0 and
        tokens missing from dict_eng_inv map to the "<UNK>" id
    """
    ret = np.zeros([maxlen], dtype=np.int32)
    # zip stops at whichever is shorter: maxlen positions or the token list.
    for pos, tok in zip(range(maxlen), tokens):
        ret[pos] = dict_eng_inv[tok] if tok in dict_eng_inv else dict_eng_inv["<UNK>"]
    return ret

In [None]:
# Stack the per-sentence id vectors into one 2-D array.
array_eng = np.array([tokens2indexes_eng(sentence) for sentence in tokens_eng])

In [None]:
# Display the English id matrix.
array_eng

# Dataset Preparation

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the sentence pairs for validation; random_state is fixed so
# the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(array_kor, array_eng, test_size=0.2, random_state=42)

# Model loading

In [None]:
import tensorflow as tf
# NOTE(review): tf.enable_eager_execution is a TensorFlow 1.x entry point — confirm the installed TF version.
tf.enable_eager_execution()

In [None]:
from transformer import Transformer
import modules

In [None]:
# Build the model from the sequence lengths and vocabulary sizes computed above.
# NOTE(review): the meaning of the other positional arguments (100, 4, 0.5, 2)
# is defined by transformer.Transformer, which is not visible here — confirm
# against its signature.
transformer_model = Transformer(100, 4, 0.5, maxlen_kor, maxlen_eng, len(dict_kor), len(dict_eng), 2, learning_rate=3e-3, device_name="gpu:0")

In [None]:
# Call the model once on a single sample pair, then print its summary.
# NOTE(review): presumably the call is needed so the layers are built before
# summary() — confirm.
transformer_model(tf.convert_to_tensor(array_kor[:1]), tf.convert_to_tensor(array_eng[:1]), True)
transformer_model.summary(line_length=100, positions=[.70, .80, .90, 1.])

In [None]:
# If you get an out-of-memory (OOM) error, try reducing batch_size.
transformer_model.fit(X_train, y_train , X_val, y_val, bos_index=dict_eng_inv["<BOS>"], batch_size=32, tqdm_option="normal", epochs=10)

# inference

In [None]:
# Start token for the initial target tensor: a (1, maxlen_eng) row whose first
# entry is the <BOS> id and whose remaining entries are 0 (<PAD>).
start_token = tf.convert_to_tensor(np.expand_dims(tokens2indexes_eng(["<BOS>"]), 0))
start_token

In [None]:
# Pick one training sample (kept 2-D via slicing) and show its Korean tokens.
input_seq = X_train[1234:1235] # 1 batch
seq = input_seq[0]
[dict_kor[idx] for idx in seq]

In [None]:
# Matching English target for the same sample, rendered back as text.
target_seq = y_train[1234:1235]
seq = target_seq[0]
" ".join([dict_eng[idx] for idx in seq])

In [None]:
# Convert the numpy sample to a TensorFlow tensor.
input_seq = tf.convert_to_tensor(input_seq)

In [None]:
# Shift the target one step to the right: prepend <BOS> along the sequence
# axis and drop the last position so the length is unchanged.
target_seq_shifted = tf.pad(target_seq, [[0, 0], [1, 0]], constant_values=dict_eng_inv["<BOS>"])[:, :-1]

In [None]:
# Auto-encoding check: feed the shifted ground-truth target, take the argmax
# over the last axis of the logits, and render the predicted ids as text.
logit = transformer_model(input_seq, target_seq_shifted, False)
indexes = tf.argmax(logit, axis=2).numpy()[0]
" ".join([dict_eng[idx] for idx in indexes])

In [None]:
# Beam search with 3 beams, starting from start_token and passing the <EOS> id.
# NOTE(review): the loop below unpacks each result as a (prob, beam) pair —
# confirm against Transformer.predict.
beams = transformer_model.predict(input_seq, start_token, dict_eng_inv["<EOS>"], beam_cnt=3)


In [None]:
# Print each beam as text, skipping ids 0 (<PAD>), 2 (<BOS>) and 3 (<EOS>).
for _score, candidate in beams:
    decoded = candidate.numpy()[0]
    print(" ".join(dict_eng[idx] for idx in decoded if idx not in (0, 2, 3)))
    print()

In [None]:
# Beam search again (same call as the earlier beam-search cell).
beams = transformer_model.predict(input_seq, start_token, dict_eng_inv["<EOS>"], beam_cnt=3)


In [None]:
# Print each beam as text, skipping ids 0 (<PAD>), 2 (<BOS>) and 3 (<EOS>).
for _score, candidate in beams:
    decoded = candidate.numpy()[0]
    print(" ".join(dict_eng[idx] for idx in decoded if idx not in (0, 2, 3)))
    print()