In [1]:
import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    PreTrainedTokenizerFast
)

import nltk
from datetime import datetime

from typing import Optional

In [2]:
class MyTokenizer(PreTrainedTokenizerFast):
    def __init__(self,*args, **kwargs):
        super().__init__(*args, **kwargs)
    
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
        return (1,)
    

In [3]:
model_name = 'gogamza/kobart-base-v2'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer = MyTokenizer.from_pretrained(model_name, legacy_format=False)

In [72]:
encoder_max_length = 256
decoder_max_length = 64

In [73]:
import os
import json
import pandas as pd
from pandas import json_normalize

json_data = []

for filename in os.listdir("korea_data"):
   with open(os.path.join("korea_data", filename), 'r') as f:
       json_data.append(json.load(f))

In [74]:
df = pd.concat([json_normalize(json_data[i]['data']) for i in range(len(json_data))])

In [75]:
json_data[0].keys()

dict_keys(['numberOfItems', 'data'])

In [76]:
del_list = [ 'header.dialogueInfo.numberOfParticipants',
       'header.dialogueInfo.numberOfUtterances',
       'header.dialogueInfo.numberOfTurns', 'header.dialogueInfo.type',
       'header.dialogueInfo.topic', 'header.participantsInfo']

In [77]:
df.iloc[0].keys()

Index(['header.dialogueInfo.dialogueID',
       'header.dialogueInfo.numberOfParticipants',
       'header.dialogueInfo.numberOfUtterances',
       'header.dialogueInfo.numberOfTurns', 'header.dialogueInfo.type',
       'header.dialogueInfo.topic', 'header.participantsInfo', 'body.dialogue',
       'body.summary'],
      dtype='object')

In [78]:
import datasets.arrow_dataset as da

In [79]:
data = da.Dataset.from_pandas(df)

In [80]:
change_df = df.drop(del_list, axis=1)

In [81]:
change_df.columns=["id",  "dialogue", "summary"]

In [82]:
dataset[0]

{'__index_level_0__': 0,
 'dialogue': 'P01: 수유로오셈^^\r\nP02: 지금?\r\nP02: 지금가도대?\r\nP01: 지금이겠냐?\r\nP01: 되겠냐?\r\nP01: 집 데려다주냐?\r\nP02: 왜안댐?\r\nP02: ㅋㅋㅋㅋ\r\nP02: 너가 나데려다줘야지\r\nP02: 내가 수유가는데',
 'summary': '지금 수유에 갈 테니 집에 데려다 달라고 한다.'}

In [83]:
len(dataset)

35004

In [84]:
len(change_df)

35004

In [85]:
change_df

Unnamed: 0,id,dialogue,summary
0,71e6e6d6-b33d-530a-b930-f5f30a5c5b2b,"[{'utteranceID': 'U1', 'turnID': 'T1', 'partic...",지금 수유에 갈 테니 집에 데려다 달라고 한다.
1,9370a6e1-36e8-567c-bad2-1ffd4ffe88b7,"[{'utteranceID': 'U1', 'turnID': 'T1', 'partic...",2박 3일 정도 쉬는 날을 몰아서 제주도에 갔다 오기로 했다.
2,0a5a7dff-e535-53a9-988b-d607b82dbb62,"[{'utteranceID': 'U1', 'turnID': 'T1', 'partic...",메가박스 매표소에서 2시 45분에 만나기로 약속했다.
3,5d1e0030-9237-5403-bf23-dfe88c7c0947,"[{'utteranceID': 'U1', 'turnID': 'T1', 'partic...",내일 상황을 봐서 정하자고 카페를 가서 힐링해야 한다고 한다.
4,23b693b7-9326-5ed0-b3d1-a16dfc6bc001,"[{'utteranceID': 'U1', 'turnID': 'T1', 'partic...",모레 김장 김치와 수육을 먹을 테니 집으로 오라고 한다.
...,...,...,...
1635,0c1b7107-d6fd-5c36-a27f-25b3a575c174,"[{'utteranceID': 'U1', 'turnID': 'T1', 'partic...",유명한 회화 영어학원이 생각이 모자라고 행동이 어리석다.
1636,ec655c80-1e69-518d-9b53-5c5fc3f34a24,"[{'utteranceID': 'U1', 'turnID': 'T1', 'partic...",수학 문제를 푸는데 문제가 이상해서 답이 다르게 나온다고 한다.
1637,cebecc9b-fb3f-5aab-9476-de0dcd448ef1,"[{'utteranceID': 'U1', 'turnID': 'T1', 'partic...",수요일에 시험 끝나고 도서관에 가서 공부를 하자고 하니 도서관은 숨이 안 쉬어져서 ...
1638,d98d7690-24ba-5dfa-b57d-7d79cad81749,"[{'utteranceID': 'U1', 'turnID': 'T1', 'partic...",열심히 하고는 있지만 공부할 시간이 부족해 괴로워하고 있다.


In [86]:
import datasets.arrow_dataset as da

In [87]:
data = da.Dataset.from_pandas(change_df)

In [88]:
def flatten(example):
    
    dialogue_list = []

    for dict_data in example['dialogue']:
        return_string = ""
        for string in dict_data:
            return_string += string['participantID']+": "+string['utterance']+"\r\n"
        
        dialogue_list.append(return_string[:-2])

    return{
        "dialogue" : dialogue_list,
        "summary" : example['summary']
    }

In [89]:
dataset = data.map(flatten, remove_columns=['id'], batched = True)
dataset[0]

HBox(children=(FloatProgress(value=0.0, max=36.0), HTML(value='')))




{'__index_level_0__': 0,
 'dialogue': 'P01: 수유로오셈^^\r\nP02: 지금?\r\nP02: 지금가도대?\r\nP01: 지금이겠냐?\r\nP01: 되겠냐?\r\nP01: 집 데려다주냐?\r\nP02: 왜안댐?\r\nP02: ㅋㅋㅋㅋ\r\nP02: 너가 나데려다줘야지\r\nP02: 내가 수유가는데',
 'summary': '지금 수유에 갈 테니 집에 데려다 달라고 한다.'}

In [90]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    source, target = batch["dialogue"], batch["summary"]
    source_tokenized = tokenizer(
        source, padding="max_length", truncation=True, max_length=max_source_length
    )
    target_tokenized = tokenizer(
        target, padding="max_length", truncation=True, max_length=max_target_length
    )

    batch = {k: v for k, v in source_tokenized.items()}
    # Ignore padding in the loss
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]
    return batch

In [91]:
train_data_txt, validation_data_txt = dataset.train_test_split(test_size=0.05).values()

train_data = train_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [92]:
nltk.download("punkt", quiet=True)

metric = datasets.load_metric("rouge")

In [93]:
!pip install kss

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [94]:
import kss
import wandb
from tqdm import tqdm

In [95]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    
    # rougeLSum expects newline after each sentence
    preds = ["\n".join(kss.split_sentences(pred)) for pred in tqdm(preds)]
    labels = ["\n".join(kss.split_sentences(label)) for label in tqdm(labels)]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
wandb.init(project="example", entity="miml", name= 'korean')

training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=3,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,  # demo
    per_device_eval_batch_size=4,
    gradient_accumulation_steps = 4,
    learning_rate=5e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=50,
    save_total_limit=3,
    fp16=True,
    fp16_opt_level= 'O1',
    evaluation_strategy= 'steps',
    eval_steps = 2000
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/gen_len,▁█▄
eval/loss,█▁▁
eval/rouge1,▁█▇
eval/rouge2,▁▆█
eval/rougeL,▁█▇
eval/rougeLsum,▁█▇
eval/runtime,▁██
eval/samples_per_second,█▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████

0,1
eval/gen_len,18.6294
eval/loss,3.40822
eval/rouge1,9.3009
eval/rouge2,1.7971
eval/rougeL,9.188
eval/rougeLsum,9.2143
eval/runtime,2978.482
eval/samples_per_second,0.588
train/epoch,3.0
train/global_step,6234.0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
trainer.train()

In [None]:
def generate_summary(test_samples, model):
    inputs = tokenizer(
        test_samples["dialogue"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str

In [None]:
summaries_after_tuning = generate_summary(validation_data_txt, model)[1]

In [None]:
summaries_after_tuning[0]

In [None]:
summaries_after_tuning[104]

In [None]:
validation_data_txt[104]