In [None]:
import os
import re

import torch
import transformers
import fastai
from transformers import AutoModelForCausalLM, AutoModelWithLMHead, PreTrainedTokenizerFast
from fastai.text.all import *
from tqdm.notebook import tqdm

In [None]:
# Record the library versions this notebook was run with (one per line).
print(
    torch.__version__,
    transformers.__version__,
    fastai.__version__,
    sep='\n',
)

In [None]:
# --- Notebook-wide configuration -------------------------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Checkpoint name handed to the `from_pretrained` loaders.
PRETRAINED_MODEL = 'skt/kogpt2-base-v2'

# Batch size and sequence length passed to the language-model dataloaders.
BATCH_SIZE, SEQ_LENGTH = 8, 256

# `max_length` passed to `generate` when sampling text.
MAX_LENGTH = 128

# Directory whose files are read and concatenated into the training corpus.
DATASET_PATH = './dataset/jjaltoon_scripts_10_raw/'

In [None]:
# Load the tokenizer for the pretrained checkpoint, spelling out the special
# tokens explicitly so pad/bos/eos ids are available for generation later.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    PRETRAINED_MODEL,
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
    bos_token='<s>',
    eos_token='</s>'
)

# `AutoModelWithLMHead` is deprecated in transformers; for a causal
# (GPT-2 style) checkpoint the supported entry point is `AutoModelForCausalLM`.
model = AutoModelForCausalLM.from_pretrained(PRETRAINED_MODEL)

In [None]:
# Quick sanity check: show how the tokenizer splits a mixed Korean/English sentence.
sample_sentence = 'GPT-2 토크나이저 테스트. 안녕하세요.'
tokenizer.tokenize(sample_sentence)

In [None]:
# Baseline: generate from the *un-fine-tuned* model so the result can be
# compared with the output after training.
text = '오늘의 메뉴는'
input_ids = tokenizer.encode(text)

# Generation settings collected in one place; passed through unchanged.
generation_kwargs = dict(
    max_length=MAX_LENGTH,
    repetition_penalty=2.0,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

# `generate` expects a batch dimension, hence the extra list around the ids.
gen_ids = model.generate(torch.tensor([input_ids]), **generation_kwargs)

# Decode the first (only) sequence in the batch back to text.
generated = tokenizer.decode(gen_ids[0].tolist())
generated

In [None]:
# Read every file in DATASET_PATH, collapse all runs of whitespace to single
# spaces, and join the files into one long training string.
texts = []

# os.listdir returns entries in arbitrary order; sort so the concatenated
# corpus (and therefore the later train/test split) is reproducible.
file_names = sorted(os.listdir(DATASET_PATH))
for file_name in tqdm(file_names, desc='input data files'):
    file_path = os.path.join(DATASET_PATH, file_name)
    # Skip sub-directories (e.g. `.ipynb_checkpoints`) that open() cannot read.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, mode='r', encoding='utf-8') as file:
        file_content = file.read()
    texts.append(' '.join(file_content.split()))

data = ' '.join(texts)
len(data)

In [None]:
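# Normalization (currently disabled): would strip "(계속)…" spans up to the next ●/○ marker, then the remaining ●/○ markers.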
#data = re.sub('\(계속\).*?[●○]', '', data)
#data = re.sub('[●○]', '', data)
#len(data)

In [None]:
class TransformersTokenizer(Transform):
    """fastai `Transform` that wraps a Hugging Face tokenizer.

    `encodes` turns raw text into a tensor of token ids and `decodes`
    turns a tensor of ids back into displayable text.
    """

    def __init__(self, tokenizer):
        # Tokenizer used for both directions of the transform.
        self.tokenizer = tokenizer

    def encodes(self, x):
        """Tokenize the text `x` and return its token ids as a tensor."""
        token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(x))
        return tensor(token_ids)

    def decodes(self, x):
        """Decode the id tensor `x` into a `TitledStr`."""
        # Move to the CPU first so the ids can be converted to a numpy array.
        ids = x.cpu().numpy()
        return TitledStr(self.tokenizer.decode(ids))

In [None]:
# Character-level 90/10 split of the corpus string: the first 90% is used
# for training and the remaining tail for validation.
train_end = int(len(data) * 0.9)
train_data, test_data = data[:train_end], data[train_end:]

# Index 0 of the item list is the training text, index 1 the validation text.
splits = [[0], [1]]

In [None]:
# Two items (train text, validation text), each tokenized by the wrapped
# Hugging Face tokenizer and served through fastai's language-model loader.
tls = TfmdLists(
    [train_data, test_data],
    TransformersTokenizer(tokenizer),
    splits=splits,
    dl_type=LMDataLoader,
)

# Batch the token stream using the notebook-wide batch size / sequence length.
dls = tls.dataloaders(bs=BATCH_SIZE, seq_len=SEQ_LENGTH)

# Preview a couple of decoded samples to confirm the pipeline round-trips.
dls.show_batch(max_n=2)

In [None]:
class DropOutput(Callback):
    """Callback that keeps only the first element of the model's prediction."""

    def after_pred(self):
        # The model output is indexable; only element 0 is handed on to the
        # loss function (presumably the logits — confirm against the model).
        first_output = self.pred[0]
        self.learn.pred = first_output

In [None]:
# Build the Learner: flattened cross-entropy loss, perplexity as the metric,
# and DropOutput to reduce the model output to a single tensor.
learn = Learner(
    dls,
    model,
    loss_func=CrossEntropyLossFlat(),
    cbs=[DropOutput],
    metrics=Perplexity(),
)

# Switch the learner to mixed-precision training.
learn = learn.to_fp16()

# Run the learning-rate finder.
learn.lr_find()

In [None]:
# Unfreezing is intentionally left disabled:
# learn.unfreeze()

# Number of epochs for the one-cycle training run.
N_EPOCHS = 50
learn.fit_one_cycle(N_EPOCHS)

In [None]:
def generate_sequence(prompt):
    """Generate text from `prompt` using the learner's (fine-tuned) model.

    Args:
        prompt: Text used to seed generation.

    Returns:
        The decoded generated sequence as a string (up to MAX_LENGTH tokens,
        prompt included).
    """
    prompt_ids = tokenizer.encode(prompt)

    # Place the input on whatever device the model currently lives on instead
    # of assuming CUDA, so generation also works on CPU-only machines.
    model_device = next(learn.model.parameters()).device
    inp = tensor(prompt_ids)[None].to(model_device)  # [None] adds the batch dim

    preds = learn.model.generate(
        inp,
        max_length=MAX_LENGTH,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True
    )
    # Decode the first (only) sequence of the batch back to text.
    return tokenizer.decode(preds[0].cpu().numpy())

In [None]:
# Sample from the fine-tuned model with a short Korean prompt.
generate_sequence(prompt='인공지능')