In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, BertweetTokenizer

# Hub checkpoint that supplies both the tokenizer and the model configuration.
checkpoint = 'vinai/bertweet-base'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
config = AutoConfig.from_pretrained(checkpoint)

# NOTE(review): the model is built from the config only (from_config), not
# from_pretrained -- presumably this yields freshly initialised weights;
# confirm that training from scratch is intended.
model = AutoModelForMaskedLM.from_config(config)

In [None]:
from glob import glob
from datasets import Dataset

import re
import pandas as pd

from parse import mask_data_loading

# Locate the crawler output to train on.
# Note: without recursive=True, '**' in a glob pattern behaves like '*'.
data_url = '../crawler/stock/data/**.json'

# glob() returns matches in arbitrary order, so sort before taking the last
# entry; otherwise the selected file can change between runs/machines.
# NOTE(review): presumably file names sort chronologically -- confirm.
data_files = sorted(glob(data_url))
if not data_files:
    raise FileNotFoundError(f'No data file matches {data_url!r}')
url = data_files[-1]

data, symbols = mask_data_loading(url, tokenizer)

# preserve_index=False keeps the pandas index out of the dataset, so there is
# no '__index_level_0__' column to remove afterwards (removing it raised when
# the frame had a default RangeIndex and the column was never created).
dataset = Dataset.from_pandas(
    data.loc[:, ['labels', 'sentense']],
    preserve_index=False,
)

# Register every symbol as an additional special token and grow the model's
# embedding matrix to the new vocabulary size.
# NOTE(review): if `symbols` is a set, list(symbols) has no stable order, so
# token ids may differ between runs -- verify against mask_data_loading.
special_tokens_dict = {'additional_special_tokens': list(symbols)}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Write the tokenizer files to a local directory so they can be reloaded later.
vocab_dir = './symbol-vocab'
tokenizer.save_pretrained(vocab_dir)

In [None]:
def encode(example):
    """Tokenize masked sentences and their target texts for masked-LM training.

    Args:
        example: Mapping with a 'sentense' entry (model input text) and a
            'labels' entry (target text). Each entry may be a single string
            or, when used with ``Dataset.map(..., batched=True)``, a list of
            strings.

    Returns:
        The tokenizer output for 'sentense' with an extra 'label_ids' entry
        holding the token ids of 'labels'. Padding positions in 'label_ids'
        are set to -100 so they do not contribute to the loss.
    """
    ignore_index = -100  # label value ignored by the cross-entropy loss

    # Target side: the text the model should reproduce. The attention mask is
    # requested explicitly because it is used below to locate padding.
    target = tokenizer(
        example['labels'],
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    # Input side: the sentence as it is fed to the model.
    result = tokenizer(example['sentense'], padding='max_length', truncation=True)

    def mask_padding(ids, attention):
        # Keep real tokens, replace padding (attention == 0) with ignore_index.
        return [tok if keep else ignore_index for tok, keep in zip(ids, attention)]

    ids = target['input_ids']
    attention = target['attention_mask']
    if ids and isinstance(ids[0], (list, tuple)):
        # Batched call: one id list per example.
        result['label_ids'] = [mask_padding(i, a) for i, a in zip(ids, attention)]
    else:
        # Single example: a flat list of ids.
        result['label_ids'] = mask_padding(ids, attention)

    # Every non-padding token contributes to the loss; restricting the loss to
    # the masked positions only was an earlier experiment and is not done here.
    return result

# Tokenize the whole dataset; with batched=True `encode` is handed a batch of
# rows at a time rather than one row.
encoded_dataset = dataset.map(function=encode, batched=True)

# Debug helpers -- uncomment to inspect the first encoded example:
# print(encoded_dataset[0]['sentense'])
# print(encoded_dataset[0]['input_ids'])
# print(encoded_dataset[0]['labels'])
# print(encoded_dataset[0]['label_ids'])
# print(tokenizer.ids_to_tokens)
# print(tokenizer.decode(encoded_dataset[0]['label_ids']))

# Training

In [None]:
import numpy as np
from datasets import load_metric

# Accuracy metric object used by compute_metrics.
metric = load_metric(path="accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy over all scored label positions.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` has the class/vocab
            dimension last; ``labels`` has the same shape as ``logits``
            without that last dimension. ``logits`` may also be a tuple whose
            first element is the logits array.

    Returns:
        Whatever ``metric.compute`` returns for the flattened predictions and
        references.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]

    # The metric is fed flat sequences, so collapse (batch, seq) into 1-D.
    predictions = np.argmax(logits, axis=-1).reshape(-1)
    references = np.asarray(labels).reshape(-1)

    # -100 marks positions excluded from the loss; exclude them here as well.
    scored = references != -100
    return metric.compute(
        predictions=predictions[scored],
        references=references[scored],
    )

In [None]:
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Alternative kept for reference (plain Trainer, batch size 2):
# training_args = TrainingArguments(
#     output_dir="test_trainer",
#     per_device_train_batch_size=2,
# )
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=encoded_dataset,
#     compute_metrics=compute_metrics,
# )

# Run configuration for the Seq2SeqTrainer: 50 epochs, 4 examples per device,
# save_steps=10000, outputs under "test_trainer".
training_args = Seq2SeqTrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=50,
    per_device_train_batch_size=4,
    save_steps=10000,
)

# Optional: train on a shuffled 4000-example subset instead of the full set.
# train_datset = encoded_dataset.shuffle().select(range(4000))

trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": encoded_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Seq2SeqTrainer(**trainer_kwargs)

trainer.train()

In [None]:
from transformers import pipeline

import torch

# Put the model in inference mode before querying it.
# NOTE(review): after trainer.train() the model is presumably still in
# training mode (dropout active), which would make predictions noisy.
model.eval()

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    # Use GPU 0 when CUDA is available, otherwise fall back to CPU (-1)
    # instead of failing on machines without a GPU.
    device=0 if torch.cuda.is_available() else -1,
)

# Ask the model to fill in the masked first token of a sample sentence.
s = f'{tokenizer.mask_token} to the sky!!!'
fill_mask(s)

In [None]:
import torch
# Empty the CUDA cache, then print the CUDA memory summary.
torch.cuda.empty_cache()
memory_report = torch.cuda.memory_summary()
print(memory_report)