In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM

# Single source of truth for the checkpoint so config, tokenizer and model
# can never drift apart.
MODEL_NAME = 'vinai/bertweet-base'

config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Alternative kept for reference: build the model from `config` alone
# instead of loading the pretrained checkpoint.
# model = AutoModelForMaskedLM.from_config(config)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

In [None]:
from glob import glob
from datasets import Dataset

import re
import pandas as pd

from parse import mask_data_loading

# Fixed seed so the train/test split is identical after a kernel restart.
SEED = 42

data_url = '../crawler/stock/data/**.json'
# glob() gives no ordering guarantee; sort so that [-1] always selects the
# lexicographically last dump, and fail with a clear message when none exist.
data_files = sorted(glob(data_url))
if not data_files:
    raise FileNotFoundError(f'No data file matches {data_url!r}')
url = data_files[-1]
data, symbols = mask_data_loading(url, tokenizer, symbol_mask=True)

# preserve_index=False keeps the pandas index out of the Dataset, so there is
# no '__index_level_0__' column to strip afterwards (and no error when the
# frame happens to carry a default index).
dataset = Dataset.from_pandas(
    data.loc[:, ['labels', 'sentense']],
    preserve_index=False,
)
dataset = dataset.train_test_split(test_size=0.2, seed=SEED)

train_dataset = dataset['train']
test_dataset = dataset['test']
# dataset = dataset.shuffle().select(range(50000))

# Register every symbol as a special token and grow the embedding matrix to
# match the enlarged vocabulary.
# NOTE(review): assumes `symbols` is an iterable of strings (possibly a set);
# sorting makes the assigned token ids stable between runs — confirm that no
# existing checkpoint depends on the previous ordering.
special_tokens_dict = {'additional_special_tokens': sorted(symbols)}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Optional: persist the tokenizer (including the added symbol tokens) to disk.
# tokenizer.save_pretrained('./symbol-vocab')

In [None]:
def encode(example):
    """Tokenize the ``'sentense'`` field of one example.

    Args:
        example: mapping that provides the text under the ``'sentense'`` key.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        ``padding=True`` and ``truncation=True``.
    """
    # Only the text is needed here; the 'labels' field is not read, so
    # examples without it are accepted as well.
    return tokenizer(example['sentense'], padding=True, truncation=True)

context_length = 128


def tokenize(element):
    """Tokenize a batch of sentences into fixed-length model inputs.

    Args:
        element: batch mapping whose ``'sentense'`` entry holds the texts.

    Returns:
        dict with two parallel lists, ``'input_ids'`` and ``'attention_mask'``,
        containing one entry per tokenizer output row whose reported length
        does not exceed ``context_length``.
    """
    outputs = tokenizer(
        element["sentense"],
        padding="max_length",
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
        return_attention_mask=True,
    )
    # Rows are padded to max_length, so the attention mask has to travel with
    # the ids; without it downstream code cannot tell padding from real tokens.
    kept = [
        (input_ids, attention_mask)
        for length, input_ids, attention_mask in zip(
            outputs["length"], outputs["input_ids"], outputs["attention_mask"]
        )
        if length <= context_length
    ]
    return {
        "input_ids": [input_ids for input_ids, _ in kept],
        "attention_mask": [attention_mask for _, attention_mask in kept],
    }

# Tokenize both splits the same way, dropping the original columns so that
# only the tokenizer output remains. The train split is mapped first.
encoded_train_dataset, encoded_test_dataset = (
    split.map(tokenize, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, test_dataset)
)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Masked-language-modeling collator; 0.1 is passed as the masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.1,
)

# Smoke test: collate the first two encoded training rows and print the
# decoded first sequence of the resulting batch.
samples = encoded_train_dataset[:2]
a = data_collator(samples['input_ids'])
print(tokenizer.decode(a['input_ids'][0]))

# Training

In [None]:
import numpy as np
from datasets import load_metric

# Accuracy scorer used by compute_metrics.
# NOTE(review): `load_metric` has been deprecated in newer `datasets` releases
# in favour of the separate `evaluate` package — confirm against the installed
# version before upgrading.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy over the scored (masked) token positions only.

    Args:
        eval_pred: pair ``(logits, labels)``; ``logits`` carries the vocabulary
            scores on its last axis and ``labels`` has the same leading shape,
            with ``-100`` marking positions that must not be scored.

    Returns:
        The result of ``metric.compute`` on the flattened predictions and
        references of the scored positions.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)
    # The language-modeling collator labels every non-masked token with -100;
    # scoring those positions would compare predictions against a sentinel.
    # Boolean indexing also flattens both arrays to the 1-D form the metric
    # expects.
    scored = labels != -100
    return metric.compute(
        predictions=predictions[scored],
        references=labels[scored],
    )

In [None]:
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer
# This is a masked-LM fine-tune of an encoder-only model, so the generic
# Trainer / TrainingArguments are used rather than the Seq2Seq variants,
# which exist for generation-based encoder-decoder training.
# NOTE(review): no evaluation schedule is configured, so `eval_dataset` and
# `compute_metrics` presumably only take effect on an explicit
# `trainer.evaluate()` call — confirm.
training_args = TrainingArguments(
    output_dir="after-bert-random-symbol-trainer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    save_steps=50000,
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
import torch
from transformers import pipeline

# Use the first GPU when one is available and fall back to CPU (-1) otherwise,
# so the cell also runs on machines without CUDA.
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Qualitative check: let the model fill the masked first token.
s = f'{tokenizer.mask_token} to the sky!!!'
fill_mask(s)

In [None]:
import torch
# Release cached GPU memory and print the allocator report. Guarded so the
# cell does not fail on machines without a usable CUDA device.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary())
else:
    print('CUDA is not available; no GPU memory to report.')