In [1]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM

# Single checkpoint id shared by the config and the tokenizer so the two stay in sync.
MODEL_NAME = 'vinai/bertweet-base'

config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): the model is built with `from_config`, not `from_pretrained`.
# Per the transformers docs this instantiates the architecture without loading
# the checkpoint weights — confirm that training from scratch is intended.
model = AutoModelForMaskedLM.from_config(config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from glob import glob
from datasets import Dataset

import re
import pandas as pd

from parse import mask_data_loading

# Glob pattern for the crawled stock data dumps (relative to this notebook).
data_url = '../crawler/stock/data/**.json'

# glob() returns matches in arbitrary, OS-dependent order, so sort before taking
# the last entry; otherwise a different file may be selected from run to run.
# NOTE(review): assumes the lexicographically last file name is the dump to use
# (e.g. timestamped file names) — confirm.
data_files = sorted(glob(data_url))
if not data_files:
    raise FileNotFoundError(f'no data files match {data_url!r}')
url = data_files[-1]
data, symbols = mask_data_loading(url, tokenizer)

# preserve_index=False keeps the pandas index out of the dataset, so there is no
# '__index_level_0__' column to strip afterwards (removing it by name fails when
# the frame has a default RangeIndex and the column was never created).
dataset = Dataset.from_pandas(
    data.loc[:, ['labels', 'sentense']],
    preserve_index=False,
)

# Register every symbol returned by mask_data_loading as an additional special
# token. Sorting makes the assigned token ids reproducible across runs.
# NOTE(review): assumes `symbols` is an iterable of plain strings — confirm.
special_tokens_dict = {'additional_special_tokens': sorted(symbols)}
tokenizer.add_special_tokens(special_tokens_dict)

# Grow the embedding matrix to cover the newly added token ids; as the last
# expression of the cell, the resized embedding module is displayed.
model.resize_token_embeddings(len(tokenizer))

Embedding(64101, 768)

In [3]:
# tokenizer.save_pretrained('./symbol-vocab')

In [4]:
def encode(example):
    """Tokenize the ``sentense`` field of a single example.

    The ``labels`` field is read but not used yet: an earlier experiment that
    derived ``label_ids`` / ``decoder_input_ids`` from the tokenized label and
    set the labels of non-masked positions to -100 is currently disabled.

    Args:
        example: mapping with at least the keys ``'sentense'`` and ``'labels'``.

    Returns:
        The object returned by ``tokenizer(...)`` for the sentence, called with
        ``padding=True`` and ``truncation=True``.
    """
    text, label = example['sentense'], example['labels']
    return tokenizer(text, padding=True, truncation=True)

context_length = 128
def tokenize(element):
    """Tokenize a batch of sentences into fixed-length masked-LM inputs.

    Every sequence is truncated and padded to ``context_length`` tokens. The
    attention mask is returned together with the token ids: if it is dropped,
    the data collator has to rebuild one for already-padded sequences and marks
    every position (padding included) as a real token.

    Args:
        element: batch handed over by ``Dataset.map(..., batched=True)``; only
            its ``"sentense"`` entry is read.

    Returns:
        dict with two parallel lists, ``"input_ids"`` and ``"attention_mask"``,
        holding one entry per kept sequence.
    """
    outputs = tokenizer(
        element["sentense"],
        padding="max_length",
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
        return_attention_mask=True,
    )
    kept_ids = []
    kept_masks = []
    for length, input_ids, attention_mask in zip(
        outputs["length"], outputs["input_ids"], outputs["attention_mask"]
    ):
        # Keep the ids and their mask aligned by filtering both with the same test.
        if length <= context_length:
            kept_ids.append(input_ids)
            kept_masks.append(attention_mask)
    return {"input_ids": kept_ids, "attention_mask": kept_masks}

# Tokenize in batches; the original columns are removed so that only the columns
# returned by `tokenize` remain in the encoded dataset.
encoded_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset.column_names,
)

# print(encoded_dataset[0]['sentense'])
# print(encoded_dataset[0]['input_ids'])
# print(encoded_dataset[0]['labels'])
# print(encoded_dataset[0]['label_ids'])
# print(tokenizer.ids_to_tokens)
# print(tokenizer.decode(encoded_dataset[0]['label_ids']))

100%|██████████| 1/1 [00:00<00:00,  1.57ba/s]


In [5]:
# Row count of the encoded dataset. len(dataset) reports the number of rows
# without first materialising the whole 'input_ids' column as Python lists.
print(len(encoded_dataset))

598


In [9]:
from transformers import DataCollatorForLanguageModeling

# Collator that applies random masking at batch time; mlm_probability=0.1 is the
# per-token masking probability passed to the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.1)

# Sanity check on a couple of examples only. The collator expects a list of
# examples, whereas slicing the dataset (encoded_dataset[:2]) yields a dict of
# columns, so the examples are fetched one by one. Collating just these rows
# avoids pushing the whole dataset through the collator to print a single row.
samples = [encoded_dataset[i] for i in range(min(2, len(encoded_dataset)))]
a = data_collator(samples)
print(tokenizer.decode(a['input_ids'][0]))

<s> <mask> greeeennnn f u c k you Bears apple is <mask> beast </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


# Training

In [7]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy over the positions that carry a real label.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` has the class/vocabulary
            scores on its last axis; ``labels`` has the same shape as ``logits``
            without that last axis.

    Returns:
        The result of ``metric.compute`` for the scored positions.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)
    # The masked-LM collator sets the label of every position it did not mask to
    # -100 (the "ignore" value). Those positions must not count towards accuracy.
    # Boolean indexing also flattens the arrays into the 1-D form the metric expects.
    scored = labels != -100
    return metric.compute(predictions=predictions[scored], references=labels[scored])

In [8]:
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer
# The model is an encoder-only masked LM, so the plain Trainer/TrainingArguments
# are used; the Seq2Seq variants target encoder-decoder models and their
# generation-based evaluation.
training_args = TrainingArguments(
    output_dir="random-masked-trainer",
    per_device_train_batch_size=4,
    num_train_epochs=50,
    # The whole run is 7500 optimisation steps (598 examples, batch size 4,
    # 50 epochs), so the previous save_steps=10000 never wrote a checkpoint.
    save_steps=1000,
    # Keep only the most recent checkpoints to bound disk usage.
    save_total_limit=2,
)

# train_datset = encoded_dataset.shuffle().select(range(4000))

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=encoded_dataset,
    compute_metrics=compute_metrics,
)

train_result = trainer.train()

# Persist the final weights together with the extended tokenizer (the added
# special tokens change the vocabulary size, so both are needed to reload).
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)

train_result

***** Running training *****
  Num examples = 598
  Num Epochs = 50
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 7500
  0%|          | 2/7500 [00:20<21:06:00, 10.13s/it]

KeyboardInterrupt: 

In [None]:
from transformers import pipeline

import torch

# Run the fill-mask pipeline on the first GPU when one is present and fall back
# to the CPU (device=-1) otherwise, instead of failing on a CPU-only host.
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Ask the model to fill in the leading masked token of a short test sentence.
s = f'{tokenizer.mask_token} to the sky!!!'
fill_mask(s)

In [None]:
import torch
# Release cached GPU memory and print the allocator report. Guarded because the
# memory summary requires an initialised CUDA device and fails on a CPU-only host.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary())
else:
    print('CUDA is not available; no GPU memory to report.')