In [2]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, BertweetTokenizer

config = AutoConfig.from_pretrained('vinai/bertweet-base')
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
model = AutoModelForMaskedLM.from_config(config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
from glob import glob
from datasets import Dataset

import re
import pandas as pd

from parse import mask_data_loading

data_url = '../crawler/stock/data/**.json'
url = glob(data_url)[-1]
data, symbols = mask_data_loading(url, tokenizer)

dataset = Dataset.from_pandas(data.loc[:, ['labels', 'sentense']])
dataset = dataset.remove_columns('__index_level_0__')

special_tokens_dict = {'additional_special_tokens': list(symbols)}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(64614, 768)

In [4]:
tokenizer.save_pretrained('./symbol-vocab')

('./symbol-vocab\\tokenizer_config.json',
 './symbol-vocab\\special_tokens_map.json',
 './symbol-vocab\\vocab.txt',
 './symbol-vocab\\bpe.codes',
 './symbol-vocab\\added_tokens.json')

In [4]:
def encode(example):
    sentense = example['sentense']
    label = example['labels']

    label = tokenizer(label, padding='max_length', truncation=True)
    # 101, 51, 1234, 12541, 151
    result = tokenizer(sentense, padding='max_length', truncation=True)
    # 101, 103, 103, 103
    result['label_ids'] = label['input_ids']

    # masked_position = [i for i in range(len(result['input_ids'])) if result['input_ids'][i] == tokenizer.mask_token_id]
    # result['decoder_input_ids'] = label['input_ids']
    # result['labels'] = [-100 for i in label['input_ids']]
    # for i in range(len(result['labels'])):
    #     if not i in masked_position:
    #         result['labels'][i] = -100

    return result

encoded_dataset = dataset.map(encode, batched=True)

# print(encoded_dataset[0]['sentense'])
# print(encoded_dataset[0]['input_ids'])
# print(encoded_dataset[0]['labels'])
# print(encoded_dataset[0]['label_ids'])
# print(tokenizer.ids_to_tokens)
# print(tokenizer.decode(encoded_dataset[0]['label_ids']))

100%|██████████| 16/16 [00:08<00:00,  1.82ba/s]


In [5]:
from transformers import DataCollatorForLanguageModeling

# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# samples = encoded_dataset[:2]
# for chunk in data_collator(samples)["input_ids"]:
#     print(f"\n'>>> {tokenizer.decode(chunk)}'")

# Training

In [6]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [7]:
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer

# training_args = TrainingArguments(
#     output_dir="test_trainer",
#     per_device_train_batch_size=2,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=encoded_dataset,
#     compute_metrics=compute_metrics,
# )

# Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=4,
    num_train_epochs=50,
    save_steps=10000
)

train_datset = encoded_dataset.shuffle().select(range(4000))

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_datset,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: sentense. If sentense are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4000
  Num Epochs = 50
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 50000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malan8365[0m (use `wandb login --relogin` to force relogin)


  1%|          | 501/50000 [01:10<2:32:30,  5.41it/s]

{'loss': 2.2814, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.5}


  2%|▏         | 1001/50000 [02:17<2:29:30,  5.46it/s]

{'loss': 0.1828, 'learning_rate': 4.9e-05, 'epoch': 1.0}


  3%|▎         | 1501/50000 [03:24<2:26:57,  5.50it/s]

{'loss': 0.0865, 'learning_rate': 4.85e-05, 'epoch': 1.5}


  4%|▍         | 2001/50000 [04:31<2:25:08,  5.51it/s]

{'loss': 0.0747, 'learning_rate': 4.8e-05, 'epoch': 2.0}


  5%|▌         | 2501/50000 [05:39<2:25:30,  5.44it/s]

{'loss': 0.0427, 'learning_rate': 4.75e-05, 'epoch': 2.5}


  6%|▌         | 3001/50000 [06:45<2:22:00,  5.52it/s]

{'loss': 0.049, 'learning_rate': 4.7e-05, 'epoch': 3.0}


  7%|▋         | 3501/50000 [07:51<2:16:43,  5.67it/s]

{'loss': 0.0378, 'learning_rate': 4.6500000000000005e-05, 'epoch': 3.5}


  8%|▊         | 4001/50000 [08:55<2:15:40,  5.65it/s]

{'loss': 0.0331, 'learning_rate': 4.600000000000001e-05, 'epoch': 4.0}


  9%|▉         | 4501/50000 [10:00<2:13:30,  5.68it/s]

{'loss': 0.0274, 'learning_rate': 4.55e-05, 'epoch': 4.5}


 10%|█         | 5001/50000 [11:04<2:11:31,  5.70it/s]

{'loss': 0.0356, 'learning_rate': 4.5e-05, 'epoch': 5.0}


 11%|█         | 5501/50000 [12:08<2:12:27,  5.60it/s]

{'loss': 0.0253, 'learning_rate': 4.4500000000000004e-05, 'epoch': 5.5}


 12%|█▏        | 6001/50000 [13:13<2:09:41,  5.65it/s]

{'loss': 0.0316, 'learning_rate': 4.4000000000000006e-05, 'epoch': 6.0}


 13%|█▎        | 6501/50000 [14:17<2:06:15,  5.74it/s]

{'loss': 0.023, 'learning_rate': 4.35e-05, 'epoch': 6.5}


 14%|█▍        | 7001/50000 [15:22<2:06:38,  5.66it/s]

{'loss': 0.0237, 'learning_rate': 4.3e-05, 'epoch': 7.0}


 15%|█▌        | 7501/50000 [16:26<2:03:55,  5.72it/s]

{'loss': 0.0178, 'learning_rate': 4.25e-05, 'epoch': 7.5}


 16%|█▌        | 8001/50000 [17:31<2:02:55,  5.69it/s]

{'loss': 0.0214, 'learning_rate': 4.2e-05, 'epoch': 8.0}


 17%|█▋        | 8501/50000 [18:35<2:01:25,  5.70it/s]

{'loss': 0.0141, 'learning_rate': 4.15e-05, 'epoch': 8.5}


 18%|█▊        | 9001/50000 [19:39<2:02:40,  5.57it/s]

{'loss': 0.0177, 'learning_rate': 4.1e-05, 'epoch': 9.0}


 19%|█▉        | 9501/50000 [20:44<1:58:27,  5.70it/s]

{'loss': 0.0114, 'learning_rate': 4.05e-05, 'epoch': 9.5}


 20%|██        | 10000/50000 [21:48<2:08:57,  5.17it/s]Saving model checkpoint to test_trainer\checkpoint-10000
Configuration saved in test_trainer\checkpoint-10000\config.json


{'loss': 0.0133, 'learning_rate': 4e-05, 'epoch': 10.0}


Model weights saved in test_trainer\checkpoint-10000\pytorch_model.bin
 21%|██        | 10501/50000 [23:00<1:56:02,  5.67it/s] 

{'loss': 0.0104, 'learning_rate': 3.9500000000000005e-05, 'epoch': 10.5}


 22%|██▏       | 11001/50000 [24:04<1:55:01,  5.65it/s]

{'loss': 0.0092, 'learning_rate': 3.9000000000000006e-05, 'epoch': 11.0}


 23%|██▎       | 11501/50000 [25:09<1:53:28,  5.65it/s]

{'loss': 0.0072, 'learning_rate': 3.85e-05, 'epoch': 11.5}


 24%|██▍       | 12001/50000 [26:13<1:51:51,  5.66it/s]

{'loss': 0.0089, 'learning_rate': 3.8e-05, 'epoch': 12.0}


 25%|██▌       | 12501/50000 [27:17<1:48:47,  5.74it/s]

{'loss': 0.0067, 'learning_rate': 3.7500000000000003e-05, 'epoch': 12.5}


 26%|██▌       | 13001/50000 [28:22<1:48:50,  5.67it/s]

{'loss': 0.0065, 'learning_rate': 3.7e-05, 'epoch': 13.0}


 27%|██▋       | 13501/50000 [29:26<1:47:40,  5.65it/s]

{'loss': 0.0055, 'learning_rate': 3.65e-05, 'epoch': 13.5}


 28%|██▊       | 14001/50000 [30:31<1:44:56,  5.72it/s]

{'loss': 0.0058, 'learning_rate': 3.6e-05, 'epoch': 14.0}


 29%|██▉       | 14501/50000 [31:35<1:45:08,  5.63it/s]

{'loss': 0.0047, 'learning_rate': 3.55e-05, 'epoch': 14.5}


 30%|███       | 15001/50000 [32:39<1:44:14,  5.60it/s]

{'loss': 0.0041, 'learning_rate': 3.5e-05, 'epoch': 15.0}


 31%|███       | 15501/50000 [33:44<1:42:08,  5.63it/s]

{'loss': 0.0035, 'learning_rate': 3.45e-05, 'epoch': 15.5}


 32%|███▏      | 16001/50000 [34:48<1:40:28,  5.64it/s]

{'loss': 0.0038, 'learning_rate': 3.4000000000000007e-05, 'epoch': 16.0}


 33%|███▎      | 16501/50000 [35:53<1:39:10,  5.63it/s]

{'loss': 0.0031, 'learning_rate': 3.35e-05, 'epoch': 16.5}


 34%|███▍      | 17001/50000 [36:57<1:37:17,  5.65it/s]

{'loss': 0.0031, 'learning_rate': 3.3e-05, 'epoch': 17.0}


 35%|███▌      | 17501/50000 [38:02<1:34:51,  5.71it/s]

{'loss': 0.0021, 'learning_rate': 3.2500000000000004e-05, 'epoch': 17.5}


 36%|███▌      | 18001/50000 [39:06<1:34:44,  5.63it/s]

{'loss': 0.0031, 'learning_rate': 3.2000000000000005e-05, 'epoch': 18.0}


 37%|███▋      | 18501/50000 [40:10<1:32:20,  5.69it/s]

{'loss': 0.0024, 'learning_rate': 3.15e-05, 'epoch': 18.5}


 38%|███▊      | 19001/50000 [41:15<1:33:53,  5.50it/s]

{'loss': 0.0025, 'learning_rate': 3.1e-05, 'epoch': 19.0}


 39%|███▉      | 19501/50000 [42:19<1:29:10,  5.70it/s]

{'loss': 0.0024, 'learning_rate': 3.05e-05, 'epoch': 19.5}


 40%|████      | 20000/50000 [43:23<1:36:51,  5.16it/s]Saving model checkpoint to test_trainer\checkpoint-20000
Configuration saved in test_trainer\checkpoint-20000\config.json


{'loss': 0.0024, 'learning_rate': 3e-05, 'epoch': 20.0}


Model weights saved in test_trainer\checkpoint-20000\pytorch_model.bin
 41%|████      | 20501/50000 [44:35<1:26:11,  5.70it/s] 

{'loss': 0.0018, 'learning_rate': 2.95e-05, 'epoch': 20.5}


 42%|████▏     | 21001/50000 [45:39<1:24:26,  5.72it/s]

{'loss': 0.0023, 'learning_rate': 2.9e-05, 'epoch': 21.0}


 43%|████▎     | 21501/50000 [46:44<1:23:50,  5.66it/s]

{'loss': 0.0025, 'learning_rate': 2.8499999999999998e-05, 'epoch': 21.5}


 44%|████▍     | 22001/50000 [47:48<1:23:13,  5.61it/s]

{'loss': 0.0019, 'learning_rate': 2.8000000000000003e-05, 'epoch': 22.0}


 45%|████▌     | 22501/50000 [48:52<1:21:59,  5.59it/s]

{'loss': 0.0015, 'learning_rate': 2.7500000000000004e-05, 'epoch': 22.5}


 46%|████▌     | 23001/50000 [49:57<1:20:20,  5.60it/s]

{'loss': 0.002, 'learning_rate': 2.7000000000000002e-05, 'epoch': 23.0}


 47%|████▋     | 23501/50000 [51:01<1:17:42,  5.68it/s]

{'loss': 0.0012, 'learning_rate': 2.6500000000000004e-05, 'epoch': 23.5}


 48%|████▊     | 24001/50000 [52:05<1:16:47,  5.64it/s]

{'loss': 0.0019, 'learning_rate': 2.6000000000000002e-05, 'epoch': 24.0}


 49%|████▉     | 24501/50000 [53:10<1:14:29,  5.71it/s]

{'loss': 0.0013, 'learning_rate': 2.5500000000000003e-05, 'epoch': 24.5}


 50%|█████     | 25001/50000 [54:14<1:13:20,  5.68it/s]

{'loss': 0.0018, 'learning_rate': 2.5e-05, 'epoch': 25.0}


 51%|█████     | 25501/50000 [55:19<1:11:41,  5.70it/s]

{'loss': 0.0016, 'learning_rate': 2.45e-05, 'epoch': 25.5}


 52%|█████▏    | 26001/50000 [56:23<1:12:08,  5.54it/s]

{'loss': 0.0014, 'learning_rate': 2.4e-05, 'epoch': 26.0}


 53%|█████▎    | 26501/50000 [57:28<1:09:41,  5.62it/s]

{'loss': 0.0016, 'learning_rate': 2.35e-05, 'epoch': 26.5}


 54%|█████▍    | 27001/50000 [58:32<1:07:22,  5.69it/s]

{'loss': 0.0011, 'learning_rate': 2.3000000000000003e-05, 'epoch': 27.0}


 55%|█████▌    | 27501/50000 [59:36<1:07:06,  5.59it/s]

{'loss': 0.0011, 'learning_rate': 2.25e-05, 'epoch': 27.5}


 56%|█████▌    | 28001/50000 [1:00:41<1:05:14,  5.62it/s]

{'loss': 0.0015, 'learning_rate': 2.2000000000000003e-05, 'epoch': 28.0}


 57%|█████▋    | 28501/50000 [1:01:45<1:03:16,  5.66it/s]

{'loss': 0.0011, 'learning_rate': 2.15e-05, 'epoch': 28.5}


 58%|█████▊    | 29001/50000 [1:02:50<1:01:50,  5.66it/s]

{'loss': 0.0019, 'learning_rate': 2.1e-05, 'epoch': 29.0}


 59%|█████▉    | 29501/50000 [1:03:54<59:45,  5.72it/s]  

{'loss': 0.0011, 'learning_rate': 2.05e-05, 'epoch': 29.5}


 60%|██████    | 30000/50000 [1:04:58<1:05:08,  5.12it/s]Saving model checkpoint to test_trainer\checkpoint-30000
Configuration saved in test_trainer\checkpoint-30000\config.json


{'loss': 0.0014, 'learning_rate': 2e-05, 'epoch': 30.0}


Model weights saved in test_trainer\checkpoint-30000\pytorch_model.bin
 61%|██████    | 30501/50000 [1:06:11<57:16,  5.67it/s]   

{'loss': 0.0011, 'learning_rate': 1.9500000000000003e-05, 'epoch': 30.5}


 62%|██████▏   | 31001/50000 [1:07:15<55:53,  5.67it/s]  

{'loss': 0.0013, 'learning_rate': 1.9e-05, 'epoch': 31.0}


 63%|██████▎   | 31501/50000 [1:08:20<53:51,  5.73it/s]

{'loss': 0.001, 'learning_rate': 1.85e-05, 'epoch': 31.5}


 64%|██████▍   | 32001/50000 [1:09:24<52:51,  5.67it/s]

{'loss': 0.0013, 'learning_rate': 1.8e-05, 'epoch': 32.0}


 65%|██████▌   | 32501/50000 [1:10:28<51:16,  5.69it/s]

{'loss': 0.0011, 'learning_rate': 1.75e-05, 'epoch': 32.5}


 66%|██████▌   | 33001/50000 [1:11:33<49:51,  5.68it/s]

{'loss': 0.0011, 'learning_rate': 1.7000000000000003e-05, 'epoch': 33.0}


 67%|██████▋   | 33501/50000 [1:12:37<48:54,  5.62it/s]

{'loss': 0.0009, 'learning_rate': 1.65e-05, 'epoch': 33.5}


 68%|██████▊   | 34001/50000 [1:13:42<47:15,  5.64it/s]

{'loss': 0.0012, 'learning_rate': 1.6000000000000003e-05, 'epoch': 34.0}


 69%|██████▉   | 34501/50000 [1:14:46<45:26,  5.68it/s]

{'loss': 0.001, 'learning_rate': 1.55e-05, 'epoch': 34.5}


 70%|███████   | 35001/50000 [1:15:50<43:50,  5.70it/s]

{'loss': 0.0012, 'learning_rate': 1.5e-05, 'epoch': 35.0}


 71%|███████   | 35501/50000 [1:16:55<42:49,  5.64it/s]

{'loss': 0.0012, 'learning_rate': 1.45e-05, 'epoch': 35.5}


 72%|███████▏  | 36001/50000 [1:17:59<41:31,  5.62it/s]

{'loss': 0.0009, 'learning_rate': 1.4000000000000001e-05, 'epoch': 36.0}


 73%|███████▎  | 36501/50000 [1:19:03<40:10,  5.60it/s]

{'loss': 0.001, 'learning_rate': 1.3500000000000001e-05, 'epoch': 36.5}


 74%|███████▍  | 37001/50000 [1:20:08<38:32,  5.62it/s]

{'loss': 0.001, 'learning_rate': 1.3000000000000001e-05, 'epoch': 37.0}


 75%|███████▌  | 37501/50000 [1:21:12<36:51,  5.65it/s]

{'loss': 0.0008, 'learning_rate': 1.25e-05, 'epoch': 37.5}


 76%|███████▌  | 38001/50000 [1:22:16<35:09,  5.69it/s]

{'loss': 0.0012, 'learning_rate': 1.2e-05, 'epoch': 38.0}


 77%|███████▋  | 38501/50000 [1:23:21<33:31,  5.72it/s]

{'loss': 0.0012, 'learning_rate': 1.1500000000000002e-05, 'epoch': 38.5}


 78%|███████▊  | 39001/50000 [1:24:25<32:13,  5.69it/s]

{'loss': 0.0008, 'learning_rate': 1.1000000000000001e-05, 'epoch': 39.0}


 79%|███████▉  | 39501/50000 [1:25:30<30:57,  5.65it/s]

{'loss': 0.0009, 'learning_rate': 1.05e-05, 'epoch': 39.5}


 80%|████████  | 40000/50000 [1:26:34<32:24,  5.14it/s]Saving model checkpoint to test_trainer\checkpoint-40000
Configuration saved in test_trainer\checkpoint-40000\config.json


{'loss': 0.0009, 'learning_rate': 1e-05, 'epoch': 40.0}


Model weights saved in test_trainer\checkpoint-40000\pytorch_model.bin
 81%|████████  | 40501/50000 [1:27:46<28:01,  5.65it/s]  

{'loss': 0.0008, 'learning_rate': 9.5e-06, 'epoch': 40.5}


 82%|████████▏ | 41001/50000 [1:28:50<26:50,  5.59it/s]

{'loss': 0.001, 'learning_rate': 9e-06, 'epoch': 41.0}


 83%|████████▎ | 41501/50000 [1:29:55<24:46,  5.72it/s]

{'loss': 0.001, 'learning_rate': 8.500000000000002e-06, 'epoch': 41.5}


 84%|████████▍ | 42001/50000 [1:30:59<23:37,  5.64it/s]

{'loss': 0.0007, 'learning_rate': 8.000000000000001e-06, 'epoch': 42.0}


 85%|████████▌ | 42501/50000 [1:32:04<22:01,  5.67it/s]

{'loss': 0.0009, 'learning_rate': 7.5e-06, 'epoch': 42.5}


 86%|████████▌ | 43001/50000 [1:33:08<20:32,  5.68it/s]

{'loss': 0.0009, 'learning_rate': 7.000000000000001e-06, 'epoch': 43.0}


 87%|████████▋ | 43501/50000 [1:34:12<19:00,  5.70it/s]

{'loss': 0.0009, 'learning_rate': 6.5000000000000004e-06, 'epoch': 43.5}


 88%|████████▊ | 44001/50000 [1:35:17<17:45,  5.63it/s]

{'loss': 0.0008, 'learning_rate': 6e-06, 'epoch': 44.0}


 89%|████████▉ | 44501/50000 [1:36:21<16:14,  5.64it/s]

{'loss': 0.0007, 'learning_rate': 5.500000000000001e-06, 'epoch': 44.5}


 90%|█████████ | 45001/50000 [1:37:25<14:40,  5.68it/s]

{'loss': 0.0009, 'learning_rate': 5e-06, 'epoch': 45.0}


 91%|█████████ | 45501/50000 [1:38:30<13:17,  5.64it/s]

{'loss': 0.0007, 'learning_rate': 4.5e-06, 'epoch': 45.5}


 92%|█████████▏| 46001/50000 [1:39:34<11:46,  5.66it/s]

{'loss': 0.0009, 'learning_rate': 4.000000000000001e-06, 'epoch': 46.0}


 93%|█████████▎| 46501/50000 [1:40:39<10:14,  5.69it/s]

{'loss': 0.0009, 'learning_rate': 3.5000000000000004e-06, 'epoch': 46.5}


 94%|█████████▍| 47001/50000 [1:41:43<09:00,  5.55it/s]

{'loss': 0.0007, 'learning_rate': 3e-06, 'epoch': 47.0}


 95%|█████████▌| 47501/50000 [1:42:48<07:22,  5.65it/s]

{'loss': 0.0008, 'learning_rate': 2.5e-06, 'epoch': 47.5}


 96%|█████████▌| 48001/50000 [1:43:52<05:51,  5.69it/s]

{'loss': 0.0008, 'learning_rate': 2.0000000000000003e-06, 'epoch': 48.0}


 97%|█████████▋| 48501/50000 [1:44:57<04:23,  5.69it/s]

{'loss': 0.0007, 'learning_rate': 1.5e-06, 'epoch': 48.5}


 98%|█████████▊| 49001/50000 [1:46:01<02:56,  5.67it/s]

{'loss': 0.0009, 'learning_rate': 1.0000000000000002e-06, 'epoch': 49.0}


 99%|█████████▉| 49501/50000 [1:47:06<01:35,  5.24it/s]

{'loss': 0.0007, 'learning_rate': 5.000000000000001e-07, 'epoch': 49.5}


100%|██████████| 50000/50000 [1:48:14<00:00,  4.92it/s]Saving model checkpoint to test_trainer\checkpoint-50000
Configuration saved in test_trainer\checkpoint-50000\config.json


{'loss': 0.0009, 'learning_rate': 0.0, 'epoch': 50.0}


Model weights saved in test_trainer\checkpoint-50000\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 50000/50000 [1:48:24<00:00,  7.69it/s]

{'train_runtime': 6508.0232, 'train_samples_per_second': 30.731, 'train_steps_per_second': 7.683, 'train_loss': 0.03217216126203537, 'epoch': 50.0}





TrainOutput(global_step=50000, training_loss=0.03217216126203537, metrics={'train_runtime': 6508.0232, 'train_samples_per_second': 30.731, 'train_steps_per_second': 7.683, 'train_loss': 0.03217216126203537, 'epoch': 50.0})

In [9]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    device=0
)

s = f'{tokenizer.mask_token} to the sky!!!'
fill_mask(s)

[{'score': 0.999998927116394,
  'token': 64110,
  'token_str': '$ T S L A',
  'sequence': 'to the sky!!!'},
 {'score': 5.828806237673234e-08,
  'token': 64082,
  'token_str': '$ M S F T',
  'sequence': 'to the sky!!!'},
 {'score': 3.953035232484581e-08,
  'token': 64174,
  'token_str': '$ A A P L',
  'sequence': 'to the sky!!!'},
 {'score': 3.7064779689899296e-08,
  'token': 3,
  'token_str': '< u n k >',
  'sequence': 'to the sky!!!'},
 {'score': 3.131322046101559e-08,
  'token': 64560,
  'token_str': '$ P L T R',
  'sequence': 'to the sky!!!'}]

In [None]:
import torch
torch.cuda.empty_cache()
print(torch.cuda.memory_summary())