In [1]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, BertweetTokenizer

config = AutoConfig.from_pretrained('vinai/bertweet-base')
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
# model = AutoModelForMaskedLM.from_config(config)
model = AutoModelForMaskedLM.from_pretrained('vinai/bertweet-base')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from glob import glob
from datasets import Dataset

import re
import pandas as pd

from parse import mask_data_loading

data_url = '../crawler/stock/data/**.json'
url = glob(data_url)[-1]
data, symbols = mask_data_loading(url, tokenizer)

dataset = Dataset.from_pandas(data.loc[:, ['labels', 'sentense']])
dataset = dataset.remove_columns('__index_level_0__')
dataset = dataset.train_test_split(test_size=0.2)

train_dataset = dataset['train']
test_dataset = dataset['test']

train_dataset = train_dataset.shuffle().select(range(50000))

special_tokens_dict = {'additional_special_tokens': list(symbols)}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(65862, 768)

In [3]:
tokenizer.save_pretrained('./symbol-vocab')

('./symbol-vocab\\tokenizer_config.json',
 './symbol-vocab\\special_tokens_map.json',
 './symbol-vocab\\vocab.txt',
 './symbol-vocab\\bpe.codes',
 './symbol-vocab\\added_tokens.json')

In [4]:
def encode(example):
    sentense = example['sentense']
    label = example['labels']

    label = tokenizer(label, padding='max_length', truncation=True)
    # 101, 51, 1234, 12541, 151
    result = tokenizer(sentense, padding='max_length', truncation=True)
    # 101, 103, 103, 103
    result['label_ids'] = label['input_ids']

    # masked_position = [i for i in range(len(result['input_ids'])) if result['input_ids'][i] == tokenizer.mask_token_id]
    # result['decoder_input_ids'] = label['input_ids']
    # result['labels'] = [-100 for i in label['input_ids']]
    # for i in range(len(result['labels'])):
    #     if not i in masked_position:
    #         result['labels'][i] = -100

    return result

encoded_train_dataset = train_dataset.map(encode, batched=True)
encoded_test_dataset = test_dataset.map(encode, batched=True)

# print(encoded_dataset[0]['sentense'])
# print(encoded_dataset[0]['input_ids'])
# print(encoded_dataset[0]['labels'])
# print(encoded_dataset[0]['label_ids'])
# print(tokenizer.ids_to_tokens)
# print(tokenizer.decode(encoded_dataset[0]['label_ids']))

100%|██████████| 50/50 [00:55<00:00,  1.11s/ba]
100%|██████████| 28/28 [00:29<00:00,  1.06s/ba]


# Training

In [5]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [6]:
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    output_dir="after-bert-trainer",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    save_steps=50000
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: sentense. If sentense are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 50000
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 62500
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malan8365[0m (use `wandb login --relogin` to force relogin)


  1%|          | 501/62500 [01:06<3:07:59,  5.50it/s]

{'loss': 0.3378, 'learning_rate': 4.96e-05, 'epoch': 0.04}


  2%|▏         | 1001/62500 [02:11<3:13:03,  5.31it/s]

{'loss': 0.0689, 'learning_rate': 4.92e-05, 'epoch': 0.08}


  2%|▏         | 1501/62500 [03:16<3:05:27,  5.48it/s]

{'loss': 0.5251, 'learning_rate': 4.88e-05, 'epoch': 0.12}


  3%|▎         | 2001/62500 [04:21<3:15:56,  5.15it/s]

{'loss': 0.2287, 'learning_rate': 4.8400000000000004e-05, 'epoch': 0.16}


  4%|▍         | 2501/62500 [05:25<3:01:33,  5.51it/s]

{'loss': 0.6331, 'learning_rate': 4.8e-05, 'epoch': 0.2}


  5%|▍         | 3001/62500 [06:30<2:59:42,  5.52it/s]

{'loss': 0.2761, 'learning_rate': 4.76e-05, 'epoch': 0.24}


  6%|▌         | 3501/62500 [07:34<3:00:02,  5.46it/s]

{'loss': 0.2942, 'learning_rate': 4.72e-05, 'epoch': 0.28}


  6%|▋         | 4001/62500 [08:39<2:56:33,  5.52it/s]

{'loss': 0.1822, 'learning_rate': 4.6800000000000006e-05, 'epoch': 0.32}


  7%|▋         | 4501/62500 [09:43<2:56:00,  5.49it/s]

{'loss': 0.2308, 'learning_rate': 4.64e-05, 'epoch': 0.36}


  8%|▊         | 5001/62500 [10:48<2:55:12,  5.47it/s]

{'loss': 0.1576, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.4}


  9%|▉         | 5501/62500 [11:52<2:54:45,  5.44it/s]

{'loss': 0.1349, 'learning_rate': 4.5600000000000004e-05, 'epoch': 0.44}


 10%|▉         | 6001/62500 [12:57<2:51:32,  5.49it/s]

{'loss': 0.1106, 'learning_rate': 4.52e-05, 'epoch': 0.48}


 10%|█         | 6501/62500 [14:02<2:49:44,  5.50it/s]

{'loss': 0.1056, 'learning_rate': 4.4800000000000005e-05, 'epoch': 0.52}


 11%|█         | 7001/62500 [15:06<2:48:20,  5.49it/s]

{'loss': 0.1296, 'learning_rate': 4.44e-05, 'epoch': 0.56}


 12%|█▏        | 7501/62500 [16:11<2:46:05,  5.52it/s]

{'loss': 0.2057, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.6}


 13%|█▎        | 8001/62500 [17:15<2:45:00,  5.50it/s]

{'loss': 0.0865, 'learning_rate': 4.36e-05, 'epoch': 0.64}


 14%|█▎        | 8501/62500 [18:19<2:47:35,  5.37it/s]

{'loss': 0.0902, 'learning_rate': 4.32e-05, 'epoch': 0.68}


 14%|█▍        | 9001/62500 [19:24<2:41:27,  5.52it/s]

{'loss': 0.0972, 'learning_rate': 4.2800000000000004e-05, 'epoch': 0.72}


 15%|█▌        | 9501/62500 [20:28<2:40:58,  5.49it/s]

{'loss': 0.0684, 'learning_rate': 4.24e-05, 'epoch': 0.76}


 16%|█▌        | 10001/62500 [21:33<2:39:23,  5.49it/s]

{'loss': 0.0745, 'learning_rate': 4.2e-05, 'epoch': 0.8}


 17%|█▋        | 10501/62500 [22:37<2:38:01,  5.48it/s]

{'loss': 0.0759, 'learning_rate': 4.16e-05, 'epoch': 0.84}


 18%|█▊        | 11001/62500 [23:41<2:38:19,  5.42it/s]

{'loss': 0.0915, 'learning_rate': 4.12e-05, 'epoch': 0.88}


 18%|█▊        | 11501/62500 [24:46<2:35:38,  5.46it/s]

{'loss': 0.0943, 'learning_rate': 4.08e-05, 'epoch': 0.92}


 19%|█▉        | 12001/62500 [25:50<2:33:31,  5.48it/s]

{'loss': 0.0998, 'learning_rate': 4.0400000000000006e-05, 'epoch': 0.96}


 20%|██        | 12501/62500 [26:55<2:33:48,  5.42it/s]

{'loss': 0.0994, 'learning_rate': 4e-05, 'epoch': 1.0}


 21%|██        | 13001/62500 [27:59<2:29:39,  5.51it/s]

{'loss': 0.1065, 'learning_rate': 3.960000000000001e-05, 'epoch': 1.04}


 22%|██▏       | 13501/62500 [29:04<2:28:45,  5.49it/s]

{'loss': 0.0833, 'learning_rate': 3.9200000000000004e-05, 'epoch': 1.08}


 22%|██▏       | 14001/62500 [30:08<2:26:30,  5.52it/s]

{'loss': 0.0867, 'learning_rate': 3.88e-05, 'epoch': 1.12}


 23%|██▎       | 14501/62500 [31:12<2:24:43,  5.53it/s]

{'loss': 0.1039, 'learning_rate': 3.8400000000000005e-05, 'epoch': 1.16}


 24%|██▍       | 15001/62500 [32:17<2:23:35,  5.51it/s]

{'loss': 0.1218, 'learning_rate': 3.8e-05, 'epoch': 1.2}


 25%|██▍       | 15501/62500 [33:21<2:22:24,  5.50it/s]

{'loss': 0.0917, 'learning_rate': 3.76e-05, 'epoch': 1.24}


 26%|██▌       | 16001/62500 [34:26<2:20:40,  5.51it/s]

{'loss': 0.1404, 'learning_rate': 3.72e-05, 'epoch': 1.28}


 26%|██▋       | 16501/62500 [35:30<2:20:36,  5.45it/s]

{'loss': 0.0917, 'learning_rate': 3.68e-05, 'epoch': 1.32}


 27%|██▋       | 17001/62500 [36:35<2:18:11,  5.49it/s]

{'loss': 0.095, 'learning_rate': 3.6400000000000004e-05, 'epoch': 1.36}


 28%|██▊       | 17501/62500 [37:39<2:16:40,  5.49it/s]

{'loss': 0.1028, 'learning_rate': 3.6e-05, 'epoch': 1.4}


 29%|██▉       | 18001/62500 [38:44<2:15:01,  5.49it/s]

{'loss': 0.0507, 'learning_rate': 3.56e-05, 'epoch': 1.44}


 30%|██▉       | 18501/62500 [39:48<2:13:45,  5.48it/s]

{'loss': 0.0433, 'learning_rate': 3.52e-05, 'epoch': 1.48}


 30%|███       | 19001/62500 [40:52<2:11:56,  5.49it/s]

{'loss': 0.0347, 'learning_rate': 3.48e-05, 'epoch': 1.52}


 31%|███       | 19501/62500 [41:57<2:10:55,  5.47it/s]

{'loss': 0.0402, 'learning_rate': 3.4399999999999996e-05, 'epoch': 1.56}


 32%|███▏      | 20001/62500 [43:01<2:12:43,  5.34it/s]

{'loss': 0.0363, 'learning_rate': 3.4000000000000007e-05, 'epoch': 1.6}


 33%|███▎      | 20501/62500 [44:06<2:07:04,  5.51it/s]

{'loss': 0.0422, 'learning_rate': 3.3600000000000004e-05, 'epoch': 1.64}


 34%|███▎      | 21001/62500 [45:10<2:06:47,  5.45it/s]

{'loss': 0.0541, 'learning_rate': 3.32e-05, 'epoch': 1.68}


 34%|███▍      | 21501/62500 [46:15<2:05:38,  5.44it/s]

{'loss': 0.0965, 'learning_rate': 3.2800000000000004e-05, 'epoch': 1.72}


 35%|███▌      | 22001/62500 [47:19<2:03:55,  5.45it/s]

{'loss': 0.1172, 'learning_rate': 3.24e-05, 'epoch': 1.76}


 36%|███▌      | 22501/62500 [48:24<2:02:21,  5.45it/s]

{'loss': 0.0976, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.8}


 37%|███▋      | 23001/62500 [49:28<2:00:22,  5.47it/s]

{'loss': 0.0826, 'learning_rate': 3.16e-05, 'epoch': 1.84}


 38%|███▊      | 23501/62500 [50:32<1:57:59,  5.51it/s]

{'loss': 0.0703, 'learning_rate': 3.12e-05, 'epoch': 1.88}


 38%|███▊      | 24001/62500 [51:37<1:56:51,  5.49it/s]

{'loss': 0.0724, 'learning_rate': 3.08e-05, 'epoch': 1.92}


 39%|███▉      | 24501/62500 [52:41<1:55:02,  5.50it/s]

{'loss': 0.0696, 'learning_rate': 3.04e-05, 'epoch': 1.96}


 40%|████      | 25001/62500 [53:46<1:55:24,  5.42it/s]

{'loss': 0.0709, 'learning_rate': 3e-05, 'epoch': 2.0}


 41%|████      | 25501/62500 [54:50<1:52:12,  5.50it/s]

{'loss': 0.0482, 'learning_rate': 2.96e-05, 'epoch': 2.04}


 42%|████▏     | 26001/62500 [55:55<1:50:27,  5.51it/s]

{'loss': 0.0366, 'learning_rate': 2.9199999999999998e-05, 'epoch': 2.08}


 42%|████▏     | 26501/62500 [56:59<1:49:22,  5.49it/s]

{'loss': 0.0451, 'learning_rate': 2.88e-05, 'epoch': 2.12}


 43%|████▎     | 27001/62500 [58:04<1:47:21,  5.51it/s]

{'loss': 0.0353, 'learning_rate': 2.84e-05, 'epoch': 2.16}


 44%|████▍     | 27501/62500 [59:08<1:47:44,  5.41it/s]

{'loss': 0.0343, 'learning_rate': 2.8000000000000003e-05, 'epoch': 2.2}


 45%|████▍     | 28001/62500 [1:00:12<1:44:22,  5.51it/s]

{'loss': 0.0308, 'learning_rate': 2.7600000000000003e-05, 'epoch': 2.24}


 46%|████▌     | 28501/62500 [1:01:17<1:42:55,  5.51it/s]

{'loss': 0.0259, 'learning_rate': 2.7200000000000004e-05, 'epoch': 2.28}


 46%|████▋     | 29001/62500 [1:02:21<1:42:11,  5.46it/s]

{'loss': 0.0219, 'learning_rate': 2.6800000000000004e-05, 'epoch': 2.32}


 47%|████▋     | 29501/62500 [1:03:26<1:40:08,  5.49it/s]

{'loss': 0.0207, 'learning_rate': 2.64e-05, 'epoch': 2.36}


 48%|████▊     | 30001/62500 [1:04:30<1:38:49,  5.48it/s]

{'loss': 0.023, 'learning_rate': 2.6000000000000002e-05, 'epoch': 2.4}


 49%|████▉     | 30501/62500 [1:05:35<1:37:11,  5.49it/s]

{'loss': 0.0265, 'learning_rate': 2.5600000000000002e-05, 'epoch': 2.44}


 50%|████▉     | 31001/62500 [1:06:39<1:35:33,  5.49it/s]

{'loss': 0.0281, 'learning_rate': 2.5200000000000003e-05, 'epoch': 2.48}


 50%|█████     | 31501/62500 [1:07:44<1:34:27,  5.47it/s]

{'loss': 0.0278, 'learning_rate': 2.48e-05, 'epoch': 2.52}


 51%|█████     | 32001/62500 [1:08:48<1:32:59,  5.47it/s]

{'loss': 0.0207, 'learning_rate': 2.44e-05, 'epoch': 2.56}


 52%|█████▏    | 32501/62500 [1:09:53<1:31:04,  5.49it/s]

{'loss': 0.0279, 'learning_rate': 2.4e-05, 'epoch': 2.6}


 53%|█████▎    | 33001/62500 [1:10:57<1:29:48,  5.47it/s]

{'loss': 0.0272, 'learning_rate': 2.36e-05, 'epoch': 2.64}


 54%|█████▎    | 33501/62500 [1:12:01<1:28:02,  5.49it/s]

{'loss': 0.0272, 'learning_rate': 2.32e-05, 'epoch': 2.68}


 54%|█████▍    | 34001/62500 [1:13:06<1:26:09,  5.51it/s]

{'loss': 0.0359, 'learning_rate': 2.2800000000000002e-05, 'epoch': 2.72}


 55%|█████▌    | 34501/62500 [1:14:10<1:24:29,  5.52it/s]

{'loss': 0.0251, 'learning_rate': 2.2400000000000002e-05, 'epoch': 2.76}


 56%|█████▌    | 35001/62500 [1:15:15<1:23:51,  5.47it/s]

{'loss': 0.0232, 'learning_rate': 2.2000000000000003e-05, 'epoch': 2.8}


 57%|█████▋    | 35501/62500 [1:16:19<1:22:10,  5.48it/s]

{'loss': 0.0213, 'learning_rate': 2.16e-05, 'epoch': 2.84}


 58%|█████▊    | 36001/62500 [1:17:24<1:20:34,  5.48it/s]

{'loss': 0.0199, 'learning_rate': 2.12e-05, 'epoch': 2.88}


 58%|█████▊    | 36501/62500 [1:18:28<1:19:36,  5.44it/s]

{'loss': 0.048, 'learning_rate': 2.08e-05, 'epoch': 2.92}


 59%|█████▉    | 37001/62500 [1:19:33<1:17:15,  5.50it/s]

{'loss': 0.0301, 'learning_rate': 2.04e-05, 'epoch': 2.96}


 60%|██████    | 37501/62500 [1:20:37<1:17:25,  5.38it/s]

{'loss': 0.0271, 'learning_rate': 2e-05, 'epoch': 3.0}


 61%|██████    | 38001/62500 [1:21:42<1:13:54,  5.52it/s]

{'loss': 0.0232, 'learning_rate': 1.9600000000000002e-05, 'epoch': 3.04}


 62%|██████▏   | 38501/62500 [1:22:46<1:12:59,  5.48it/s]

{'loss': 0.0356, 'learning_rate': 1.9200000000000003e-05, 'epoch': 3.08}


 62%|██████▏   | 39001/62500 [1:23:51<1:11:21,  5.49it/s]

{'loss': 0.0388, 'learning_rate': 1.88e-05, 'epoch': 3.12}


 63%|██████▎   | 39501/62500 [1:24:55<1:10:28,  5.44it/s]

{'loss': 0.0337, 'learning_rate': 1.84e-05, 'epoch': 3.16}


 64%|██████▍   | 40001/62500 [1:25:59<1:08:09,  5.50it/s]

{'loss': 0.0407, 'learning_rate': 1.8e-05, 'epoch': 3.2}


 65%|██████▍   | 40501/62500 [1:27:04<1:06:25,  5.52it/s]

{'loss': 0.0317, 'learning_rate': 1.76e-05, 'epoch': 3.24}


 66%|██████▌   | 41001/62500 [1:28:08<1:05:28,  5.47it/s]

{'loss': 0.0388, 'learning_rate': 1.7199999999999998e-05, 'epoch': 3.28}


 66%|██████▋   | 41501/62500 [1:29:13<1:04:01,  5.47it/s]

{'loss': 0.0299, 'learning_rate': 1.6800000000000002e-05, 'epoch': 3.32}


 67%|██████▋   | 42001/62500 [1:30:17<1:03:04,  5.42it/s]

{'loss': 0.028, 'learning_rate': 1.6400000000000002e-05, 'epoch': 3.36}


 68%|██████▊   | 42501/62500 [1:31:22<1:00:29,  5.51it/s]

{'loss': 0.0234, 'learning_rate': 1.6000000000000003e-05, 'epoch': 3.4}


 69%|██████▉   | 43001/62500 [1:32:26<59:12,  5.49it/s]  

{'loss': 0.021, 'learning_rate': 1.56e-05, 'epoch': 3.44}


 70%|██████▉   | 43501/62500 [1:33:31<57:22,  5.52it/s]  

{'loss': 0.0261, 'learning_rate': 1.52e-05, 'epoch': 3.48}


 70%|███████   | 44001/62500 [1:34:35<56:06,  5.50it/s]  

{'loss': 0.0247, 'learning_rate': 1.48e-05, 'epoch': 3.52}


 71%|███████   | 44501/62500 [1:35:40<54:55,  5.46it/s]  

{'loss': 0.0265, 'learning_rate': 1.44e-05, 'epoch': 3.56}


 72%|███████▏  | 45001/62500 [1:36:44<53:01,  5.50it/s]

{'loss': 0.0267, 'learning_rate': 1.4000000000000001e-05, 'epoch': 3.6}


 73%|███████▎  | 45501/62500 [1:37:49<51:39,  5.49it/s]

{'loss': 0.0203, 'learning_rate': 1.3600000000000002e-05, 'epoch': 3.64}


 74%|███████▎  | 46001/62500 [1:38:54<50:03,  5.49it/s]

{'loss': 0.0199, 'learning_rate': 1.32e-05, 'epoch': 3.68}


 74%|███████▍  | 46501/62500 [1:39:58<49:13,  5.42it/s]

{'loss': 0.0179, 'learning_rate': 1.2800000000000001e-05, 'epoch': 3.72}


 75%|███████▌  | 47001/62500 [1:41:03<46:56,  5.50it/s]

{'loss': 0.0175, 'learning_rate': 1.24e-05, 'epoch': 3.76}


 76%|███████▌  | 47501/62500 [1:42:07<45:26,  5.50it/s]

{'loss': 0.022, 'learning_rate': 1.2e-05, 'epoch': 3.8}


 77%|███████▋  | 48001/62500 [1:43:12<43:55,  5.50it/s]

{'loss': 0.0179, 'learning_rate': 1.16e-05, 'epoch': 3.84}


 78%|███████▊  | 48501/62500 [1:44:16<42:34,  5.48it/s]

{'loss': 0.016, 'learning_rate': 1.1200000000000001e-05, 'epoch': 3.88}


 78%|███████▊  | 49001/62500 [1:45:21<41:25,  5.43it/s]

{'loss': 0.0192, 'learning_rate': 1.08e-05, 'epoch': 3.92}


 79%|███████▉  | 49501/62500 [1:46:25<39:24,  5.50it/s]

{'loss': 0.0139, 'learning_rate': 1.04e-05, 'epoch': 3.96}


 80%|████████  | 50000/62500 [1:47:29<42:25,  4.91it/s]Saving model checkpoint to after-bert-trainer\checkpoint-50000
Configuration saved in after-bert-trainer\checkpoint-50000\config.json


{'loss': 0.0206, 'learning_rate': 1e-05, 'epoch': 4.0}


Model weights saved in after-bert-trainer\checkpoint-50000\pytorch_model.bin
 81%|████████  | 50501/62500 [1:48:42<36:28,  5.48it/s]  

{'loss': 0.0134, 'learning_rate': 9.600000000000001e-06, 'epoch': 4.04}


 82%|████████▏ | 51001/62500 [1:49:47<35:21,  5.42it/s]

{'loss': 0.0202, 'learning_rate': 9.2e-06, 'epoch': 4.08}


 82%|████████▏ | 51501/62500 [1:50:51<33:29,  5.47it/s]

{'loss': 0.0133, 'learning_rate': 8.8e-06, 'epoch': 4.12}


 83%|████████▎ | 52001/62500 [1:51:56<32:00,  5.47it/s]

{'loss': 0.0179, 'learning_rate': 8.400000000000001e-06, 'epoch': 4.16}


 84%|████████▍ | 52501/62500 [1:53:00<30:37,  5.44it/s]

{'loss': 0.0127, 'learning_rate': 8.000000000000001e-06, 'epoch': 4.2}


 85%|████████▍ | 53001/62500 [1:54:05<28:38,  5.53it/s]

{'loss': 0.0143, 'learning_rate': 7.6e-06, 'epoch': 4.24}


 86%|████████▌ | 53501/62500 [1:55:09<27:13,  5.51it/s]

{'loss': 0.014, 'learning_rate': 7.2e-06, 'epoch': 4.28}


 86%|████████▋ | 54001/62500 [1:56:14<25:39,  5.52it/s]

{'loss': 0.0171, 'learning_rate': 6.800000000000001e-06, 'epoch': 4.32}


 87%|████████▋ | 54501/62500 [1:57:18<24:15,  5.50it/s]

{'loss': 0.0225, 'learning_rate': 6.4000000000000006e-06, 'epoch': 4.36}


 88%|████████▊ | 55001/62500 [1:58:23<22:55,  5.45it/s]

{'loss': 0.0132, 'learning_rate': 6e-06, 'epoch': 4.4}


 89%|████████▉ | 55501/62500 [1:59:27<21:11,  5.51it/s]

{'loss': 0.019, 'learning_rate': 5.600000000000001e-06, 'epoch': 4.44}


 90%|████████▉ | 56001/62500 [2:00:32<19:46,  5.48it/s]

{'loss': 0.0183, 'learning_rate': 5.2e-06, 'epoch': 4.48}


 90%|█████████ | 56501/62500 [2:01:36<18:11,  5.50it/s]

{'loss': 0.018, 'learning_rate': 4.800000000000001e-06, 'epoch': 4.52}


 91%|█████████ | 57001/62500 [2:02:40<16:41,  5.49it/s]

{'loss': 0.0143, 'learning_rate': 4.4e-06, 'epoch': 4.56}


 92%|█████████▏| 57501/62500 [2:03:45<15:08,  5.50it/s]

{'loss': 0.0169, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.6}


 93%|█████████▎| 58001/62500 [2:04:49<13:52,  5.40it/s]

{'loss': 0.0139, 'learning_rate': 3.6e-06, 'epoch': 4.64}


 94%|█████████▎| 58501/62500 [2:05:54<12:02,  5.53it/s]

{'loss': 0.016, 'learning_rate': 3.2000000000000003e-06, 'epoch': 4.68}


 94%|█████████▍| 59001/62500 [2:06:58<10:33,  5.52it/s]

{'loss': 0.0135, 'learning_rate': 2.8000000000000003e-06, 'epoch': 4.72}


 95%|█████████▌| 59501/62500 [2:08:03<09:06,  5.49it/s]

{'loss': 0.0148, 'learning_rate': 2.4000000000000003e-06, 'epoch': 4.76}


 96%|█████████▌| 60001/62500 [2:09:08<07:38,  5.46it/s]

{'loss': 0.0149, 'learning_rate': 2.0000000000000003e-06, 'epoch': 4.8}


 97%|█████████▋| 60501/62500 [2:10:12<06:05,  5.47it/s]

{'loss': 0.0169, 'learning_rate': 1.6000000000000001e-06, 'epoch': 4.84}


 98%|█████████▊| 61001/62500 [2:11:17<04:34,  5.46it/s]

{'loss': 0.017, 'learning_rate': 1.2000000000000002e-06, 'epoch': 4.88}


 98%|█████████▊| 61501/62500 [2:12:21<03:01,  5.49it/s]

{'loss': 0.0179, 'learning_rate': 8.000000000000001e-07, 'epoch': 4.92}


 99%|█████████▉| 62001/62500 [2:13:26<01:30,  5.49it/s]

{'loss': 0.0177, 'learning_rate': 4.0000000000000003e-07, 'epoch': 4.96}


100%|██████████| 62500/62500 [2:14:30<00:00,  4.93it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 62500/62500 [2:14:30<00:00,  7.74it/s]

{'loss': 0.0198, 'learning_rate': 0.0, 'epoch': 5.0}
{'train_runtime': 8073.667, 'train_samples_per_second': 30.965, 'train_steps_per_second': 7.741, 'train_loss': 0.06624277964782715, 'epoch': 5.0}





TrainOutput(global_step=62500, training_loss=0.06624277964782715, metrics={'train_runtime': 8073.667, 'train_samples_per_second': 30.965, 'train_steps_per_second': 7.741, 'train_loss': 0.06624277964782715, 'epoch': 5.0})

In [7]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    device=0
)

s = f'{tokenizer.mask_token} to the sky!!!'
fill_mask(s)

[{'score': 0.9999397993087769,
  'token': 65345,
  'token_str': '$ T S L A',
  'sequence': 'to the sky!!!'},
 {'score': 1.6432995835202746e-05,
  'token': 64149,
  'token_str': '$ A A P L',
  'sequence': 'to the sky!!!'},
 {'score': 8.08750519354362e-06,
  'token': 65545,
  'token_str': '$ A M Z N',
  'sequence': 'to the sky!!!'},
 {'score': 6.454405593103729e-06,
  'token': 65659,
  'token_str': '$ F B',
  'sequence': 'to the sky!!!'},
 {'score': 3.896896942023886e-06,
  'token': 65350,
  'token_str': '$ N V D A',
  'sequence': 'to the sky!!!'}]

In [8]:
import torch
torch.cuda.empty_cache()
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |    2097 MB |    2892 MB |  208162 GB |  208160 GB |
|       from large pool |    2093 MB |    2869 MB |  206862 GB |  206860 GB |
|       from small pool |       4 MB |      24 MB |    1300 GB |    1300 GB |
|---------------------------------------------------------------------------|
| Active memory         |    2097 MB |    2892 MB |  208162 GB |  208160 GB |
|       from large pool |    2093 MB |    2869 MB |  206862 GB |  206860 GB |
|       from small pool |       4 MB |      24 MB |    1300 GB |    1300 GB |
|---------------------------------------------------------------