In [13]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM

config = AutoConfig.from_pretrained('bert-base-cased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForMaskedLM.from_config(config)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at C:\Users\douli/.cache\huggingface\transformers\a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading configuration file https://huggingface.co/bert-base

In [2]:
from glob import glob
from datasets import Dataset

import re
import pandas as pd
def mask_data_loading(url):
    def stock_symbol_mask(sentense):
        pattern = r'\$[A-Z]*'
        # symbol += re.findall(pattern, sentense)
        result = re.sub(pattern, tokenizer.mask_token, sentense)

        return result

    with open(url, 'r', encoding='utf-8') as f:
        df = pd.read_json(f)
        data = df.copy()
        data = data.loc[df['sentiment'].notnull()]
        data['sentiment'] = pd.Categorical(data['sentiment'])
        data['sentense'] = data['body'].map(stock_symbol_mask)
        data['labels'] = data['body']
        symbols = set()
        for symbol_list in data['body'].str.findall(r'\$[A-Z]+'):
            for symbol in symbol_list:
                symbols.add(symbol)
        return data, symbols

data_url = '../crawler/stock/data/**.json'
url = glob(data_url)[-1]
data, symbols = mask_data_loading(url)

dataset = Dataset.from_pandas(data.loc[:, ['labels', 'sentense']])
dataset = dataset.remove_columns('__index_level_0__')

special_tokens_dict = {'additional_special_tokens': list(symbols)}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(29625, 768)

In [3]:
def encode(example):
    label = tokenizer(example['labels'], padding='max_length', truncation=True)
    # 101, 51, 1234, 12541, 151
    result = tokenizer(example['sentense'], padding='max_length', truncation=True)
    # 101, 103, 103, 103
    result['label_ids'] = label['input_ids']

    # masked_position = [i for i in range(len(result['input_ids'])) if result['input_ids'][i] == tokenizer.mask_token_id]
    # result['decoder_input_ids'] = label['input_ids']
    # result['labels'] = [-100 for i in label['input_ids']]
    # for i in range(len(result['labels'])):
    #     if not i in masked_position:
    #         result['labels'][i] = -100

    return result

encoded_dataset = dataset.map(encode, batched=True)

# print(encoded_dataset[0]['sentense'])
# print(encoded_dataset[0]['input_ids'])
# print(encoded_dataset[0]['labels'])
# print(encoded_dataset[0]['label_ids'])
# print(tokenizer.ids_to_tokens)
# print(tokenizer.decode(encoded_dataset[0]['label_ids']))

100%|██████████| 15/15 [00:19<00:00,  1.29s/ba]


In [4]:
from transformers import DataCollatorForLanguageModeling

# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# samples = encoded_dataset[:2]
# for chunk in data_collator(samples)["input_ids"]:
#     print(f"\n'>>> {tokenizer.decode(chunk)}'")

# Training

In [5]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [6]:
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer

# training_args = TrainingArguments(
#     output_dir="test_trainer",
#     per_device_train_batch_size=2,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=encoded_dataset,
#     compute_metrics=compute_metrics,
# )

# Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=4,
    num_train_epochs=50
)

train_datset = encoded_dataset.shuffle().select(range(4000))

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_datset,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: sentense. If sentense are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4000
  Num Epochs = 50
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 50000
  1%|          | 500/50000 [02:47<4:34:52,  3.00it/s]Saving model checkpoint to test_trainer\checkpoint-500
Configuration saved in test_trainer\checkpoint-500\config.json


{'loss': 0.176, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.5}


Model weights saved in test_trainer\checkpoint-500\pytorch_model.bin
  2%|▏         | 1000/50000 [05:37<4:31:07,  3.01it/s]Saving model checkpoint to test_trainer\checkpoint-1000
Configuration saved in test_trainer\checkpoint-1000\config.json


{'loss': 0.0081, 'learning_rate': 4.9e-05, 'epoch': 1.0}


Model weights saved in test_trainer\checkpoint-1000\pytorch_model.bin
  3%|▎         | 1500/50000 [08:31<4:26:50,  3.03it/s] Saving model checkpoint to test_trainer\checkpoint-1500
Configuration saved in test_trainer\checkpoint-1500\config.json


{'loss': 0.0064, 'learning_rate': 4.85e-05, 'epoch': 1.5}


Model weights saved in test_trainer\checkpoint-1500\pytorch_model.bin
  4%|▍         | 2000/50000 [11:20<4:26:05,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-2000
Configuration saved in test_trainer\checkpoint-2000\config.json


{'loss': 0.0071, 'learning_rate': 4.8e-05, 'epoch': 2.0}


Model weights saved in test_trainer\checkpoint-2000\pytorch_model.bin
  5%|▌         | 2500/50000 [14:11<4:23:50,  3.00it/s] Saving model checkpoint to test_trainer\checkpoint-2500
Configuration saved in test_trainer\checkpoint-2500\config.json


{'loss': 0.006, 'learning_rate': 4.75e-05, 'epoch': 2.5}


Model weights saved in test_trainer\checkpoint-2500\pytorch_model.bin
  6%|▌         | 3000/50000 [17:01<4:19:01,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-3000
Configuration saved in test_trainer\checkpoint-3000\config.json


{'loss': 0.0075, 'learning_rate': 4.7e-05, 'epoch': 3.0}


Model weights saved in test_trainer\checkpoint-3000\pytorch_model.bin
  7%|▋         | 3500/50000 [19:53<4:17:35,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-3500
Configuration saved in test_trainer\checkpoint-3500\config.json


{'loss': 0.0053, 'learning_rate': 4.6500000000000005e-05, 'epoch': 3.5}


Model weights saved in test_trainer\checkpoint-3500\pytorch_model.bin
  8%|▊         | 4000/50000 [22:44<4:13:50,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-4000
Configuration saved in test_trainer\checkpoint-4000\config.json


{'loss': 0.006, 'learning_rate': 4.600000000000001e-05, 'epoch': 4.0}


Model weights saved in test_trainer\checkpoint-4000\pytorch_model.bin
  9%|▉         | 4500/50000 [25:34<4:11:41,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-4500
Configuration saved in test_trainer\checkpoint-4500\config.json


{'loss': 0.0053, 'learning_rate': 4.55e-05, 'epoch': 4.5}


Model weights saved in test_trainer\checkpoint-4500\pytorch_model.bin
 10%|█         | 5000/50000 [28:26<4:08:25,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-5000
Configuration saved in test_trainer\checkpoint-5000\config.json


{'loss': 0.0043, 'learning_rate': 4.5e-05, 'epoch': 5.0}


Model weights saved in test_trainer\checkpoint-5000\pytorch_model.bin
 11%|█         | 5500/50000 [31:16<4:06:37,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-5500
Configuration saved in test_trainer\checkpoint-5500\config.json


{'loss': 0.0038, 'learning_rate': 4.4500000000000004e-05, 'epoch': 5.5}


Model weights saved in test_trainer\checkpoint-5500\pytorch_model.bin
 12%|█▏        | 6000/50000 [34:07<4:03:21,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-6000
Configuration saved in test_trainer\checkpoint-6000\config.json


{'loss': 0.0041, 'learning_rate': 4.4000000000000006e-05, 'epoch': 6.0}


Model weights saved in test_trainer\checkpoint-6000\pytorch_model.bin
 13%|█▎        | 6500/50000 [36:57<4:00:22,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-6500
Configuration saved in test_trainer\checkpoint-6500\config.json


{'loss': 0.0029, 'learning_rate': 4.35e-05, 'epoch': 6.5}


Model weights saved in test_trainer\checkpoint-6500\pytorch_model.bin
 14%|█▍        | 7000/50000 [39:47<3:57:23,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-7000
Configuration saved in test_trainer\checkpoint-7000\config.json


{'loss': 0.0029, 'learning_rate': 4.3e-05, 'epoch': 7.0}


Model weights saved in test_trainer\checkpoint-7000\pytorch_model.bin
 15%|█▌        | 7500/50000 [42:38<3:55:06,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-7500
Configuration saved in test_trainer\checkpoint-7500\config.json


{'loss': 0.0021, 'learning_rate': 4.25e-05, 'epoch': 7.5}


Model weights saved in test_trainer\checkpoint-7500\pytorch_model.bin
 16%|█▌        | 8000/50000 [45:28<3:53:16,  3.00it/s] Saving model checkpoint to test_trainer\checkpoint-8000
Configuration saved in test_trainer\checkpoint-8000\config.json


{'loss': 0.0021, 'learning_rate': 4.2e-05, 'epoch': 8.0}


Model weights saved in test_trainer\checkpoint-8000\pytorch_model.bin
 17%|█▋        | 8500/50000 [48:18<3:49:59,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-8500
Configuration saved in test_trainer\checkpoint-8500\config.json


{'loss': 0.0016, 'learning_rate': 4.15e-05, 'epoch': 8.5}


Model weights saved in test_trainer\checkpoint-8500\pytorch_model.bin
 18%|█▊        | 9000/50000 [51:09<3:46:36,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-9000
Configuration saved in test_trainer\checkpoint-9000\config.json


{'loss': 0.0017, 'learning_rate': 4.1e-05, 'epoch': 9.0}


Model weights saved in test_trainer\checkpoint-9000\pytorch_model.bin
 19%|█▉        | 9500/50000 [53:59<3:44:31,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-9500
Configuration saved in test_trainer\checkpoint-9500\config.json


{'loss': 0.0011, 'learning_rate': 4.05e-05, 'epoch': 9.5}


Model weights saved in test_trainer\checkpoint-9500\pytorch_model.bin
 20%|██        | 10000/50000 [56:50<3:40:48,  3.02it/s]Saving model checkpoint to test_trainer\checkpoint-10000
Configuration saved in test_trainer\checkpoint-10000\config.json


{'loss': 0.0014, 'learning_rate': 4e-05, 'epoch': 10.0}


Model weights saved in test_trainer\checkpoint-10000\pytorch_model.bin
 21%|██        | 10500/50000 [59:40<3:37:49,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-10500
Configuration saved in test_trainer\checkpoint-10500\config.json


{'loss': 0.0011, 'learning_rate': 3.9500000000000005e-05, 'epoch': 10.5}


Model weights saved in test_trainer\checkpoint-10500\pytorch_model.bin
 22%|██▏       | 11000/50000 [1:02:30<3:36:30,  3.00it/s]Saving model checkpoint to test_trainer\checkpoint-11000
Configuration saved in test_trainer\checkpoint-11000\config.json


{'loss': 0.001, 'learning_rate': 3.9000000000000006e-05, 'epoch': 11.0}


Model weights saved in test_trainer\checkpoint-11000\pytorch_model.bin
 23%|██▎       | 11500/50000 [1:05:21<3:32:42,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-11500
Configuration saved in test_trainer\checkpoint-11500\config.json


{'loss': 0.0009, 'learning_rate': 3.85e-05, 'epoch': 11.5}


Model weights saved in test_trainer\checkpoint-11500\pytorch_model.bin
 24%|██▍       | 12000/50000 [1:08:12<3:30:24,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-12000
Configuration saved in test_trainer\checkpoint-12000\config.json


{'loss': 0.0009, 'learning_rate': 3.8e-05, 'epoch': 12.0}


Model weights saved in test_trainer\checkpoint-12000\pytorch_model.bin
 25%|██▌       | 12500/50000 [1:11:06<3:27:18,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-12500
Configuration saved in test_trainer\checkpoint-12500\config.json


{'loss': 0.0007, 'learning_rate': 3.7500000000000003e-05, 'epoch': 12.5}


Model weights saved in test_trainer\checkpoint-12500\pytorch_model.bin
 26%|██▌       | 13000/50000 [1:13:57<3:24:59,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-13000
Configuration saved in test_trainer\checkpoint-13000\config.json


{'loss': 0.0007, 'learning_rate': 3.7e-05, 'epoch': 13.0}


Model weights saved in test_trainer\checkpoint-13000\pytorch_model.bin
 27%|██▋       | 13500/50000 [1:16:48<3:22:18,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-13500
Configuration saved in test_trainer\checkpoint-13500\config.json


{'loss': 0.0006, 'learning_rate': 3.65e-05, 'epoch': 13.5}


Model weights saved in test_trainer\checkpoint-13500\pytorch_model.bin
 28%|██▊       | 14000/50000 [1:19:38<3:19:07,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-14000
Configuration saved in test_trainer\checkpoint-14000\config.json


{'loss': 0.0007, 'learning_rate': 3.6e-05, 'epoch': 14.0}


Model weights saved in test_trainer\checkpoint-14000\pytorch_model.bin
 29%|██▉       | 14500/50000 [1:22:28<3:15:53,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-14500
Configuration saved in test_trainer\checkpoint-14500\config.json


{'loss': 0.0006, 'learning_rate': 3.55e-05, 'epoch': 14.5}


Model weights saved in test_trainer\checkpoint-14500\pytorch_model.bin
 30%|███       | 15000/50000 [1:25:19<3:12:56,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-15000
Configuration saved in test_trainer\checkpoint-15000\config.json


{'loss': 0.0006, 'learning_rate': 3.5e-05, 'epoch': 15.0}


Model weights saved in test_trainer\checkpoint-15000\pytorch_model.bin
 31%|███       | 15500/50000 [1:28:09<3:10:40,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-15500
Configuration saved in test_trainer\checkpoint-15500\config.json


{'loss': 0.0005, 'learning_rate': 3.45e-05, 'epoch': 15.5}


Model weights saved in test_trainer\checkpoint-15500\pytorch_model.bin
 32%|███▏      | 16000/50000 [1:31:00<3:07:38,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-16000
Configuration saved in test_trainer\checkpoint-16000\config.json


{'loss': 0.0005, 'learning_rate': 3.4000000000000007e-05, 'epoch': 16.0}


Model weights saved in test_trainer\checkpoint-16000\pytorch_model.bin
 33%|███▎      | 16500/50000 [1:33:50<3:04:53,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-16500
Configuration saved in test_trainer\checkpoint-16500\config.json


{'loss': 0.0005, 'learning_rate': 3.35e-05, 'epoch': 16.5}


Model weights saved in test_trainer\checkpoint-16500\pytorch_model.bin
 34%|███▍      | 17000/50000 [1:36:41<3:02:02,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-17000
Configuration saved in test_trainer\checkpoint-17000\config.json


{'loss': 0.0005, 'learning_rate': 3.3e-05, 'epoch': 17.0}


Model weights saved in test_trainer\checkpoint-17000\pytorch_model.bin
 35%|███▌      | 17500/50000 [1:39:32<2:59:54,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-17500
Configuration saved in test_trainer\checkpoint-17500\config.json


{'loss': 0.0004, 'learning_rate': 3.2500000000000004e-05, 'epoch': 17.5}


Model weights saved in test_trainer\checkpoint-17500\pytorch_model.bin
 36%|███▌      | 18000/50000 [1:42:23<2:56:34,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-18000
Configuration saved in test_trainer\checkpoint-18000\config.json


{'loss': 0.0005, 'learning_rate': 3.2000000000000005e-05, 'epoch': 18.0}


Model weights saved in test_trainer\checkpoint-18000\pytorch_model.bin
 37%|███▋      | 18500/50000 [1:45:15<2:55:34,  2.99it/s] Saving model checkpoint to test_trainer\checkpoint-18500
Configuration saved in test_trainer\checkpoint-18500\config.json


{'loss': 0.0004, 'learning_rate': 3.15e-05, 'epoch': 18.5}


Model weights saved in test_trainer\checkpoint-18500\pytorch_model.bin
 38%|███▊      | 19000/50000 [1:48:12<2:52:10,  3.00it/s] Saving model checkpoint to test_trainer\checkpoint-19000
Configuration saved in test_trainer\checkpoint-19000\config.json


{'loss': 0.0004, 'learning_rate': 3.1e-05, 'epoch': 19.0}


Model weights saved in test_trainer\checkpoint-19000\pytorch_model.bin
 39%|███▉      | 19500/50000 [1:51:04<2:48:06,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-19500
Configuration saved in test_trainer\checkpoint-19500\config.json


{'loss': 0.0004, 'learning_rate': 3.05e-05, 'epoch': 19.5}


Model weights saved in test_trainer\checkpoint-19500\pytorch_model.bin
 40%|████      | 20000/50000 [1:53:57<2:45:42,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-20000
Configuration saved in test_trainer\checkpoint-20000\config.json


{'loss': 0.0003, 'learning_rate': 3e-05, 'epoch': 20.0}


Model weights saved in test_trainer\checkpoint-20000\pytorch_model.bin
 41%|████      | 20500/50000 [1:56:49<2:43:25,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-20500
Configuration saved in test_trainer\checkpoint-20500\config.json


{'loss': 0.0003, 'learning_rate': 2.95e-05, 'epoch': 20.5}


Model weights saved in test_trainer\checkpoint-20500\pytorch_model.bin
 42%|████▏     | 21000/50000 [1:59:41<2:39:56,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-21000
Configuration saved in test_trainer\checkpoint-21000\config.json


{'loss': 0.0003, 'learning_rate': 2.9e-05, 'epoch': 21.0}


Model weights saved in test_trainer\checkpoint-21000\pytorch_model.bin
 43%|████▎     | 21500/50000 [2:02:36<2:38:03,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-21500
Configuration saved in test_trainer\checkpoint-21500\config.json


{'loss': 0.0003, 'learning_rate': 2.8499999999999998e-05, 'epoch': 21.5}


Model weights saved in test_trainer\checkpoint-21500\pytorch_model.bin
 44%|████▍     | 22000/50000 [2:05:29<2:34:26,  3.02it/s] Saving model checkpoint to test_trainer\checkpoint-22000
Configuration saved in test_trainer\checkpoint-22000\config.json


{'loss': 0.0003, 'learning_rate': 2.8000000000000003e-05, 'epoch': 22.0}


Model weights saved in test_trainer\checkpoint-22000\pytorch_model.bin
 45%|████▌     | 22500/50000 [2:08:23<2:31:27,  3.03it/s] Saving model checkpoint to test_trainer\checkpoint-22500
Configuration saved in test_trainer\checkpoint-22500\config.json


{'loss': 0.0002, 'learning_rate': 2.7500000000000004e-05, 'epoch': 22.5}


Model weights saved in test_trainer\checkpoint-22500\pytorch_model.bin
 46%|████▌     | 23000/50000 [2:11:17<2:29:34,  3.01it/s] Saving model checkpoint to test_trainer\checkpoint-23000
Configuration saved in test_trainer\checkpoint-23000\config.json


{'loss': 0.0004, 'learning_rate': 2.7000000000000002e-05, 'epoch': 23.0}


RuntimeError: [enforce fail at C:\cb\pytorch_1000000000000\work\caffe2\serialize\inline_container.cc:300] . unexpected pos 317232064 vs 317231952

In [7]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    device=0
)

s = '[MASK] to the sky!!!'
fill_mask(s)

[{'score': 0.9999986886978149,
  'token': 29351,
  'token_str': '$ T S L A',
  'sequence': 'to the sky!!!'},
 {'score': 2.3993442255232367e-07,
  'token': 29039,
  'token_str': '$ I M P P',
  'sequence': 'to the sky!!!'},
 {'score': 1.1034241254037624e-07,
  'token': 29448,
  'token_str': '$ M S F T',
  'sequence': 'to the sky!!!'},
 {'score': 7.492760545346755e-08,
  'token': 29010,
  'token_str': '$ P L T R',
  'sequence': 'to the sky!!!'},
 {'score': 7.289293790790907e-08,
  'token': 29250,
  'token_str': '$ S P Y',
  'sequence': 'to the sky!!!'}]

In [None]:
import torch
torch.cuda.empty_cache()
print(torch.cuda.memory_summary())