In [None]:
import json 
import os
from tqdm import tqdm

from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import pipeline
from datasets import concatenate_datasets, load_dataset, load_from_disk

# --- Configuration -----------------------------------------------------------
# `model_path` is both where the tokenizer is loaded from and the Trainer's
# output directory (checkpoints are written there).
model_path = "pretrained-bert"
# NOTE(review): presumably matches the vocabulary of the tokenizer stored in
# `model_path` -- confirm.
vocab_size = 30_522
# Used as `max_position_embeddings` of the model.
max_length = 512
# Seed for the train/eval split (see below for why it must be fixed).
split_seed = 42

# --- Data --------------------------------------------------------------------
dataset = load_from_disk("/home/tom/fssd/bert_dataset_longer_test")
# The split MUST be seeded: training below resumes from earlier checkpoints, and
# an unseeded split would be reshuffled on every restart, so rows evaluated on in
# one run could be trained on in the next (eval leakage).
d = dataset.train_test_split(test_size=0.1, seed=split_seed)
train_dataset = d["train"]
test_dataset = d["test"]
# Alternative training set kept from the original notebook:
# train_dataset = load_from_disk("/home/tom/fsas/bert_dataset_longer_train")

tokenizer = BertTokenizerFast.from_pretrained(model_path)

# --- Model -------------------------------------------------------------------
# Build a randomly initialised BERT masked-LM from the config (no pretrained weights).
model_config = BertConfig(vocab_size=vocab_size, max_position_embeddings=max_length)
model = BertForMaskedLM(config=model_config)

# Data collator for the Masked Language Modeling (MLM) task: randomly masks 20%
# of the tokens (the library default is 15%).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True, mlm_probability=0.2)

# --- Training set-up ---------------------------------------------------------
training_args = TrainingArguments(
    output_dir=model_path,  # directory where checkpoints are saved
    evaluation_strategy="epoch",  # run evaluation at the end of every epoch
    overwrite_output_dir=True,
    num_train_epochs=10,  # number of training epochs, feel free to tweak
    per_device_train_batch_size=20,  # put it as high as your GPU memory fits
    gradient_accumulation_steps=4,  # 20 * 4 = 80 samples per device per optimizer step
    per_device_eval_batch_size=64,  # evaluation batch size
    logging_steps=500,  # log every 500 steps
    save_steps=10000,  # save a checkpoint every 10000 steps
    save_total_limit=300,  # keep at most 300 checkpoints; lower this if disk space is tight
    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss)
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# --- Train -------------------------------------------------------------------
# Resume from the newest `checkpoint-*` directory in `model_path` when there is
# one; otherwise start from scratch. `resume_from_checkpoint=True` would raise
# if no checkpoint existed yet, which made the very first run impossible.
from transformers.trainer_utils import get_last_checkpoint

last_checkpoint = get_last_checkpoint(model_path) if os.path.isdir(model_path) else None
trainer.train(resume_from_checkpoint=last_checkpoint)
# To resume from one specific checkpoint instead:
# trainer.train(resume_from_checkpoint=f"{model_path}/checkpoint-50000")


In [None]:
import argparse
import os
import shutil
import time
import warnings
import numpy as np
warnings.filterwarnings('ignore')
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data.distributed import DistributedSampler
from transformers import BertConfig, BertForMaskedLM
from datasets import concatenate_datasets, load_dataset, load_from_disk

# Command-line options.
# NOTE(review): parse_args() reads sys.argv, so this cell is presumably meant to
# be launched as a script through a distributed launcher rather than executed
# inside the notebook kernel -- confirm.
parser = argparse.ArgumentParser(description='Bert')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=3, type=int,
                    metavar='N', help='batch size per process (default: 3)')
# `args.lr` is consumed by the optimizer below but was never declared, which
# raised AttributeError. NOTE(review): the 1e-4 default is an assumption -- tune it.
parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float,
                    metavar='LR', help='learning rate (default: 1e-4)')
parser.add_argument('-wd', '--weight_decay', default=1e-3, type=float,
                    metavar='N', help='weight decay (default: 1e-3)')
# Accept both spellings used by the PyTorch launchers and fall back to the
# LOCAL_RANK environment variable (set by torchrun) when the flag is absent.
parser.add_argument('--local_rank', '--local-rank',
                    default=int(os.environ.get('LOCAL_RANK', 0)), type=int,
                    help='node rank for distributed training')
args = parser.parse_args()

# Initialise the default process group.
torch.distributed.init_process_group(backend="nccl")
print("Use GPU: {} for training".format(args.local_rank))

# create model
model_path = "pretrained-bert"
vocab_size = 30_522
max_length = 512
model_config = BertConfig(vocab_size=vocab_size, max_position_embeddings=max_length)
model = BertForMaskedLM(config=model_config)

# Select this process's GPU, then move the model onto it. The move has to happen
# BEFORE wrapping in DistributedDataParallel: passing `device_ids` together with
# a module whose parameters are still on the CPU is rejected by DDP.
torch.cuda.set_device(args.local_rank)
model = model.cuda()

# Data-parallel wrapper.
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                  output_device=args.local_rank, find_unused_parameters=True)
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.AdamW(model.parameters(), args.lr, weight_decay=args.weight_decay)

# `train_dataset` was used below without being defined in this cell.
# NOTE(review): the path is taken from the commented-out loader in the first
# cell of this notebook -- confirm it is the intended training set.
train_dataset = load_from_disk("/home/tom/fsas/bert_dataset_longer_train")
test_dataset = load_from_disk("/home/tom/fsas/bert_dataset_longer_test")

# The sampler distributes the data across the processes; shuffle must stay False
# on the DataLoader because a sampler is supplied.
train_sampler = DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                           shuffle=False, num_workers=args.workers,
                                           pin_memory=True, sampler=train_sampler)

In [None]:
import json 
import os

from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import pipeline
from tokenizers import BertWordPieceTokenizer
from datasets import concatenate_datasets, load_dataset, load_from_disk

# Directory that holds the tokenizer files and the checkpoint sub-directories.
model_path = "/home/tom/fsas/pretrained-bert"

# Load the masked-LM weights from the `checkpoint-200000` sub-directory.
checkpoint_dir = os.path.join(model_path, "checkpoint-200000")
model = BertForMaskedLM.from_pretrained(checkpoint_dir)

# Load the tokenizer from the top-level model directory.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Wrap model and tokenizer in a fill-mask pipeline.
fill_mask = pipeline("fill-mask", tokenizer=tokenizer, model=model)

# Sentences to complete; each one contains a [MASK] placeholder.
examples = [
    "Today's most trending hashtags on [MASK] is Donald Trump",
    "The [MASK] was cloudy yesterday, but today it's rainy.",
]

# Print every candidate completion with its score, followed by a separator line
# after each example.
for example in examples:
    predictions = fill_mask(example)
    for prediction in predictions:
        print(f"{prediction['sequence']}, confidence: {prediction['score']}")
    print("=" * 50)