In [1]:
import torch
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR
import TrainFunctions
import BertEnchoder
from transformers import BertTokenizer
import importlib
import tqdm
import pickle

In [2]:
# Load the prepared dataset through the project helper.
# Later cells read `data[0].keys()`, iterate over `data`, and assign to
# `data[i]`, so the result is used as a mutable sequence of dict-like items.
data = TrainFunctions.Load_Data()

In [3]:
# Field names of the first item in `data`. The variable keeps the spelling
# `lebels` because other cells refer to it by that name.
lebels = [*data[0].keys()]

In [4]:
# Display the collected field names as this cell's output.
lebels

['input_ids', 'token_type_ids', 'attention_mask', 'labels']

In [5]:
# Convert each item of `data` with the project helper (batch size 20, using the
# field names in `lebels`) and collect the results in `dl_data`.
# NOTE(review): the helper's return type is not visible here; later cells call
# len() on each result and pass them to TrainFunctions.train.
dl_data = []
for idx, raw_item in enumerate(data):
    converted = TrainFunctions.cvt_dict_to_TensorDataset(
        raw_item, batch_size=20, lebels=lebels
    )
    dl_data.append(converted)
    # Drop both references to the raw item so it can be reclaimed once nothing
    # else points at it, instead of keeping every raw item alive until the end.
    data[idx] = None
    raw_item = None

100%|██████████| 1756584/1756584 [00:34<00:00, 50740.13it/s]
100%|██████████| 1756584/1756584 [00:34<00:00, 50383.55it/s]
100%|██████████| 1756584/1756584 [00:34<00:00, 51340.37it/s]
100%|██████████| 1756584/1756584 [00:34<00:00, 51342.33it/s]
100%|██████████| 1742305/1742305 [00:33<00:00, 51289.52it/s]


In [6]:
# Hold out the first converted item for evaluation; all remaining items form
# the list of training loaders.
test_dl, *train_lis = dl_data

In [7]:
# Load the tokenizer of the pretrained "bert-base-uncased" checkpoint.
# Its vocabulary size is passed to the model constructor and to training below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [8]:
# Sanity check: print the tokenizer summary (vocabulary size, maximum length,
# special tokens and their ids).
print(tokenizer)

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [9]:
# Vocabulary size; the same value is passed as `vocabSize` to the model and as
# `vocab_size` to the training calls.
print(tokenizer.vocab_size)

30522


In [10]:
# Report whether PyTorch can see a CUDA-capable device.
print(torch.cuda.is_available())

True


In [11]:
# Select the compute device: use the GPU when PyTorch reports CUDA support and
# fall back to the CPU otherwise, rather than assuming CUDA is always present
# (an unconditional 'cuda' device makes `model.to(device)` fail on CPU-only hosts).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
# Constructor arguments for the project's encoder-only model.
# padIdx=0 matches the [PAD] id (0) reported by the tokenizer printout.
encoder_kwargs = dict(
    vocabSize=tokenizer.vocab_size,
    embedDim=240,
    numHeads=12,
    numLayers=12,
    numPosEmbeading=128,
    numSegEmbeading=2,
    padIdx=0,
)

# Build the model, move it to the selected device, and print its layer layout.
model = BertEnchoder.EncoderOnly(**encoder_kwargs).to(device)
print(model)

EncoderOnly(
  (Embead): BERTmbeadings(
    (Embead): Embedding(30522, 240, padding_idx=0)
    (PosEmbead): Embedding(128, 240)
    (SegEmbeading): Embedding(2, 240)
  )
  (Blocks): ModuleList(
    (0-11): 12 x EncoderBLock(
      (Attantion): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=240, out_features=240, bias=True)
      )
      (MLP): Sequential(
        (0): Linear(in_features=240, out_features=960, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=960, out_features=240, bias=True)
      )
      (Norm1): LayerNorm((240,), eps=1e-05, elementwise_affine=True)
      (Norm2): LayerNorm((240,), eps=1e-05, elementwise_affine=True)
      (Drop): Dropout(p=0.1, inplace=True)
    )
  )
  (MLM): MaskedLangModeling(
    (MLP): Sequential(
      (0): Linear(in_features=240, out_features=240, bias=True)
      (1): GELU(approximate='none')
      (2): LayerNorm((240,), eps=1e-05, elementwise_affine=True)
      (3): Linear(i

In [13]:
# Summed length of all training loaders, used below as the number of scheduler
# steps per epoch (assumes len(loader) is its batch count — TODO confirm).
x = sum(len(dl) for dl in train_lis)
print(x)

epochs = 1

# Losses: cross-entropy for the masked-token head (targets equal to
# `ignore_idx` are skipped) and binary cross-entropy on logits for the NSP head.
ignore_idx = -100
criterion_mlm = torch.nn.CrossEntropyLoss(ignore_index=ignore_idx)
criterion_nsp = torch.nn.BCEWithLogitsLoss()

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)

# Step budget: the first 10% of all steps (at least one) are linear warm-up,
# the remainder (at least one) is cosine decay.
steps = epochs * x
w_steps = max(1, int(steps * 0.1))
c_steps = max(1, steps - w_steps)

# Cosine decay from the base learning rate down to 1e-7 over `c_steps`.
Cosine = CosineAnnealingLR(optimizer, T_max=c_steps, eta_min=1e-7)

# Linear ramp from 0.1% of the base learning rate up to the full rate.
warmup = LinearLR(optimizer, start_factor=1e-3, total_iters=w_steps)

# Run the warm-up first, then hand over to the cosine schedule at `w_steps`.
scheduler = SequentialLR(
    optimizer,
    schedulers=[warmup, Cosine],
    milestones=[w_steps],
)

350606


In [14]:
# Pick up any edits made to TrainFunctions since it was first imported.
importlib.reload(TrainFunctions)

# Training stage 1: run on the first training loader only; `test_dl` is passed
# for the evaluation pass. The same optimizer and scheduler objects are passed
# to the later training cell, so the learning-rate schedule continues there.
loss_list, model = TrainFunctions.train(
    model=model,
    dl_lis=train_lis[:1],
    # Take the [MASK] id from the tokenizer rather than hard-coding 103.
    mask_id=tokenizer.mask_token_id,
    p=0.2,  # presumably the masking probability — confirm in TrainFunctions.train
    vocab_size=tokenizer.vocab_size,
    criterion_mlm=criterion_mlm,
    ignore_idx=ignore_idx,
    criterion_nsp=criterion_nsp,
    optimizer=optimizer,
    seheduler=scheduler,  # keyword spelling kept exactly as train() is called in this notebook
    clip_grad=0.5,  # presumably the gradient-clipping threshold — confirm
    test_dl=test_dl,
    device=device,
    # Use the same `epochs` value the scheduler's step budget was computed
    # from, so the two cannot drift apart.
    epochs=epochs,
    enable_bf16=True,
)

Epoch: 1
Training :
Data Loader No.0


100%|██████████| 87830/87830 [1:22:41<00:00, 17.70it/s, Train Loss =5.726731, accu=0.621678, masked_accu=0.226672, NSP_accu=0.831900, lr=9.33e-05]


Train Loss, accu, masked_accu, NSP_accu:  (5.726731323924407, 0.6216782162495226, 0.22667181998498992, 0.8318998692917617)
Testing :


100%|██████████| 87830/87830 [35:18<00:00, 41.46it/s, Train Loss =1.800904, accu=0.775870, masked_accu=0.350641, NSP_accu=0.925240]


Loss Model Saved at epoch :1
Accu Model Saved at epoch :1
Masked Accu Model Saved at epoch :1
NSP Accu Model Saved at epoch :1
Test Loss, accu, masked_accu, NSP_accu:  (1.8009042887773625, 0.7758695864031776, 0.35064106500674136, 0.9252401251519996)
Model Saved


In [17]:
# Pick up any edits made to TrainFunctions since it was last (re)loaded.
importlib.reload(TrainFunctions)

# Training stage 2: continue on the remaining training loaders with the same
# model, optimizer and scheduler objects; `test_dl` is passed for evaluation.
# NOTE(review): this rebinds `loss_list`. Unless train() accumulates history
# internally, the values returned by the earlier training call are no longer
# referenced after this cell — verify before relying on the saved history.
loss_list, model = TrainFunctions.train(
    model=model,
    dl_lis=train_lis[1:],
    # Take the [MASK] id from the tokenizer rather than hard-coding 103.
    mask_id=tokenizer.mask_token_id,
    p=0.2,  # presumably the masking probability — confirm in TrainFunctions.train
    vocab_size=tokenizer.vocab_size,
    criterion_mlm=criterion_mlm,
    ignore_idx=ignore_idx,
    criterion_nsp=criterion_nsp,
    optimizer=optimizer,
    seheduler=scheduler,  # keyword spelling kept exactly as train() is called in this notebook
    clip_grad=0.5,  # presumably the gradient-clipping threshold — confirm
    test_dl=test_dl,
    device=device,
    # Use the same `epochs` value the scheduler's step budget was computed
    # from, so the two cannot drift apart.
    epochs=epochs,
    enable_bf16=True,
)

Epoch: 1
Training :
Data Loader No.0


100%|██████████| 87830/87830 [1:21:40<00:00, 17.92it/s, Train Loss =3.972474, accu=0.802307, masked_accu=0.372531, NSP_accu=0.944815, lr=5.85e-05]


Train Loss, accu, masked_accu, NSP_accu:  (3.9724736111427132, 0.8023069651806787, 0.37253081276222577, 0.9448150501200057)
Data Loader No.1


100%|██████████| 87830/87830 [1:21:35<00:00, 17.94it/s, Train Loss =3.626934, accu=0.831821, masked_accu=0.405293, NSP_accu=0.965567, lr=1.77e-05]


Train Loss, accu, masked_accu, NSP_accu:  (3.626933663819491, 0.8318212098737863, 0.40529316941833826, 0.965567260091177)
Data Loader No.2


100%|██████████| 87116/87116 [1:21:22<00:00, 17.84it/s, Train Loss =3.501225, accu=0.841038, masked_accu=0.417963, NSP_accu=0.973344, lr=1e-07]   


Train Loss, accu, masked_accu, NSP_accu:  (3.5012248085136175, 0.8410380573435388, 0.41796318981972075, 0.9733439323195422)
Testing :


100%|██████████| 87830/87830 [35:39<00:00, 41.04it/s, Train Loss =1.064270, accu=0.846259, masked_accu=0.428372, NSP_accu=0.976187]


Loss Model Saved at epoch :1
Accu Model Saved at epoch :1
Masked Accu Model Saved at epoch :1
NSP Accu Model Saved at epoch :1
Test Loss, accu, masked_accu, NSP_accu:  (1.0642704053627587, 0.8462585042545269, 0.42837166396634463, 0.9761873044500007)
Model Saved


In [18]:
# Persist the `loss_list` object returned by the most recent training call.
with open("TrainTest_Loss_Accu.pkl", "wb") as out_file:
    pickle.dump(loss_list, out_file)