In [2]:
import os
import torch
from torch.utils.data import Dataset
import pandas as pd
from transformers import BertTokenizer, BertForMaskedLM
from nltk.tokenize import sent_tokenize
import nltk
import time
import pickle
from torch.utils.data import DataLoader
from torch.optim import AdamW

In [15]:
# Restore the training DataLoader that was serialized with pickle.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at a file produced by a trusted run.
train_dataloader_path = "pickles/train_dataloader.pickle"
with open(train_dataloader_path, 'rb') as f:
    train_dataloader = pickle.load(f)

In [7]:
# The tokenizer and the masked-LM model are both loaded from the same
# pretrained checkpoint name, so it is spelled out once.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertForMaskedLM.from_pretrained(pretrained_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Optimizer hyperparameters, named so they are easy to find and tune.
learning_rate = 2e-5  # args.learning_rate - default is 5e-5, our notebook had 2e-5
adam_epsilon = 1e-8   # args.adam_epsilon  - default is 1e-8.

# AdamW over every parameter of `model` (which must already be loaded).
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)

In [None]:
from transformers import get_linear_schedule_with_warmup

# How many passes to make over the training data. The BERT authors recommend
# between 2 and 4; 4 was chosen here, although that may over-fit the
# training data.
EPOCHS = 4

# The scheduler is stepped once per batch, so the schedule length is
# [number of epochs] x [number of batches] -- not the number of samples.
total_steps = EPOCHS * len(train_dataloader)

# Learning-rate schedule wrapped around the optimizer, with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)

In [None]:
# Peek at the first batch only: print its index, its contents and how many
# items it holds, then stop.
# `enumerate` is required here -- iterating the DataLoader directly yields the
# batch itself, so unpacking it into `step, batch` would bind the batch's
# items to those names instead of producing a batch index.
for step, batch in enumerate(train_dataloader):
    print(step, batch, len(batch))
    break

In [20]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as a `datetime.timedelta` string.

    `elapsed` is rounded to the nearest whole second before formatting, so the
    result looks like ``h:mm:ss`` (with a day prefix for long durations).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [23]:
# Number of batches the training DataLoader yields per epoch (this is the
# figure the scheduler's total step count is derived from).
len(train_dataloader)

228362

In [None]:
import random
import numpy as np
import time

# Seed every RNG used here so that repeated runs of this cell are comparable.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

# Print a progress line every this many training batches.
progress_interval = 30

# Batches are moved to whichever device the model's parameters live on, so the
# loop works unchanged whether or not the model was moved to a GPU beforehand.
device = next(model.parameters()).device


def _masked_lm_loss(input_ids, attention_mask):
    """Run one forward pass of the global `model` and return its loss tensor.

    Padding positions (attention_mask == 0) get the label -100 so they are
    excluded from the loss instead of being scored as tokens to predict.

    NOTE(review): the labels are derived from the very ids that are fed in.
    If those ids already contain [MASK] tokens, the model is being taught to
    reproduce [MASK] rather than the original token -- confirm that unmasked
    labels are not available from the data pipeline.
    """
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100

    # `labels` replaces the deprecated `masked_lm_labels` keyword.
    outputs = model(input_ids,
                    attention_mask=attention_mask,
                    labels=labels)

    # Index instead of tuple-unpacking: position 0 is the loss both when the
    # model returns a plain tuple and when it returns a ModelOutput object.
    return outputs[0]


# One dict of summary statistics is appended per epoch.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, EPOCHS):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Training mode (paired with model.eval() in the validation phase below).
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Periodic progress report (skipped for the very first batch).
        if step % progress_interval == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Only the first two items of the batch are used:
        #   [0]: input ids
        #   [1]: attention masks
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        loss = _masked_lm_loss(b_input_ids, b_input_mask)

        total_train_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Advance the learning-rate schedule once per batch.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    # NOTE(review): `validation_dataloader` must be defined by an earlier cell;
    # confirm it exists before running this cell on a fresh kernel.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Running sum of the per-batch validation losses.
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Same batch layout as in training: [0] input ids, [1] attention masks.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            loss = _masked_lm_loss(b_input_ids, b_input_mask)

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record this epoch's summary.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [4]:
# Load the attention masks and input ids of the "70" and "30" splits, in the
# same order as the names on the left-hand side.
(attention_masks_randomized70,
 input_ids_randomized70,
 attention_masks_randomized30,
 input_ids_randomized30) = (
    torch.load(split_path)
    for split_path in (
        "pickles/attention_masks_randomized70.pickle",
        "pickles/input_ids_randomized70.pickle",
        "pickles/attention_masks_randomized30.pickle",
        "pickles/input_ids_randomized30.pickle",
    )
)

In [5]:
# Sizes of each loaded split; ids and masks of the same split should match.
n_ids_70 = len(input_ids_randomized70)
n_masks_70 = len(attention_masks_randomized70)
n_ids_30 = len(input_ids_randomized30)
n_masks_30 = len(attention_masks_randomized30)

# Last number printed is the expected size of the combined data.
print(n_ids_70, n_masks_70, n_ids_30, n_masks_30, n_ids_70 + n_ids_30)

5501003 5501003 2785321 2785321 8286324


In [7]:
# Attention masks of the "70" split followed by those of the "30" split,
# collected into one Python list.
fullist2 = list(attention_masks_randomized70)
fullist2.extend(attention_masks_randomized30)


In [8]:
# Input ids of the "70" split followed by those of the "30" split,
# collected into one Python list.
fullist = list(input_ids_randomized70)
fullist.extend(input_ids_randomized30)


In [None]:
# Give every attention-mask tensor an explicit leading dimension, (1, 64), so
# the rows can later be concatenated along dim 0.
# (assumes each entry holds exactly 64 values -- `view` raises otherwise)
fullist_changed2 = [tens.view(1, 64) for tens in fullist2]

# Show only a small preview: rendering the whole list (millions of tensors)
# as cell output would flood or hang the notebook.
fullist_changed2[:3]

In [None]:
# Give every input-id tensor an explicit leading dimension, (1, 64), so the
# rows can later be concatenated along dim 0.
# (assumes each entry holds exactly 64 values -- `view` raises otherwise)
fullist_changed = [tens.view(1, 64) for tens in fullist]

# Show only a small preview: rendering the whole list (millions of tensors)
# as cell output would flood or hang the notebook.
fullist_changed[:3]

In [11]:
# Stack all (1, 64) attention-mask rows into a single 2-D tensor.
# torch.cat accepts the list directly; no tuple conversion is needed.
attention_masks_randomized100 = torch.cat(fullist_changed2, dim=0)
attention_masks_randomized100

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [12]:
# Stack all (1, 64) input-id rows into a single 2-D tensor.
# torch.cat accepts the list directly; no tuple conversion is needed.
input_ids_randomized100 = torch.cat(fullist_changed, dim=0)
input_ids_randomized100

tensor([[  101,  9634,  3058,  ...,  2566,  3597,   102],
        [  101,  2016,  2001,  ...,     0,     0,     0],
        [  101,  6920,  1010,  ...,     0,     0,     0],
        ...,
        [  101,   103,   103,  ...,     0,     0,     0],
        [  101,  8605,  1024,  ...,     0,     0,     0],
        [  101, 27011,  2052,  ...,     0,     0,     0]])

In [13]:
# Persist the combined tensors (input ids first, then attention masks) with
# torch.save so later sessions can reload them without rebuilding.
for _payload, _destination in (
    (input_ids_randomized100, "pickles/input_ids_randomized100.pickle"),
    (attention_masks_randomized100, "pickles/attention_masks_randomized100.pickle"),
):
    torch.save(_payload, _destination)

In [4]:
# Reload the combined input-id tensor from disk.
# The directory used to be hard-coded to one user's absolute scratch path; it
# can now be overridden with the PICKLE_DIR environment variable, while the
# default keeps the original location so existing runs behave the same.
# NOTE(review): presumably this is the file written by the torch.save cell
# (which uses the relative "pickles/" directory) -- confirm the two agree.
pickle_dir = os.environ.get("PICKLE_DIR", "/global/cscratch1/sd/ajaybati/pickles")
input_ids_randomized100 = torch.load(os.path.join(pickle_dir, "input_ids_randomized100.pickle"))

In [6]:
# Sanity check: number of rows in the reloaded combined input-id data.
len(input_ids_randomized100)

8286324