In [1]:
import os
import torch
from torch.utils.data import Dataset
import pandas as pd
from transformers import BertTokenizer, BertForMaskedLM
from nltk.tokenize import sent_tokenize
import nltk
import time
import pickle
from torch.utils.data import DataLoader
from torch.optim import AdamW

# Directory prefix for the pickled dataloaders loaded below (absolute, machine-specific path;
# the trailing slash matters because file names are appended by plain string concatenation).
path = "/global/cscratch1/sd/ajaybati/pickles/"


In [2]:
# Load the pickled training and validation dataloaders from `path`.
# NOTE(review): torch.load unpickles arbitrary objects -- only load files you trust.
train_dataloader = torch.load(f"{path}train_dataloader.pickle")
validation_dataloader = torch.load(f"{path}validation_dataloader.pickle")

In [71]:
# Length of the training loader; the scheduler setup in this notebook treats this as the number of batches.
len(train_dataloader)

228362

In [4]:
# The tokenizer and the masked-LM model are both created from the same pretrained checkpoint name.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForMaskedLM.from_pretrained(checkpoint_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# AdamW over every model parameter, with the hyper-parameters named up front.
learning_rate = 2e-5  # args.learning_rate - default is 5e-5, our notebook had 2e-5
adam_epsilon = 1e-8   # args.adam_epsilon  - default is 1e-8.
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)

In [6]:
from transformers import get_linear_schedule_with_warmup

# Number of passes over the training data. The BERT authors recommend between 2 and 4;
# 4 is used here, which may over-fit the training data.
EPOCHS = 4

# One scheduler step is taken per batch, so the schedule length is
# [number of epochs] x [number of batches] (batches, not individual samples).
total_steps = EPOCHS * len(train_dataloader)

# Linear learning-rate schedule with zero warm-up steps (the default value in run_glue.py).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [9]:
# Take only the first batch for interactive inspection; `batch` stays None if the loader yields nothing.
batch = next(iter(train_dataloader), None)

In [63]:
# Same length check as in an earlier cell (re-run at a later execution count).
len(train_dataloader)

228362

In [14]:
# Run the model on the first example of the inspected batch only.
# unsqueeze(0) adds the leading batch dimension without hard-coding the sequence
# length, so this also works for rows that are not exactly 64 tokens long.
inputrain = batch[0][0].unsqueeze(0)
att = batch[1][0].unsqueeze(0)
predictions = model(inputrain, attention_mask = att)


(tensor([[[-6.5316, -6.4824, -6.4827,  ..., -5.8874, -5.7394, -3.6863],
          [-9.2984, -8.9206, -9.0902,  ..., -9.7039, -9.8126, -7.3922],
          [-6.9126, -6.5310, -6.7128,  ..., -7.0007, -7.6280, -5.1045],
          ...,
          [-3.5252, -3.6193, -3.4169,  ..., -3.1680, -4.0851, -2.5623],
          [-3.3453, -3.4004, -3.2655,  ..., -2.8436, -3.9969, -2.1850],
          [-6.1991, -6.0576, -6.3188,  ..., -6.6437, -6.4148, -5.6771]]],
        grad_fn=<AddBackward0>),)

In [69]:
# NOTE(review): the recorded output (32) does not match the 1-tuple shown for the single-example
# call above; `predictions` was presumably rebound by another cell before this one ran -- confirm.
len(predictions)

32

In [27]:
# Index of the highest score in predictions[0] for example 0 at the position stored in batch[3][0][1]
# (presumably a vocabulary id at a masked position -- confirm against how batch[3] was built).
torch.argmax(predictions[0][0,batch[3][0][1]])

tensor(1011)

In [33]:
# Sanity check: the argmax computed in the previous cell equals 1011.
torch.tensor(1011)==torch.argmax(predictions[0][0,batch[3][0][1]])

tensor(True)

In [54]:
# Display the input ids that were passed to the model for the single-example run.
inputrain

tensor([[  101,  2012,  9808,  2232,  1024, 21358,  1010,   103,   103, 18582,
          1013,   103,  1010,  2531,  1003,   103,  1016,  2140,  8991,  1024,
           103, 11888,  3973,  5665,   103,  2931,  4688,  2871,  5445,  1999,
          2793,  2007, 10256, 16464, 17964,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]])

In [65]:
# Display batch[3]; the validation loop's comments describe this entry as the "mask ids for comparison".
batch[3]

tensor([[ 2, 17, 18,  ...,  0,  0,  0],
        [ 1, 13, 18,  ...,  0,  0,  0],
        [ 3, 11, 15,  ...,  0,  0,  0],
        ...,
        [ 1,  2,  3,  ...,  0,  0,  0],
        [ 2,  3,  4,  ...,  0,  0,  0],
        [12, 13,  0,  ...,  0,  0,  0]])

In [70]:
import time
import datetime

def format_time(elapsed):
    """Render a duration in seconds as the string form of a ``datetime.timedelta``.

    The value is rounded to the nearest whole second first, so the result
    looks like ``h:mm:ss`` (with a leading day count for very long durations).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def calc_accuracy(predictions, real_labels, mask_indices):
    """Fraction of masked positions whose arg-max prediction equals the reference id.

    Args:
        predictions: scores indexed as ``predictions[row, position]``; the arg-max over
            the remaining dimension is taken as the predicted id.
        real_labels: reference ids indexed as ``real_labels[row][position]``.
        mask_indices: per-row positions to score. Entries equal to 0 are treated as
            padding and skipped.

    Returns:
        ``score / total`` over all scored positions, or 0.0 when no position is scored.
    """
    score = 0
    total = 0
    for index, sent in enumerate(mask_indices):
        for mask in sent:
            position = int(mask)
            if position == 0:
                # 0 is the padding value in mask_indices, not a position to score.
                continue
            # Index by (row, position) so the arg-max runs over that position's scores.
            predicted_index = int(torch.argmax(predictions[index, position]))
            if predicted_index == int(real_labels[index][position]):
                score += 1
            total += 1
    # A batch without any masked position would otherwise divide by zero.
    return score / total if total else 0.0

In [None]:
#in general remember that there are some sentence where no masks exist
import random
import numpy as np
import time

# Seed every random number generator used here so runs are repeatable.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)


# One summary dict is appended per epoch.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, EPOCHS):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches (skipping the very first one).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` holds four tensors:
        #   [0]: input ids (with [MASK] tokens)
        #   [1]: attention masks
        #   [2]: real (unmasked) ids
        #   [3]: mask ids for comparison
        b_input_ids = batch[0]
        b_input_mask = batch[1]
        b_input_ids_real = batch[2]
        b_input_mask_ids = batch[3]
        
        model.zero_grad()        

        # The labels must be the real ids: using the masked inputs as labels would
        # train the model to reproduce the [MASK] tokens themselves.
        # NOTE(review): newer transformers releases name this argument `labels` -- confirm the installed version.
        loss, predictions = model(b_input_ids, 
                                  attention_mask=b_input_mask, 
                                  masked_lm_labels=b_input_ids_real)

        total_train_loss += loss.item()

        loss.backward()

        #stop exploding gradients problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_loss = 0
    nb_eval_steps = 0
    total_eval_accuracy = 0


    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Unpack this validation batch from our dataloader (no device transfer
        # happens in this version of the loop).
        #
        # `batch` contains four pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: real ids
        #   [3]: mask ids for comparison
        b_input_ids = batch[0]
        b_input_mask = batch[1]
        b_input_ids_real = batch[2]
        b_input_mask_ids = batch[3]
        
        
        # No gradients are needed for evaluation.
        with torch.no_grad():        

            # Score against the real ids, matching the training objective above.
            (loss, logits) = model(b_input_ids, 
                                   attention_mask=b_input_mask, 
                                   masked_lm_labels=b_input_ids_real)
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()   
        total_eval_accuracy += calc_accuracy(logits, b_input_ids_real, b_input_mask_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    
    
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [1]:
import os
import torch
from torch.utils.data import Dataset
import pandas as pd
from transformers import BertTokenizer, BertForMaskedLM
from nltk.tokenize import sent_tokenize
import nltk
import time
import datetime
import pickle
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
import random
from transformers import get_linear_schedule_with_warmup
import nltk
from nltk.translate.bleu_score import SmoothingFunction
# Progress marker: printed once all of the imports above have completed.
print('here')

here


In [3]:
path = "/global/cscratch1/sd/ajaybati/pickles/"

# Progress markers are printed before, between and after the two loads.
# NOTE(review): torch.load unpickles arbitrary objects -- only load files you trust.
print("passed")
train_dataloader = torch.load(f"{path}train_dataloader.pickle")
print("pass 2")
validation_dataloader = torch.load(f"{path}validation_dataloader.pickle")
print("done")

passed
pass 2
done


In [8]:
import sys
# From here on everything printed goes to a log file instead of the notebook output.
# NOTE(review): the handle is only closed at the very end of this cell, so an
# interrupted run leaves stdout redirected to the still-open file.
sys.stdout = open("model_output.txt", "w")


# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')



tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# Move the model to the device selected above; an unconditional .cuda() would
# fail on a host without CUDA even though a CPU fallback was chosen.
model.to(device)
print("done") 

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


# Number of training epochs. The BERT authors recommend between 2 and 4. 
# This run uses 2.
EPOCHS = 2

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)
print("done")



def getSent_pred(prediction, real_labels):
    """Decode predicted scores and reference ids into lists of tokens.

    Each row of ``real_labels`` is converted to tokens with the module-level
    ``tokenizer``; the tokens after the first one and before the first
    ``'[SEP]'`` are kept. Each row of ``prediction`` is reduced to one arg-max
    id per position, converted to tokens and cut to the same span as its
    reference row.

    Returns:
        (predicted_tokens, reference_tokens): two lists holding one token list per row.

    Raises:
        ValueError: if a reference row contains no ``'[SEP]'`` token.
    """
    reference_tokens = []
    sep_positions = []
    for label_row in real_labels:
        row_tokens = tokenizer.convert_ids_to_tokens(label_row)
        sep_at = row_tokens.index('[SEP]')
        sep_positions.append(sep_at)
        reference_tokens.append(row_tokens[1:sep_at])

    # One arg-max per position turns the score rows into predicted ids.
    predicted_ids = [[torch.argmax(scores) for scores in row] for row in prediction]

    predicted_tokens = [
        tokenizer.convert_ids_to_tokens(id_row)[1:sep_positions[row_no]]
        for row_no, id_row in enumerate(predicted_ids)
    ]
    return predicted_tokens, reference_tokens

def bleu(p,r):
    """Average sentence-level BLEU of predicted sentences against their references.

    Args:
        p: predicted sentences, each a list of tokens (the hypotheses).
        r: reference sentences, each a list of tokens, aligned with ``p``.

    Returns:
        The mean BLEU score over the paired sentences, or 0.0 when there is
        nothing to score.
    """
    pairs = list(zip(p, r))
    if not pairs:
        # Nothing to average; avoid dividing by zero.
        return 0.0

    smoothie = SmoothingFunction().method5
    # sentence_bleu expects (list_of_references, hypothesis): the single reference
    # is wrapped in a list and the prediction is passed as the hypothesis.
    bleu_list = [
        nltk.translate.bleu_score.sentence_bleu([reference], hypothesis, smoothing_function=smoothie)
        for hypothesis, reference in pairs
    ]
    return sum(bleu_list) / len(bleu_list)

def format_time(elapsed):
    """Format a number of seconds as ``str(datetime.timedelta)`` after rounding to whole seconds."""
    return str(datetime.timedelta(seconds=int(round(elapsed))))

def calc_accuracy(prediction, real_labels, mask_indices):
    """Score the masked-token predictions of one batch.

    Args:
        prediction: scores indexed as ``prediction[row, position]``; the arg-max over
            the remaining dimension is taken as the predicted id.
        real_labels: reference ids indexed as ``real_labels[row][position]``.
        mask_indices: per-row positions to score. Entries equal to 0 are treated as
            padding and skipped, so rows without any masked position contribute nothing.

    Returns:
        (accuracy, bscore): ``accuracy`` is the fraction of scored positions whose
        predicted id equals the reference id (0.0 when no position is scored);
        ``bscore`` is the value returned by ``bleu`` for the decoded sentences, or the
        string "Unfortunately, not possible" when it cannot be computed.
    """
    score = 0
    total = 0
    for step, sent in enumerate(mask_indices):
        for mask in sent:
            position = int(mask)
            if position == 0:
                # Padding entry, not a position to score.
                continue
            predicted_index = int(torch.argmax(prediction[step, position]))
            actual = int(real_labels[step][position])
            if predicted_index == actual:
                score += 1
            total += 1

    # A batch without any masked position would otherwise divide by zero.
    accuracy = score / total if total else 0.0

    # BLEU is a best-effort diagnostic: decode the batch that was passed in (not a
    # global left over from the training loop) and fall back to a message on failure.
    try:
        p, r = getSent_pred(prediction, real_labels)
        bscore = bleu(p, r)
    except Exception:
        bscore = "Unfortunately, not possible"
    return accuracy, bscore
# Progress marker: printed once the helper functions above have been defined.
print("done")


# ==========================================================================================

#in general remember that there are some sentence where no masks exist
seed_val = 42

random.seed(seed_val)
torch.manual_seed(seed_val)


# One summary dict is appended per epoch and pickled at the end.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

print("starting...")
# For each epoch...
for epoch_i in range(0, EPOCHS):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        
        # Copy the four batch tensors to the selected device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_input_ids_real = batch[2].to(device)
        b_input_mask_ids = batch[3].to(device)
        
        model.zero_grad()        

        # NOTE(review): newer transformers releases name this argument `labels` -- confirm the installed version.
        loss, predictions = model(b_input_ids, 
                                  attention_mask=b_input_mask, 
                                  masked_lm_labels=b_input_ids_real)
            

        total_train_loss += loss.item()
        
        # Every 40 batches (skipping the first) log the loss plus accuracy/BLEU for the current batch.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            print("*"*50)
            print(loss)
            print("*"*50)
            acc, bscore = calc_accuracy(predictions, b_input_ids_real, b_input_mask_ids)
            print("accuracy: ", acc, "bleu: ", bscore)
            print("="*100)

        loss.backward()

        
        #stop exploding gradients problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()
        
        

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_loss = 0
    nb_eval_steps = 0
    total_eval_accuracy = 0
    # Stays None if the validation loader yields no batch, so the stats dict below can still be built.
    bleuscore = None


    # Evaluate data for one epoch
    for step,batch in enumerate(validation_dataloader):
        
        # Unpack this validation batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the selected
        # device using the `to` method.
        #
        # `batch` contains four pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: real ids
        #   [3]: mask ids for comparison
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_input_ids_real = batch[2].to(device)
        b_input_mask_ids = batch[3].to(device)
        
        
        # No gradients are needed for evaluation.
        with torch.no_grad():        

            # Score against the real ids, the same labels the training step uses;
            # the masked inputs are not the targets.
            (loss, logits) = model(b_input_ids, 
                                   attention_mask=b_input_mask, 
                                   masked_lm_labels=b_input_ids_real)
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()
        accuracy, bleuscore = calc_accuracy(logits, b_input_ids_real, b_input_mask_ids)
        total_eval_accuracy += accuracy

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    
    
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            # The averaged validation accuracy computed above.
            'Avg Accuracy': avg_val_accuracy,
            # BLEU of the last validation batch only (not an average).
            'Latest Bleu Score': bleuscore,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(path)
tokenizer.save_pretrained(path)

with open(path+'training_stats.pickle', 'wb') as f:
    pickle.dump(training_stats, f)
print("Completely Done!!!")

# Close the log file, then point sys.stdout back at the interpreter's original stream
# so later prints do not hit a closed file.
# NOTE(review): under a Jupyter kernel sys.__stdout__ may not be the cell output stream;
# saving the previous sys.stdout before redirecting would be more precise.
sys.stdout.close()
sys.stdout = sys.__stdout__

- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 