In [None]:
# Import Libraries

import numpy as np 
import pandas as pd
import os
from torchsummary import summary
import sys
import torch
from time import time
import torch.nn as nn
import torch.optim as optim
from torch.utils import data
from torch.autograd import Variable
import transformers
import random
import pickle
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the pickled train and test splits from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.

def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)

trainset = _read_pickle('train.pkl')
testset = _read_pickle('test.pkl')

In [None]:
# `indexes` is a list of lists: entry k is the 'indexes' field (the sentence
# order) of the k-th example in the training set.
indexes = [instance['indexes'] for instance in trainset]

In [None]:
# Load the pretrained BERT tokenizer. The uncased base checkpoint is used
# (per the author, bert-large would not load on their machine), so the input
# text is lower-cased before tokenization.

from transformers import BertTokenizer, RobertaTokenizer

print('Loading BERT tokenizer...')

# `tokenizer` is a notebook-level global read by the tokenization functions below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Create Tokenization for the training set sentences that can be then fed as input to BERT

def bert_tokenization(dataset, max_length=100, num_sentences=6):
  """Encode every ordered sentence pair of each training instance for BERT.

  For each instance, all pairs (i, j) with i < j among the first
  `num_sentences` sentences are encoded with the notebook-level `tokenizer`,
  padded/truncated to `max_length` tokens. With the default of 6 sentences
  this yields 15 pairs per instance.

  Args:
    dataset: iterable of dicts with keys 'sentences' (sequence of strings)
      and 'indexes' (position of each sentence in the true order).
    max_length: fixed token length every encoded pair is padded/truncated to.
    num_sentences: number of sentences per instance to pair up.

  Returns:
    Tuple `(input_ids, attention_masks, token_type_ids, labels)`. Each is a
    list with one entry per instance; the first three hold the per-pair
    tokenizer outputs, and `labels` holds 1 when sentence i precedes
    sentence j in the true order and 0 otherwise.
  """
  input_ids = []
  labels = []
  attention_masks = []
  token_type_ids = []

  for counter, instance in enumerate(dataset):
    # Coarse progress indicator for long tokenization runs.
    if counter % 1000 == 0: print('current step is =: ', counter)

    indexes = instance['indexes']
    sentences = instance['sentences']
    inputs_set = []
    labels_set = []
    attention_mask_set = []
    token_type_id_set = []
    for i in range(num_sentences):
      for j in range(i + 1, num_sentences):
        encoded_sentences = tokenizer.encode_plus(
            sentences[i], sentences[j],
            max_length=max_length,
            truncation=True,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            return_attention_mask=True,
            return_token_type_ids=True,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length',
            return_tensors='pt')

        inputs_set.append(encoded_sentences['input_ids'])
        attention_mask_set.append(encoded_sentences['attention_mask'])
        token_type_id_set.append(encoded_sentences['token_type_ids'])
        # Label 1: the first sentence of the pair comes earlier in the true order.
        labels_set.append(1 if indexes[i] < indexes[j] else 0)
    input_ids.append(inputs_set)
    labels.append(labels_set)
    attention_masks.append(attention_mask_set)
    token_type_ids.append(token_type_id_set)
  return input_ids, attention_masks, token_type_ids, labels

In [None]:
# Use at most 60000 instances of train.pkl for training. The count is clamped
# to the dataset length so that the `n_size * 15` reshapes further down stay
# valid even when the training file holds fewer instances.
n_size = min(60000, len(trainset))

# Per-instance lists of encoded sentence pairs plus their ordering labels.
input_ids, attention_masks, token_type_ids, labels = bert_tokenization(trainset[0:n_size])

In [None]:
# Convert the nested Python lists returned by bert_tokenization into dense
# tensors. Each instance holds 15 per-pair encodings (expected shape (1, 100)
# each); concatenating them gives one (15, 100) block per instance, and
# stacking the blocks gives a (num_instances, 15, 100) tensor.
# This replaces an element-by-element copy loop and a preceding loop whose
# `torch.cat` results were discarded.

# input ids -> (num_instances, 15, 100), int32
input_ids = torch.stack(
    [torch.cat(pairs, dim=0).to(torch.int32) for pairs in input_ids])

# labels -> (num_instances, 15), int64 (the dtype the loss expects for targets)
labels = torch.tensor(labels, dtype=torch.int64)

# attention masks -> (num_instances, 15, 100), int32
attention_masks = torch.stack(
    [torch.cat(pairs, dim=0).to(torch.int32) for pairs in attention_masks])

# token type ids -> (num_instances, 15, 100), int32
token_type_ids = torch.stack(
    [torch.cat(pairs, dim=0).to(torch.int32) for pairs in token_type_ids])

In [None]:
# Flatten the (instance, pair) axes so that every row is one sentence pair:
# (n_size, 15, 100) -> (n_size * 15, 100), and labels (n_size, 15) -> (n_size * 15,).

input_ids = torch.reshape(input_ids, (n_size * 15, 100))
attention_masks = torch.reshape(attention_masks, (n_size * 15, 100))
token_type_ids = torch.reshape(token_type_ids, (n_size * 15, 100))
labels = torch.reshape(labels, (n_size * 15,))

In [None]:
# Sanity check: with n_size = 60000 there are 900000 sentence pairs,
# each encoded as 100 tokens.

for flat_tensor in (input_ids, attention_masks, token_type_ids, labels):
    print(flat_tensor.shape)

In [None]:
# Split 90% / 10% into training and validation. Sizes are computed in whole
# instances first and then scaled by the 15 pairs per instance, so the pairs
# of one instance never straddle the two splits.

set_train_size = int(0.9 * n_size)
train_size, val_size = set_train_size * 15, (n_size - set_train_size) * 15

# Instance counts, then pair counts.
print(train_size/15, val_size/15, train_size/15 + val_size/15)
print(train_size, val_size, train_size + val_size)

In [None]:
# Wrap the first `train_size` rows of every tensor as the training dataset
# and the remaining rows as the validation dataset.

pair_tensors = (input_ids, attention_masks, token_type_ids, labels)
train_dataset = TensorDataset(*[t[:train_size] for t in pair_tensors])
val_dataset = TensorDataset(*[t[train_size:] for t in pair_tensors])
len(train_dataset), len(val_dataset)

In [None]:
# This step performs BERT tokenization for the test dataset 

def bert_tokenization2(dataset, max_length=100, num_sentences=6):
  """Encode every ordered sentence pair of each (unlabelled) test instance.

  For each instance, all pairs (i, j) with i < j among the first
  `num_sentences` sentences are encoded with the notebook-level `tokenizer`,
  padded/truncated to `max_length` tokens. No labels are produced.

  Args:
    dataset: iterable of dicts with a 'sentences' key (sequence of strings).
    max_length: fixed token length every encoded pair is padded/truncated to.
    num_sentences: number of sentences per instance to pair up.

  Returns:
    Tuple `(input_ids, attention_masks, token_type_ids)`; each is a list with
    one entry per instance holding the per-pair tokenizer outputs.
  """
  input_ids = []
  attention_masks = []
  token_type_ids = []

  for counter, instance in enumerate(dataset):
    # Coarse progress indicator for long tokenization runs.
    if counter % 1000 == 0: print('current step is =: ', counter)

    sentences = instance['sentences']
    inputs_set = []
    attention_mask_set = []
    token_type_id_set = []
    for i in range(num_sentences):
      for j in range(i + 1, num_sentences):
        encoded_sentences = tokenizer.encode_plus(
            sentences[i], sentences[j],
            max_length=max_length,
            truncation=True,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            return_attention_mask=True,
            return_token_type_ids=True,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length',
            return_tensors='pt')

        inputs_set.append(encoded_sentences['input_ids'])
        attention_mask_set.append(encoded_sentences['attention_mask'])
        token_type_id_set.append(encoded_sentences['token_type_ids'])

    input_ids.append(inputs_set)
    attention_masks.append(attention_mask_set)
    token_type_ids.append(token_type_id_set)
  return input_ids, attention_masks, token_type_ids

In [None]:
# Tokenize every sentence pair of the test set. The results are stored in
# separate `*1` variables so the training tensors are not overwritten.
input_ids1, attention_masks1, token_type_ids1 = bert_tokenization2(testset)

In [None]:
# Convert the nested lists returned by bert_tokenization2 into dense tensors.
# Each instance holds 15 per-pair encodings (expected shape (1, 100) each);
# concatenating them gives one (15, 100) block per instance, and stacking the
# blocks gives a (num_instances, 15, 100) tensor. This replaces an
# element-by-element copy loop and a loop whose `torch.cat` results were discarded.

# input ids -> (num_instances, 15, 100), int32
input_ids1 = torch.stack(
    [torch.cat(pairs, dim=0).to(torch.int32) for pairs in input_ids1])
print(input_ids1.shape)

# attention masks -> (num_instances, 15, 100), int32
attention_masks1 = torch.stack(
    [torch.cat(pairs, dim=0).to(torch.int32) for pairs in attention_masks1])
print(attention_masks1.shape)

# token type ids -> (num_instances, 15, 100), int32
token_type_ids1 = torch.stack(
    [torch.cat(pairs, dim=0).to(torch.int32) for pairs in token_type_ids1])
print(token_type_ids1.shape)

In [None]:
# Flatten the (instance, pair) axes so that every row is one sentence pair:
# (num_instances, 15, 100) -> (num_instances * 15, 100). The row count is
# inferred (-1) instead of hard-coding 2000 instances, so the cell works for
# any test-set size (30000 rows for the 2000-instance test file).

input_ids1 = input_ids1.reshape(-1, 100)
attention_masks1 = attention_masks1.reshape(-1, 100)
token_type_ids1 = token_type_ids1.reshape(-1, 100)

In [None]:
# Bundle the flattened test tensors into a dataset. There is no labels tensor,
# so each item is an (input_ids, attention_mask, token_type_ids) triple.

test_dataset = TensorDataset(input_ids1, attention_masks1, token_type_ids1)

In [None]:
# DataLoaders for the training and validation sets.
# For fine-tuning BERT the authors recommend a batch size of 16 or 32.
batch_size = 16

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)

# Order is irrelevant for validation, so the samples are read sequentially.
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, sampler=val_sampler)

In [None]:
# The pair-ordering task is modelled with BertForNextSentencePrediction.

from transformers import BertForNextSentencePrediction, AdamW, BertConfig

model = BertForNextSentencePrediction.from_pretrained(
    "bert-base-uncased",  # 12-layer BERT model with an uncased vocab.
    # No `num_labels` here: the next-sentence-prediction head is always binary.
    output_attentions = False,     # Do not return attention weights.
    output_hidden_states = False,  # Do not return all hidden states.
)

# Move the model to the GPU when one is available and fall back to the CPU
# otherwise, matching how `device` is chosen for the batches later on
# (an unconditional `model.cuda()` fails on CPU-only machines).
model.to(torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))

In [None]:
# AdamW here is the Hugging Face implementation (imported from transformers),
# not the PyTorch one; the 'W' refers to its weight-decay handling.
# lr: the library default is 5e-5, this notebook uses 2e-5.
# eps: the library default of 1e-8 is kept.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of fine-tuning epochs. The BERT authors recommend between 2 and 4;
# this notebook trains for a single epoch.
epochs = 1

# The scheduler is stepped once per batch, so the total number of steps is
# [number of batches] x [number of epochs] (not the number of samples).
total_steps = epochs * len(train_dataloader)

# Linear learning-rate decay without warm-up (the run_glue.py default).
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    `preds` is a 2-D array of per-class scores (one row per sample) and
    `labels` an array holding the true class of each sample.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = np.sum(np.equal(predicted, expected))
    return hits / len(expected)

In [None]:
def format_time(elapsed):
    """Format a duration given in seconds as an `h:mm:ss` string.

    The duration is rounded to the nearest whole second first.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Run on the first GPU when CUDA is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
print(device)

In [None]:
# Fine-tuning loop, based on the Hugging Face `run_glue.py` example:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Seed every random number generator in play to make the run reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dict per epoch: training/validation loss, validation accuracy, timings.
training_stats = []

# Measure the total time of the whole run.
total_t0 = time.time()

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Running sum of the per-batch losses of this epoch.
    total_train_loss = 0

    # Switch to training mode (enables dropout); this does not train by itself.
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` holds four tensors, each copied to the compute device:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: token type (segment) ids
        #   [3]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_input_tokentypeids = batch[2].to(device)
        b_labels = batch[3].to(device)

        # Gradients accumulate across backward passes, so clear them first.
        model.zero_grad()

        # Forward pass. Because labels are supplied, the returned object
        # carries the loss alongside the logits.
        result = model(b_input_ids,
                       token_type_ids=b_input_tokentypeids,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss

        # `.item()` extracts the Python float from the single-value loss tensor.
        total_train_loss += loss.item()

        # Backward pass to compute the gradients.
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Average loss over all training batches of this epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure performance on the validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode: dropout layers are disabled.
    model.eval()

    # Running sums over the validation batches.
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:

        # Same four-tensor layout as the training batches:
        # input ids, attention masks, token type ids, labels.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_input_tokentypeids = batch[2].to(device)
        b_labels = batch[3].to(device)

        # No autograd graph is needed for evaluation.
        with torch.no_grad():
            result = model(b_input_ids,
                           token_type_ids=b_input_tokentypeids,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        # The logits are the raw model outputs before any softmax.
        loss = result.loss
        logits = result.logits

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to the CPU as numpy arrays.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate the per-batch accuracy.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Mean of the per-batch accuracies for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Average loss over all validation batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# Display floats with two decimal places. The fully qualified option name is
# required: the short form 'precision' matches more than one option on recent
# pandas versions and raises an OptionError.
pd.set_option('display.precision', 2)

# Build a DataFrame from the per-epoch statistics, indexed by epoch number.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

# Display the table.
df_stats

In [None]:
# Seaborn dark-grid styling with a larger font, then a larger figure.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
# One tick per epoch that was actually trained (previously fixed at 1-4).
plt.xticks(list(df_stats.index))

plt.show()

In [None]:
# DataLoader over the test samples. They are read sequentially so that the
# predictions stay aligned with the order of the test instances.
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, sampler=test_sampler)

In [None]:
# ========================================
#               Test
# ========================================
# Predict the pairwise order label for every sentence pair of the test set.

print("")
print("Running Validation on toy set...")

# Evaluation mode: dropout layers are disabled.
model.eval()

# Per-batch arrays of predicted labels, concatenated once after the loop
# (appending to a numpy array inside the loop copies it on every batch).
batch_predictions = []
for batch in test_dataloader:

    # `batch` holds three tensors, each copied to the compute device:
    #   [0]: input ids
    #   [1]: attention masks
    #   [2]: token type (segment) ids
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids':      batch[0],
              'attention_mask': batch[1],
              'token_type_ids': batch[2]
             }

    # No autograd graph is needed for inference.
    with torch.no_grad():
        # `return_dict=True` (as in the training loop) guarantees that
        # `.logits` is available even where tuple outputs are the default.
        result = model(**inputs, return_dict=True)

    # No labels are passed, so only the logits are of interest here.
    logits = result.logits.detach().cpu().numpy()

    # Predicted class = arg-max over the two logits of each pair.
    batch_predictions.append(np.argmax(logits, axis=1).flatten())

# Flat array of all predictions. It is kept as float, as the previous
# np.append-based accumulation produced, so downstream consumers see the same dtype.
if batch_predictions:
    label = np.concatenate(batch_predictions).astype(np.float64)
else:
    label = np.array([])

# One row of 15 pair predictions per test instance; the number of instances
# is inferred instead of being fixed at 2000.
test_label = label.reshape(-1, 15)

In [None]:
def topSort(data):
  """Turn pairwise order predictions into a sentence ordering per instance.

  For every row of `data` (15 pair predictions over 6 sentences) a `Graph(6)`
  is built with one edge per pair: a prediction of 1 adds the edge
  (first, second), a prediction of 0 adds the reverse edge, where
  (first, second) comes from `get_pos(pair_index)`. The result for each row
  is a 6-element list giving, for each sentence, its position in the
  topological order of that graph.

  `Graph` and `get_pos` are defined outside this cell. The loop on
  `isCyclic()` presumably relies on that method breaking a cycle each time
  it is called -- TODO confirm against the Graph implementation.
  """
  orders = []
  for pair_preds in data:
    graph = Graph(6)
    for pair_idx in range(15):
      pred = pair_preds[pair_idx]
      first, second = get_pos(pair_idx)

      if pred == 1:
        graph.addEdge(first, second)
      elif pred == 0:
        graph.addEdge(second, first)

    # Keep the original call pattern exactly (two calls per iteration).
    while graph.isCyclic():
      graph.isCyclic()

    ranking = graph.topologicalSort()
    # Position of each sentence 0..5 within the sorted order.
    orders.append([ranking.index(sentence) for sentence in range(6)])
  return orders

In [None]:
# One predicted ordering (a list of 6 positions) per row of `test_label`.
sentence_order = topSort(test_label)

In [None]:
# Build the submission table: an ID column (1..2000) followed by the six
# columns index1..index6, filled with placeholder values that are
# overwritten with the predicted positions afterwards.
k = {'ID': range(1, 2001)}
k.update({'index{}'.format(position): range(2000) for position in range(1, 7)})

df = pd.DataFrame(data=k)

In [None]:
# Write the predicted positions into the columns index1..index6 with a single
# block assignment instead of 12,000 scalar `.iloc` writes.
order_matrix = np.asarray(sentence_order)
# Fail loudly if the predictions do not line up with the submission table.
assert order_matrix.shape == (len(df), 6), \
    "expected one 6-element ordering per submission row"
df.iloc[:, 1:7] = order_matrix

# NOTE(review): placeholder absolute path from the author's machine -- edit
# it to the directory where the sentence_order file should be stored.
df.to_csv("/home/bharat/Desktop/......./sentence_order",index=False) # Directory to store the sentence_order file