# Next Instruction Prediction Training


In [1]:
import torch

torch.cuda.is_available()

  from .autonotebook import tqdm as notebook_tqdm


True

# DATASET GENERATION

In [2]:
#!/usr/bin/env python3

import sys,os
from elftools.elf.elffile import ELFFile
from elftools.elf.segments import Segment
from capstone import *
from capstone.x86 import *





data_dir_path = "./data/binaries/"
dir_file_list = os.listdir(data_dir_path)

with open('./data/instruction_clusters.txt', 'w') as data_file:
    for filename in dir_file_list:
        filePath = os.path.join(data_dir_path,filename)

        fh = open(filePath, 'rb')
        bin_bytearray = bytearray(fh.read())
        
        with open(filePath, 'rb') as f:
            elf = ELFFile(f)
            dwarfinfo = elf.get_dwarf_info()
            aranges = dwarfinfo.get_aranges()
            print(filename, len(aranges.entries))
            for arange in aranges.entries:

                entry = arange.begin_addr
                exit  = arange.begin_addr + arange.length
                ops = bin_bytearray[entry: exit]

                md = Cs(CS_ARCH_X86, CS_MODE_64)
                md.detail = True
                for inst in md.disasm(ops, entry):

                    data_file.write(inst.mnemonic+" "+inst.op_str+";")
                data_file.write('\n')






AttributeError: 'NoneType' object has no attribute 'entries'

# Creating the pipeline

In [None]:
from transformers import BertTokenizer, BertForNextSentencePrediction,BertForPreTraining
import torch

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer = BertTokenizer.from_pretrained("./binary-tokenizer")
# model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')


In [None]:
delim = ';'
with open('./data/instruction_clusters.txt', 'r') as fp:
    text = fp.read().split('\n')

In [None]:
text = text[:12]

We need to split sentences into consecutive, and non-consecutive sequences.

We have to deal with edge-cases too - for example where there is only a single sentence within a paragraph as with the three examples above (in comparison to below where we can easily split into multiple sentences).

In [None]:
# text[51].split(delim)

We'll assign a 50% probability of using the genuine next sentence, and 50% probability of using another random sentence.

To make this simpler, we'll create a *'bag'* of individual sentences to pull from when selecting a random sentence B.

In [None]:
bag = [instruction for instruction_cluster in text for instruction in instruction_cluster.split(delim)  if instruction!= '']
bag_size = len(bag)
print(bag_size)

In [None]:
bag

And now we create our 50/50 NIP training data.

In [None]:
import random

history = []
next_instruction = []
label = []

page_len = 5
instruction_pages = []
for instruction_cluster in text:
    instructions = [
        instruction for instruction in instruction_cluster.split(delim) if instruction != ''
    ]
    if len(instructions)>page_len:
        
        for i in range(0,len(instructions),page_len):
            instruction_pages.append(instructions[i:i+page_len])
        
print(len(instruction_pages))
print(instruction_pages[0])

for instruction_page in instruction_pages:
    
#     instructions = [
#         instruction for instruction in instruction_page.split(';') if instruction != ''
#     ]
    
    
#     num_instructions = len(instruction_page)
    
    

#     start = random.randint(0, num_instructions-2)
    # 50/50 whether is IsNextSentence or NotNextSentence
    if random.random() >= 0.5:
        # this is IsNextSentence
        history.append(delim.join(instruction_page[:-1]))
        next_instruction.append(instruction_page[-1])
        label.append(0)
    else:
        index = random.randint(0, bag_size-1)
        # this is NotNextSentence
        history.append(delim.join(instruction_page[:-1]))
        next_instruction.append(bag[index])
        label.append(1)

In [None]:
print(len(label))
for i in range(3):
    print(label[i])
    print('->',history[i] , '\n')
    print('# ',next_instruction[i] , '\n')

Our data is now ready for tokenization, this time we truncate/pad each token to the same length of *512* tokens.

In [None]:
inputs = tokenizer(history, next_instruction, return_tensors='pt', 
                   max_length=64, truncation=True, padding='max_length')

In [None]:
inputs.keys()

We can see that the *token_type_ids* tensors have been built correctly (eg **1** indicating sentence B tokens) by checking the first instance of *token_type_ids*:

In [None]:
inputs.token_type_ids[0]

The **0** tokens following our sentence B tokens correspond to *PAD* tokens.

Alongside this, we need to create a *labels* tensor too - which corresponds to the values contained within our `label` variable. Our *labels* tensor must be a *LongTensor*, and we will need to transpose the tensor so that it matches our other tensors' dimensionality.

In [None]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T


And the labels tensor is simply a clone of the input_ids tensor before masking.

In [None]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [None]:
inputs.labels[0]

Now we mask tokens in the input_ids tensor using the 15% probability for MLM - ensuring we don't mask CLS, SEP, or PAD tokens.

In [None]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [None]:
mask_arr[0]
# inputs.input_ids.shape[0]

And now take the indices of each True value within each vector.

In [None]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [None]:
selection[0]

Then apply these indices to each row in input_ids, assigning each value at these indices a value of 103.

In [None]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [None]:
inputs.keys()

The `inputs` tensors are now ready, and we can begin building the model input pipeline for training. We first create a PyTorch dataset from our data.

In [None]:
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

Initialize our data using the `MeditationDataset` class.

In [None]:
dataset = MeditationsDataset(inputs)

In [None]:
train_size = int(0.8 * len(dataset))
validation_size = len(dataset) - train_size

train_dataset, validation_dataset = torch.utils.data.random_split(dataset, [train_size, validation_size])

And initialize the dataloader, which we'll be using to load our data into the model during training.

In [None]:
BATCH_SIZE = 8
train_loader      = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# 

Now we can move onto setting up the training loop. First we setup GPU/CPU usage.

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

Activate the training mode of our model, and initialize our optimizer (Adam with weighted decay - reduces chance of overfitting).

In [None]:
from sklearn.metrics import precision_recall_fscore_support , accuracy_score
import numpy as np


Now we can move onto the training loop, we'll train for a couple of epochs (change `epochs` to modify this).

In [None]:
# odict_keys(['loss', 'prediction_logits', 'seq_relationship_logits'])

In [None]:

from numpy import *
import math
import matplotlib.pyplot as plt


def plot_graph(training_data, validation_data , label ):

    font_size = 10
    x_labels = [ i for i in range(len(training_data)) ]

    plt.ylabel(' F1 ',fontsize=font_size)

    
    plt.plot(x_labels, training_data , 'r') 
    plt.plot(x_labels, validation_data , 'b') 
    
    plt.xlabel("Epoch", fontsize=font_size)
    
    plt.title(label,fontsize=font_size)
    plt.legend(['Training', 'Validation'], loc='upper left')
    
    plt.savefig('./output/'+label+'.pdf')
    plt.show()



In [None]:
from transformers import AdamW
from tqdm import tqdm  # for our progress bar


# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-6)



epochs = 1000000
counter = 0

global_instruction_metrices = []
global_token_metrices = []

v_global_instruction_metrices = []
v_global_token_metrices = []


for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    train_loop = tqdm(train_loader, leave=True)
    
    
    instruction_predictions_all, instruction_ground_truths_all = None, None
    token_predictions_all, token_ground_truths_all = None, None
    
    # activate training mode
    model.train()
    for N,batch in enumerate(train_loop):

        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)


        token_prediction = torch.argmax(outputs.prediction_logits, axis=-1)

        
        token_prediction = token_prediction.detach().cpu().numpy().flatten()
        token_ground_truth = labels.detach().cpu().numpy().flatten()
        
        
        instruction_prediction = torch.argmax(outputs.seq_relationship_logits, axis=-1)
        instruction_prediction   = instruction_prediction.detach().cpu().numpy().flatten()
        instruction_ground_truth = next_sentence_label.detach().cpu().numpy().flatten()
        
        if N==0:
            instruction_predictions_all   = instruction_prediction
            instruction_ground_truths_all = instruction_ground_truth
            
            token_predictions_all         = token_prediction
            token_ground_truths_all       = token_ground_truth  
        else:
            instruction_predictions_all   = np.concatenate((instruction_predictions_all, instruction_prediction))
            instruction_ground_truths_all = np.concatenate((instruction_ground_truths_all, instruction_ground_truth))
            token_predictions_all   = np.concatenate((token_predictions_all, token_prediction))
            token_ground_truths_all = np.concatenate((token_ground_truths_all, token_ground_truth))
            

        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        train_loop.set_description(f'Epoch {epoch}')
        train_loop.set_postfix(loss=loss.item())
    
    
    instruction_accuracy = (accuracy_score(instruction_ground_truths_all,instruction_predictions_all))
    instruction_precision, instruction_recall, instruction_f1, _ = precision_recall_fscore_support(instruction_ground_truths_all,instruction_predictions_all, average='binary')
    
    token_accuracy = (accuracy_score(token_ground_truths_all, token_predictions_all))
    token_precision, token_recall, token_f1, _ = precision_recall_fscore_support(token_ground_truths_all,token_predictions_all,average='weighted')
    
    print("Training: ",  ' Instruction f1: ', instruction_f1 , '   Token f1',token_f1)
    global_instruction_metrices.append(instruction_f1)
    global_token_metrices.append( token_f1) 

    ###########################################
    ###############  EVAL Validation  #########
    ###########################################

    with torch.no_grad():
        model.eval()
#         v_predictions_all, v_ground_truths_all = None, None
        
        v_instruction_predictions_all, v_instruction_ground_truths_all = None, None
        v_token_predictions_all, v_token_ground_truths_all = None, None
    
    
        validation_loop = tqdm(validation_loader, leave=True)
        for N,v_batch in enumerate(validation_loop):
            
            
            
            v_input_ids = v_batch['input_ids'].to(device)
            v_token_type_ids = v_batch['token_type_ids'].to(device)
            v_attention_mask = v_batch['attention_mask'].to(device)
            v_next_sentence_label = v_batch['next_sentence_label'].to(device)
            v_labels = v_batch['labels'].to(device)
            # process
            v_outputs = model(v_input_ids, attention_mask=v_attention_mask,
                            token_type_ids=v_token_type_ids,
                            next_sentence_label=v_next_sentence_label,
                            labels=v_labels)
        
            
            
            v_token_prediction = torch.argmax(v_outputs.prediction_logits, axis=-1)
            v_token_prediction = v_token_prediction.detach().cpu().numpy().flatten()
            v_token_ground_truth = v_labels.detach().cpu().numpy().flatten()
            v_instruction_prediction = torch.argmax(v_outputs.seq_relationship_logits, axis=-1)
            v_instruction_prediction   = v_instruction_prediction.detach().cpu().numpy().flatten()
            v_instruction_ground_truth = v_next_sentence_label.detach().cpu().numpy().flatten()



        

            if N==0:
                v_instruction_predictions_all   = v_instruction_prediction
                v_instruction_ground_truths_all = v_instruction_ground_truth

                v_token_predictions_all         = v_token_prediction
                v_token_ground_truths_all       = v_token_ground_truth  
            else:
                v_instruction_predictions_all   = np.concatenate((v_instruction_predictions_all, v_instruction_prediction))
                v_instruction_ground_truths_all = np.concatenate((v_instruction_ground_truths_all, v_instruction_ground_truth))
                v_token_predictions_all   = np.concatenate((v_token_predictions_all, v_token_prediction))
                v_token_ground_truths_all = np.concatenate((v_token_ground_truths_all, v_token_ground_truth))
            

        v_instruction_accuracy = (accuracy_score(v_instruction_ground_truths_all,v_instruction_predictions_all))
        v_instruction_precision, v_instruction_recall, v_instruction_f1, _ = precision_recall_fscore_support(v_instruction_ground_truths_all,v_instruction_predictions_all, average='binary')
        
        v_token_accuracy = (accuracy_score(v_token_ground_truths_all, v_token_predictions_all))
        v_token_precision, v_token_recall, v_token_f1, _ = precision_recall_fscore_support(v_token_ground_truths_all,v_token_predictions_all,average='weighted')
    
        print("Validation: ", "Instruction F1: ", v_instruction_f1,  "   Token F1: ",v_token_f1)
        
        v_global_instruction_metrices.append(v_instruction_f1)
        v_global_token_metrices.append(v_token_f1) 
#   
    
    plot_graph(global_instruction_metrices, v_global_instruction_metrices, 'Next Sentence Prediction Scores')
    plot_graph(global_token_metrices, v_global_token_metrices, 'Masked Token Prediction Scores')

In [None]:
# # Save the trained model weights
# training_model.save_weights("weights/wghts" + str(epoch + 1) + ".ckpt")