# Next Instruction Prediction Training


In [1]:
import torch

torch.cuda.is_available()

  from .autonotebook import tqdm as notebook_tqdm


True

# DATASET GENERATION

In [2]:
#!/usr/bin/env python3

import sys,os
from elftools.elf.elffile import ELFFile
from elftools.elf.segments import Segment
from capstone import *
from capstone.x86 import *





data_dir_path = "./data/binaries/"
dir_file_list = os.listdir(data_dir_path)

with open('./data/instruction_clusters.txt', 'w') as data_file:
    for filename in dir_file_list:
        filePath = os.path.join(data_dir_path,filename)

        fh = open(filePath, 'rb')
        bin_bytearray = bytearray(fh.read())
        
        with open(filePath, 'rb') as f:
            elf = ELFFile(f)
            dwarfinfo = elf.get_dwarf_info()
            aranges = dwarfinfo.get_aranges()
            print(filename, len(aranges.entries))
            for arange in aranges.entries:

                entry = arange.begin_addr
                exit  = arange.begin_addr + arange.length
                ops = bin_bytearray[entry: exit]

                md = Cs(CS_ARCH_X86, CS_MODE_64)
                md.detail = True
                for inst in md.disasm(ops, entry):

                    data_file.write(inst.mnemonic+" "+inst.op_str+";")
                data_file.write('\n')






gitwipe 4
gitkeys 4
gitview 140
gitps 147
gitfm 341
gitwhich 6


# Creating the pipeline

In [3]:
from transformers import BertTokenizer, BertForNextSentencePrediction
import torch

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer = BertTokenizer.from_pretrained("./binary-tokenizer")
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
delim = ';'
with open('./data/instruction_clusters.txt', 'r') as fp:
    text = fp.read().split('\n')

In [5]:
text = text[:12]

We need to split sentences into consecutive, and non-consecutive sequences.

We have to deal with edge-cases too - for example where there is only a single sentence within a paragraph as with the three examples above (in comparison to below where we can easily split into multiple sentences).

In [6]:
# text[51].split(delim)

We'll assign a 50% probability of using the genuine next sentence, and 50% probability of using another random sentence.

To make this simpler, we'll create a *'bag'* of individual sentences to pull from when selecting a random sentence B.

In [7]:
bag = [instruction for instruction_cluster in text for instruction in instruction_cluster.split(delim)  if instruction!= '']
bag_size = len(bag)
print(bag_size)

493


In [8]:
bag

['endbr64 ',
 'push rbp',
 'mov rbp, rsp',
 'mov rdx, qword ptr [rip + 0x2d98]',
 'mov rax, qword ptr [rip + 0x2d81]',
 'lea rcx, [rip + 0xd62]',
 'mov rsi, rcx',
 'mov rdi, rax',
 'mov eax, 0',
 'call 0x1120',
 'mov edi, 1',
 'call 0x1170',
 'endbr64 ',
 'push rbp',
 'mov rbp, rsp',
 'sub rsp, 0x20',
 'mov dword ptr [rbp - 0x14], edi',
 'mov eax, dword ptr [rbp - 0x14]',
 'mov edx, 1',
 'mov esi, 0',
 'mov edi, eax',
 'call 0x1180',
 'mov qword ptr [rbp - 0x10], rax',
 'mov eax, dword ptr [rbp - 0x14]',
 'mov edx, 2',
 'mov esi, 0',
 'mov edi, eax',
 'call 0x1180',
 'mov qword ptr [rbp - 8], rax',
 'mov rcx, qword ptr [rbp - 0x10]',
 'mov eax, dword ptr [rbp - 0x14]',
 'mov edx, 0',
 'mov rsi, rcx',
 'mov edi, eax',
 'call 0x1180',
 'mov rax, qword ptr [rbp - 8]',
 'leave ',
 'ret ',
 'endbr64 ',
 'push rbp',
 'mov rbp, rsp',
 'sub rsp, 0x40',
 'mov qword ptr [rbp - 0x38], rdi',
 'mov rax, qword ptr [rbp - 0x38]',
 'mov esi, 2',
 'mov rdi, rax',
 'mov eax, 0',
 'call 0x1160',
 'mov dw

And now we create our 50/50 NIP training data.

In [9]:
import random

history = []
next_instruction = []
label = []

page_len = 5
instruction_pages = []
for instruction_cluster in text:
    instructions = [
        instruction for instruction in instruction_cluster.split(delim) if instruction != ''
    ]
    if len(instructions)>page_len:
        
        for i in range(0,len(instructions),page_len):
            instruction_pages.append(instructions[i:i+page_len])
        
print(len(instruction_pages))
print(instruction_pages[0])

for instruction_page in instruction_pages:
    
#     instructions = [
#         instruction for instruction in instruction_page.split(';') if instruction != ''
#     ]
    
    
#     num_instructions = len(instruction_page)
    
    

#     start = random.randint(0, num_instructions-2)
    # 50/50 whether is IsNextSentence or NotNextSentence
    if random.random() >= 0.5:
        # this is IsNextSentence
        history.append(delim.join(instruction_page[:-1]))
        next_instruction.append(instruction_page[-1])
        label.append(0)
    else:
        index = random.randint(0, bag_size-1)
        # this is NotNextSentence
        history.append(delim.join(instruction_page[:-1]))
        next_instruction.append(bag[index])
        label.append(1)

104
['endbr64 ', 'push rbp', 'mov rbp, rsp', 'mov rdx, qword ptr [rip + 0x2d98]', 'mov rax, qword ptr [rip + 0x2d81]']


In [10]:
print(len(label))
for i in range(3):
    print(label[i])
    print('->',history[i] , '\n')
    print('# ',next_instruction[i] , '\n')

104
1
-> endbr64 ;push rbp;mov rbp, rsp;mov rdx, qword ptr [rip + 0x2d98] 

#  movzx eax, al 

0
-> lea rcx, [rip + 0xd62];mov rsi, rcx;mov rdi, rax;mov eax, 0 

#  call 0x1120 

1
-> mov edi, 1 

#  nop  



Our data is now ready for tokenization, this time we truncate/pad each token to the same length of *512* tokens.

In [11]:
inputs = tokenizer(history, next_instruction, return_tensors='pt', 
                   max_length=128, truncation=True, padding='max_length')

In [12]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

We can see that the *token_type_ids* tensors have been built correctly (eg **1** indicating sentence B tokens) by checking the first instance of *token_type_ids*:

In [13]:
inputs.token_type_ids[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

The **0** tokens following our sentence B tokens correspond to *PAD* tokens.

Alongside this, we need to create a *labels* tensor too - which corresponds to the values contained within our `label` variable. Our *labels* tensor must be a *LongTensor*, and we will need to transpose the tensor so that it matches our other tensors' dimensionality.

In [14]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T


And the labels tensor is simply a clone of the input_ids tensor before masking.

In [15]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [16]:
inputs.labels[:10]

tensor([[  2, 180,   1,  ...,   0,   0,   0],
        [  2, 116, 117,  ...,   0,   0,   0],
        [  2,  86, 164,  ...,   0,   0,   0],
        ...,
        [  2,  86, 123,  ...,   0,   0,   0],
        [  2,   3, 183,  ...,   0,   0,   0],
        [  2, 180,   1,  ...,   0,   0,   0]])

Now we mask tokens in the input_ids tensor using the 15% probability for MLM - ensuring we don't mask CLS, SEP, or PAD tokens.

In [17]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [18]:
mask_arr[0]
# inputs.input_ids.shape[0]

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
         True, False, False, False, False, False, False,  True, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False])

And now take the indices of each True value within each vector.

In [19]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [20]:
selection[0]

[20, 27]

Then apply these indices to each row in input_ids, assigning each value at these indices a value of 103.

In [21]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

The `inputs` tensors are now ready, and we can begin building the model input pipeline for training. We first create a PyTorch dataset from our data.

In [22]:
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

Initialize our data using the `MeditationDataset` class.

In [23]:
dataset = MeditationsDataset(inputs)

In [24]:
train_size = int(0.8 * len(dataset))
validation_size = len(dataset) - train_size

train_dataset, validation_dataset = torch.utils.data.random_split(dataset, [train_size, validation_size])

And initialize the dataloader, which we'll be using to load our data into the model during training.

In [25]:
BATCH_SIZE = 8
train_loader      = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [26]:
# 

Now we can move onto setting up the training loop. First we setup GPU/CPU usage.

In [27]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

Activate the training mode of our model, and initialize our optimizer (Adam with weighted decay - reduces chance of overfitting).

In [28]:
from sklearn.metrics import precision_recall_fscore_support , accuracy_score
import numpy as np


Now we can move onto the training loop, we'll train for a couple of epochs (change `epochs` to modify this).

In [36]:
from transformers import AdamW
from tqdm import tqdm  # for our progress bar


# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-6)



epochs = 1

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    train_loop = tqdm(train_loader, leave=True)
    
    
    predictions_all, ground_truths_all = None, None
    
    # activate training mode
    model.train()
    for N,batch in enumerate(train_loop):

        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        print((outputs.logits.shape))
        print(outputs.logits)
        print(outputs)
        prediction = torch.argmax(outputs.logits, axis=-1)
        print(prediction)
        break
#         prediction = prediction.detach().cpu().numpy().flatten()
#         ground_truth = labels.detach().cpu().numpy().flatten()
        
#         if N==0:
#             predictions_all = prediction
#             ground_truths_all = ground_truth
#         else:
#             predictions_all   = np.concatenate((predictions_all, prediction))
#             ground_truths_all = np.concatenate((ground_truths_all, ground_truth))
            

        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        train_loop.set_description(f'Epoch {epoch}')
        train_loop.set_postfix(loss=loss.item())
#     accuracy = (accuracy_score(ground_truths_all,predictions_all))
#     precision, recall, f1, _ = precision_recall_fscore_support(ground_truths_all,predictions_all, average='binary')
#     print("Training: ", accuracy, precision, recall, f1, _)
    
    
    ### EVAL Validation
    
#     with torch.no_grad():
#         model.eval()
#         v_predictions_all, v_ground_truths_all = None, None
#         validation_loop = tqdm(validation_loader, leave=True)
#         for N,v_batch in enumerate(validation_loop):
#             v_input_ids = v_batch['input_ids'].to(device)
#             v_attention_mask = v_batch['attention_mask'].to(device)
#             v_token_type_ids = v_batch['token_type_ids'].to(device)
#             v_labels = v_batch['labels'].to(device)
#             # process
#             v_outputs = model(v_input_ids, attention_mask=v_attention_mask,
#                             token_type_ids=v_token_type_ids,
#                             labels=v_labels)
#             v_prediction = torch.argmax(v_outputs.logits, axis=-1)
#             v_prediction = v_prediction.detach().cpu().numpy().flatten()
#             v_ground_truth = v_labels.detach().cpu().numpy().flatten()

#             if N==0:
#                 v_predictions_all = v_prediction
#                 v_ground_truths_all = v_ground_truth
#             else:
#                 v_predictions_all   = np.concatenate((v_predictions_all, v_prediction))
#                 v_ground_truths_all = np.concatenate((v_ground_truths_all, v_ground_truth))

#         v_accuracy = (accuracy_score(v_ground_truths_all, v_predictions_all))
#         v_precision, v_recall, v_f1, _ = precision_recall_fscore_support(v_ground_truths_all, 
#                                                                          v_predictions_all, average='binary')
#         print("VALIDATION: ",v_accuracy, v_precision, v_recall, v_f1, _)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  0%|                                                    | 0/11 [00:00<?, ?it/s]

torch.Size([8, 2])
tensor([[ 0.8613, -0.1931],
        [ 0.5191,  0.0642],
        [ 0.9090, -0.1997],
        [ 0.8580, -0.1402],
        [ 0.4152,  0.2810],
        [ 0.6410,  0.1175],
        [ 0.8547, -0.1468],
        [ 0.7451,  0.0344]], device='cuda:0', grad_fn=<AddmmBackward0>)
NextSentencePredictorOutput(loss=tensor(0.5548, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8613, -0.1931],
        [ 0.5191,  0.0642],
        [ 0.9090, -0.1997],
        [ 0.8580, -0.1402],
        [ 0.4152,  0.2810],
        [ 0.6410,  0.1175],
        [ 0.8547, -0.1468],
        [ 0.7451,  0.0344]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')





In [None]:
# # Save the trained model weights
# training_model.save_weights("weights/wghts" + str(epoch + 1) + ".ckpt")

In [None]:
print(ground_truths)