# Next Instruction Prediction Training


# DATASET GENERATION

In [1]:
#!/usr/bin/env python3

import sys,os
from elftools.elf.elffile import ELFFile
from elftools.elf.segments import Segment

filePath = './../../binaries/gnuit/src/gitfm'
fh = open(filePath, 'rb')
bin_bytearray = bytearray(fh.read())

In [2]:


from capstone import *

from capstone.x86 import *


address_inst = {}
with open('./data/instruction_clusters.txt', 'w') as data_file:
    with open(filePath, 'rb') as f:
        elf = ELFFile(f)
        dwarfinfo = elf.get_dwarf_info()
        aranges = dwarfinfo.get_aranges()
        print(len(aranges.entries))
    #     for arange in aranges.entries:
    #         print(arange)
        for arange in aranges.entries:

            entry = arange.begin_addr
            exit  = arange.begin_addr + arange.length
            ops = bin_bytearray[entry: exit]

            md = Cs(CS_ARCH_X86, CS_MODE_64)
            md.detail = True
            for inst in md.disasm(ops, entry):

                address_inst[hex(inst.address)] = inst
                data_file.write(inst.mnemonic+" "+inst.op_str+";")
            data_file.write('\n')
    #             print( inst.mnemonic+"  "+inst.op_str)


341


# Creating the pipeline

In [3]:
from transformers import BertTokenizer, BertForNextSentencePrediction
import torch

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer = BertTokenizer.from_pretrained("./binary-tokenizer")
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
with open('./data/instruction_clusters.txt', 'r') as fp:
    text = fp.read().split('\n')

In [5]:
text[1]

'endbr64 ;push rbp;mov rbp, rsp;mov eax, dword ptr [rip + 0x2b9f5];cmp eax, 6;jle 0x501a;mov eax, dword ptr [rip + 0x2a036];test eax, eax;je 0x5008;mov eax, dword ptr [rip + 0x2b9e4];cmp eax, 0xb;jle 0x501a;mov eax, 1;jmp 0x501f;mov eax, dword ptr [rip + 0x2b9d2];cmp eax, 5;jle 0x501a;mov eax, 1;jmp 0x501f;mov eax, 0;pop rbp;ret ;'

We need to split sentences into consecutive, and non-consecutive sequences.

We have to deal with edge-cases too - for example where there is only a single sentence within a paragraph as with the three examples above (in comparison to below where we can easily split into multiple sentences).

In [6]:
text[51].split(';')

['endbr64 ',
 'push rbp',
 'mov rbp, rsp',
 'sub rsp, 0x10',
 'mov dword ptr [rbp - 4], edi',
 'mov eax, dword ptr [rip + 0x24da4]',
 'cmp eax, 1',
 'jne 0xbc8e',
 'movzx eax, byte ptr [rip + 0x24d8d]',
 'movzx eax, al',
 'sar eax, 6',
 'and eax, 1',
 'cmp dword ptr [rbp - 4], eax',
 'je 0xbd57',
 'cmp dword ptr [rbp - 4], 1',
 'jne 0xbcc1',
 'mov rax, qword ptr [rip + 0x2404d]',
 'test rax, rax',
 'je 0xbd2e',
 'mov rax, qword ptr [rip + 0x2403d]',
 'lea rdx, [rip - 0xc66]',
 'mov esi, 1',
 'mov rdi, rax',
 'call 0x47e0',
 'jmp 0xbd2e',
 'mov rax, qword ptr [rip + 0x23fe0]',
 'test rax, rax',
 'je 0xbce8',
 'mov rax, qword ptr [rip + 0x23fd4]',
 'lea rdx, [rip - 0xc8f]',
 'mov esi, 1',
 'mov rdi, rax',
 'call 0x47e0',
 'mov dword ptr [rip + 0x24d1a], 0',
 'mov dword ptr [rip + 0x24d14], 0',
 'movzx eax, byte ptr [rip + 0x24d06]',
 'and eax, 0xffffffbf',
 'mov byte ptr [rip + 0x24cfd], al',
 'movzx eax, byte ptr [rip + 0x24cf6]',
 'shr al, 7',
 'cmp al, 1',
 'jne 0xbd2e',
 'mov dword p

We'll assign a 50% probability of using the genuine next sentence, and 50% probability of using another random sentence.

To make this simpler, we'll create a *'bag'* of individual sentences to pull from when selecting a random sentence B.

In [7]:
bag = [instruction for instruction_cluster in text for instruction in instruction_cluster.split(';')  if instruction!= '']
bag_size = len(bag)
print(bag_size)

33455


In [8]:
bag

['call 0x4810',
 'endbr64 ',
 'push rbp',
 'mov rbp, rsp',
 'mov eax, dword ptr [rip + 0x2b9f5]',
 'cmp eax, 6',
 'jle 0x501a',
 'mov eax, dword ptr [rip + 0x2a036]',
 'test eax, eax',
 'je 0x5008',
 'mov eax, dword ptr [rip + 0x2b9e4]',
 'cmp eax, 0xb',
 'jle 0x501a',
 'mov eax, 1',
 'jmp 0x501f',
 'mov eax, dword ptr [rip + 0x2b9d2]',
 'cmp eax, 5',
 'jle 0x501a',
 'mov eax, 1',
 'jmp 0x501f',
 'mov eax, 0',
 'pop rbp',
 'ret ',
 'endbr64 ',
 'push rbp',
 'mov rbp, rsp',
 'mov eax, dword ptr [rip + 0x2b8b1]',
 'cmp eax, 1',
 'sete al',
 'movzx eax, al',
 'pop rbp',
 'ret ',
 'endbr64 ',
 'push rbp',
 'mov rbp, rsp',
 'push rbx',
 'sub rsp, 0x38',
 'mov dword ptr [rbp - 0x34], edi',
 'mov dword ptr [rbp - 0x28], 0',
 'mov dword ptr [rbp - 0x24], 0',
 'mov eax, dword ptr [rip + 0x2b97e]',
 'mov dword ptr [rbp - 0x20], eax',
 'mov eax, dword ptr [rip + 0x2b979]',
 'mov dword ptr [rbp - 0x1c], eax',
 'mov eax, 0',
 'call 0xc866',
 'cmp dword ptr [rbp - 0x34], 0',
 'jne 0x5094',
 'mov eax

And now we create our 50/50 NIP training data.

In [9]:
import random

history = []
next_instruction = []
label = []

for instruction_cluster in text:
    instructions = [
        instruction for instruction in instruction_cluster.split(';') if instruction != ''
    ]
    num_instructions = len(instructions)
    if num_instructions > 1:
        start = random.randint(0, num_instructions-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            history.append(instructions[start])
            next_instruction.append(instructions[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            history.append(instructions[start])
            next_instruction.append(bag[index])
            label.append(1)

In [10]:
for i in range(3):
    print(label[i])
    print(history[i] + '\n---')
    print(next_instruction[i] + '\n')

1
jmp 0x501f
---
mov eax, dword ptr [rbp - 0x2c]

1
mov rbp, rsp
---
push rbp

0
movsxd rdx, eax
---
mov eax, dword ptr [rbp - 0x14]



Our data is now ready for tokenization, this time we truncate/pad each token to the same length of *512* tokens.

In [11]:
inputs = tokenizer(history, next_instruction, return_tensors='pt', max_length=128, truncation=True, padding='max_length')

NameError: name 'sentence_a' is not defined

In [None]:
inputs.keys()

We can see that the *token_type_ids* tensors have been built correctly (eg **1** indicating sentence B tokens) by checking the first instance of *token_type_ids*:

In [None]:
inputs.token_type_ids[0]

The **0** tokens following our sentence B tokens correspond to *PAD* tokens.

Alongside this, we need to create a *labels* tensor too - which corresponds to the values contained within our `label` variable. Our *labels* tensor must be a *LongTensor*, and we will need to transpose the tensor so that it matches our other tensors' dimensionality.

In [None]:
inputs['labels'] = torch.LongTensor([label]).T

In [None]:
inputs.labels[:10]

The `inputs` tensors are now ready, and we can begin building the model input pipeline for training. We first create a PyTorch dataset from our data.

In [None]:
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

Initialize our data using the `MeditationDataset` class.

In [None]:
dataset = MeditationsDataset(inputs)

And initialize the dataloader, which we'll be using to load our data into the model during training.

In [None]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

Now we can move onto setting up the training loop. First we setup GPU/CPU usage.

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

Activate the training mode of our model, and initialize our optimizer (Adam with weighted decay - reduces chance of overfitting).

In [None]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-6)

Now we can move onto the training loop, we'll train for a couple of epochs (change `epochs` to modify this).

In [None]:
from tqdm import tqdm  # for our progress bar

epochs = 5

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

And there we go, we've fine-tuneed our BERT model using NSP on *Meditations*!