# Playground for Transformers!

### Attention is all you need 
(https://arxiv.org/abs/1706.03762)

### For software vulnerability detection GYM

This is a minimal example of this **CRAZY** idea!

### Note

* LSTM is working with:
    * Adam LR = 0.01
    * Overfitted on 14 epoch. 100% training accuracy.
    * Worked on 1K dataset sample
    * Using the `hidden`n or `cell` output from LSTM. Not the `output`.
    * Bidrectional (2 layers)
* Transformer:
    * Still trying to find the right combination

### Dataset

Import

In [1]:
import numpy as np
import pandas as pd
import os
import torch
import re
import torch.backends.cudnn as cudnn
import torchtext
import matplotlib.pyplot as plt
import time
from torch.autograd import Variable
from torch import nn, optim
from torch.optim import SGD,Adam
import torch.nn.functional as F
import random
from gensim.models.word2vec import Word2Vec
import torchtext.vocab as vocab

In [2]:
seed = 1234
torch.manual_seed(seed)
#device = torch.device("cpu");
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
cudnn.benchmark = True
cudnn.enabled = True
print(device)

cuda


Load playset dataset

In [2]:
dataset = pd.read_pickle('playset(0.25.2).pickle')

View dataset

In [3]:
dataset

Unnamed: 0,functionSource,combine
93792,"go_file_opener_open (GOFileOpener const *fo, g...",False
79770,updatePathMap(bool left_level) {\n\tPoint from...,False
66999,interpret_tilde(const char* path) {\n stati...,False
44284,"checkVarExp(\n Absyn *node,\n Ta...",True
49515,will_have_skip_worktree(const struct cache_ent...,True
...,...,...
96701,"AVLTree_insert(AVLTree * tree, void * data)\n{...",False
67815,"remove_hook(const char *name, hookfn fn)\n{\n\...",False
88363,"output_def(dico_stream_t str, struct gcide_db ...",False
65929,getState(\n\t\tFLMUINT\t\tuiFieldID)\n\t{\n\t\...,False


In [4]:
dataset.describe(include='all')

Unnamed: 0,functionSource,combine
count,100000,100000
unique,100000,2
top,v_identifier(void)\n#else\nv_identifier()\n#en...,True
freq,1,50000


In [5]:
dataset.functionSource[1]

'checkCapture2() const\n{\n    for(int i=6;i<48;i++)\n    {\n        switch(board[i])\n        {\n        case MAN2:\n            if(board[i+5]==MAN1 || board[i+5]==KING1)\n                if(board[i+10]==FREE) return true;\n            if(board[i+6]==MAN1 || board[i+6]==KING1)\n                if(board[i+12]==FREE) return true;\n            break;\n        case KING2:\n            if(board[i-6]==MAN1 || board[i-6]==KING1)\n                if(board[i-12]==FREE) return true;\n            if(board[i-5]==MAN1 || board[i-5]==KING1)\n                if(board[i-10]==FREE) return true;\n            if(board[i+5]==MAN1 || board[i+5]==KING1)\n                if(board[i+10]==FREE) return true;\n            if(board[i+6]==MAN1 || board[i+6]==KING1)\n                if(board[i+12]==FREE) return true;\n        }\n    }\n    return false;\n}'

### Prepare data (in JSON)

#### Clean & Preprocessing

In [6]:
def clean(code):
    ## Remove newlines & tabs
    code = re.sub('(\n)|(\\\\n)|(\\\\)|(\\t)|(/)|(\\r)|(\")|(\')','',code)
    ## Remove code comments
    code = re.sub(r'/\*(.|[\r\n])*?\*/','',code)
    ## Mix split (characters and words)
    splitter = ' +|(;)|(\()|(==)|(\))|(=)|(\+)|(\-)|(\[)|(\])|(<)|(>)|({)|(#)'
    code = re.split(splitter,code)
    ## Remove None type
    code = list(filter(None, code))
    code = list(filter(str.strip, code))
    code = " ".join(code)
    ## Return list of tokens
    return(code)

In [7]:
## Clean the codes
dataset.functionSource = dataset.functionSource.apply(clean)

In [8]:
## Change labels boolean to 1 and 0
dataset.iloc[:,1] = np.multiply(dataset.iloc[:,1],1)

In [9]:
## Change column name
dataset = dataset.rename(columns={'functionSource':'codes', 'combine':'label'})

In [10]:
dataset.codes[24492]

'CardPowerOff ( reader* globalData, char socket ) { char cmd [ 4 ] , ack ; int retVal, actual, retryTimes = 2 ; # ifdef ASE_DEBUG syslog ( LOG_INFO, CardPowerOff - Enter ) ; # endif if ( ( retVal = cardCommandInit ( globalData, socket, 1 ) ) ) return retVal ; cmd [ 0 ] = ASE_PACKET_TYPE ( 0x50, globalData - > commandCounter, socket ) ; globalData - > commandCounter + + ; globalData - > commandCounter % = 4 ; cmd [ 1 ] = 0x21 ; cmd [ 2 ] = 0x0 ; cmd [ 3 ] = cmd [ 0 ] ^ cmd [ 1 ] ^ cmd [ 2 ] ; do { lock_mutex ( globalData ) ; retVal = sendControlCommand ( globalData, socket, cmd, 4, &ack, &actual, 0 ) ; unlock_mutex ( globalData ) ; retryTimes - - ; } while ( retVal ! = ASE_OK && retryTimes ) ; if during the 3 tries the command failed, return an error status if ( retVal < 0 ) { return retVal ; } if ( ack ! = 0x20 ) { return parseStatus ( ack ) ; } * if the card is present, change the status to powered off * if ( globalData - > cards [ ( int ) socket ] .status ) globalData - > cards [ ( i

In [11]:
false = dataset[dataset.iloc[:,1]==0]
true = dataset[dataset.iloc[:,1]==1]

In [12]:
## Split to train,test,valid
train = false[0:400].append(true[0:400])
test  = false[400:400].append(true[400:450])
valid = false[450:500].append(true[400:500])

## Shuffle
train = train.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)
valid = valid.sample(frac=1).reset_index(drop=True)

In [None]:
## Save to json
train.to_json('.data/train_1k.json', orient='records',lines=True)
test.to_json('.data/test_1k.json', orient='records',lines=True)
valid.to_json('.data/valid_1k.json', orient='records',lines=True)

In [3]:
## Define the field

CODES = torchtext.data.Field(batch_first=True)
LABEL = torchtext.data.LabelField(dtype=torch.int64)
fields = {'codes': ('codes', CODES), 'label': ('label', LABEL)}

In [4]:
## Import the 1K data as TabularDataset
train_data, valid_data, test_data = torchtext.data.TabularDataset.splits(
                                        path = '.data',
                                        train = 'train_1k.json',
                                        validation = 'valid_1k.json',
                                        test = 'test_1k.json',
                                        format = 'json',
                                        fields = fields)

In [3]:
## Import the min data as TabularDataset
train_data, valid_data, test_data = torchtext.data.TabularDataset.splits(
                                        path = '.data',
                                        train = 'train_min.json',
                                        validation = 'valid_min.json',
                                        test = 'test_min.json',
                                        format = 'json',
                                        fields = fields)

In [None]:
## Import the ALL data as TabularDataset
train_data, valid_data, test_data = torchtext.data.TabularDataset.splits(
                                        path = '.data',
                                        train = 'train_all.json',
                                        validation = 'valid_all.json',
                                        test = 'test_all.json',
                                        format = 'json',
                                        fields = fields)

In [7]:
## Testing
print(vars(valid_data[0]))

{'codes': ['icalparser_get_line', '(', 'icalparser', '*parser,', 'char*', '(', '*line_gen_func', ')', '(', 'char', '*s,', 'size_t', 'size,', 'void', '*d', ')', ')', '{', 'char', '*line', ';', 'char', '*line_p', ';', 'size_t', 'buf_size', '=', 'parser', '-', '>', 'tmp_buf_size', ';', 'line_p', '=', 'line', '=', 'icalmemory_new_buffer', '(', 'buf_size', ')', ';', 'line', '[', '0', ']', '=', '0', ';', '*', 'Read', 'lines', 'by', 'calling', 'line_gen_func', 'and', 'putting', 'the', 'data', 'into', 'parser', '-', '>', 'temp.', 'If', 'the', 'line', 'is', 'a', 'continuation', 'line', '(', 'begins', 'with', 'a', 'space', 'after', 'a', 'newline', ')', 'then', 'append', 'the', 'data', 'onto', 'line', 'and', 'read', 'again.', 'Otherwise,', 'exit', 'the', 'loop.', '*', 'while', '(', '1', ')', '{', '*', 'The', 'first', 'part', 'of', 'the', 'loop', 'deals', 'with', 'the', 'temp', 'buffer,', 'which', 'was', 'read', 'on', 'he', 'last', 'pass', 'through', 'the', 'loop.', 'The', 'routine', 'is', 'split'

Doneeeeeeeeeeeeeeeee !

### Vocabulary-related preparation

In [5]:
##### Build the vocabulary

MAX_VOCAB_SIZE = 10000

CODES.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [6]:
print(f"Unique tokens in TEXT vocabulary: {len(CODES.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 10002
Unique tokens in LABEL vocabulary: 2


In [7]:
## Most common word
print(CODES.vocab.freqs.most_common(100))

[('(', 14063), (')', 14060), (';', 13060), ('-', 6902), ('=', 6370), ('>', 4926), ('{', 3205), ('*', 2803), ('if', 2626), ('}', 2099), ('0', 2017), ('+', 1935), ('[', 1827), (']', 1824), ('return', 1450), ('i', 1170), ('1', 1117), ('<', 1031), ('==', 993), ('int', 882), ('the', 847), ('NULL', 776), (',', 728), ('char', 655), ('!', 580), ('for', 532), ('else', 524), ('#', 524), ('struct', 523), ('to', 443), ('const', 403), ('case', 371), ('&&', 356), ('a', 346), ('sizeof', 333), ('break', 324), ('ret', 317), ('p', 284), ('of', 269), ('||', 251), ('is', 251), ('len', 241), ('in', 235), ('unsigned', 231), ('&', 221), ('2', 217), ('this', 207), ('data', 201), ('buf', 188), ('size', 186), ('%s', 183), ('we', 175), (':', 175), ('not', 172), ('and', 169), ('while', 166), ('%s,', 165), ('0,', 164), ('name', 163), ('void', 158), ('}}', 153), ('error', 147), ('type', 145), ('fprintf', 143), ('it', 143), ('file', 142), ('n', 136), ('goto', 134), ('f', 134), ('be', 133), ('result', 127), ('1,', 12

In [8]:
print(CODES.vocab.itos[:10])
print(LABEL.vocab.stoi)
print(CODES.vocab.stoi[CODES.pad_token])

['<unk>', '<pad>', '(', ')', ';', '-', '=', '>', '{', '*']
defaultdict(None, {0: 0, 1: 1})
1


In [9]:
## place into iterators
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = 64,
    sort = False,
    device = device)

### Prepare Word2Vec (Optional)

In [None]:
corpus = pd.read_json('.data/train.json',orient='records',lines=True)

w2v = Word2Vec(corpus.codes, size=300, workers=16, sg=1, min_count=3)
w2v.save('.data/node_w2v_128')

In [10]:
w2v = Word2Vec.load('.data/node_w2v_128')
w2v.wv.vectors

array([[ 0.04268723, -0.01990928, -0.10372822, ...,  0.34928635,
        -0.24622028, -0.02363101],
       [ 0.06655177, -0.08706249, -0.11346684, ...,  0.2967248 ,
        -0.16500187, -0.10260527],
       [ 0.10059763, -0.0993171 , -0.14234892, ...,  0.3913037 ,
        -0.22237949,  0.02339004],
       ...,
       [-0.0031671 ,  0.01939397, -0.00094254, ..., -0.06062187,
        -0.0873417 ,  0.10190531],
       [-0.03719744,  0.02801778,  0.02174594, ..., -0.05577604,
        -0.07265704, -0.00079473],
       [ 0.01092949, -0.03061507, -0.045645  , ..., -0.06999503,
        -0.15679213,  0.11291362]], dtype=float32)

### Transformer class (with LSTM)

In [12]:
class Transformer(nn.Module):
    def __init__(self):
        super(Transformer,self).__init__()
        
        self.embed = nn.Embedding(num_embeddings=10002,
                                  embedding_dim=104)
        self.encode_layer = nn.TransformerEncoderLayer(d_model=104,
                                                       nhead=8,
                                                       dim_feedforward=1024,
                                                       dropout=0.1,
                                                       activation='relu')
        self.trans_encoder = nn.TransformerEncoder(self.encode_layer,
                                                   num_layers=1)
        self.lstm1 = nn.LSTM(input_size=104,
                            hidden_size=104,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True)
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(104*2,2)
    
    def forward(self,x):
        x = self.embed(x)
        x = self.trans_encoder(x)
        output, (hidden, cell) = self.lstm1(x)
        x = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        x = self.fc1(x)
        return(x)
    
model = Transformer()
model.to(device)
print(model)

Transformer(
  (embed): Embedding(10002, 104)
  (encode_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): Linear(in_features=104, out_features=104, bias=True)
    )
    (linear1): Linear(in_features=104, out_features=1024, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=1024, out_features=104, bias=True)
    (norm1): LayerNorm((104,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((104,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (trans_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=104, out_features=104, bias=True)
        )
        (linear1): Linear(in_features=104, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_featur

### LSTM class

In [17]:
global k
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM,self).__init__()

        self.embed = nn.Embedding(num_embeddings=10002,
                                  embedding_dim=150)
        global k
        k = self.embed
        
        self.lstm1 = nn.LSTM(input_size=150,
                            hidden_size=64,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True)
        #self.fc1 = nn.Linear(128,64)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(64*2,2)
        
    
    def forward(self,x):
        x = self.embed(x)
        output, (hidden, cell) = self.lstm1(x)
        #x = output[:,-1,:]
        x = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        #x =self.dropout(torch.cat((cell[-2,:,:], cell[-1,:,:]), dim=1))
        #x = self.fc1(x)
        #x = F.relu(x)
        x = self.fc2(x)
        #x = F.log_softmax(x,dim=1)
        return(x)
    
model = LSTM()
model.to(device)
print(model)

LSTM(
  (embed): Embedding(10002, 150)
  (lstm1): LSTM(150, 64, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
)


### CNN Class

In [10]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN,self).__init__()
        
        self.embed = nn.Embedding(num_embeddings=10002,
                                  embedding_dim=128)
        self.cnn1 = nn.Conv2d(in_channels=1,
                              out_channels=20,
                              kernel_size=2)
        self.cnn2 = nn.Conv2d(in_channels=1,
                              out_channels=20,
                              kernel_size=3)
        self.cnn3 = nn.Conv2d(in_channels=1,
                              out_channels=20,
                              kernel_size=4)
        self.fc1 = nn.Linear(60,2)
        #self.fc2 = nn.Linear(30,2)
    
    def forward(self,x):
        
        x = self.embed(x)
        x = x.unsqueeze(1)
        
        x1 = F.relu(self.cnn1(x))
        x1 = F.max_pool2d(x1,2)
        x1 = torch.flatten(x1,start_dim=1)
        
        x2 = F.relu(self.cnn2(x))
        x2 = F.max_pool2d(x2,3)
        x2 = torch.flatten(x2,start_dim=1)
        
        x3 = F.relu(self.cnn3(x))
        x3 = F.max_pool2d(x3,4)
        x3 = torch.flatten(x3,start_dim=1)
        
        x = torch.cat((x1,x2,x3), dim=1)
        self.fc1 = nn.Linear(x.shape[1],2)
        x = self.fc1(x.to(device))
        return(x)
    
model = CNN()
model.to(device)
print(model)

CNN(
  (embed): Embedding(10002, 128)
  (cnn1): Conv2d(1, 20, kernel_size=(2, 2), stride=(1, 1))
  (cnn2): Conv2d(1, 20, kernel_size=(3, 3), stride=(1, 1))
  (cnn3): Conv2d(1, 20, kernel_size=(4, 4), stride=(1, 1))
  (fc1): Linear(in_features=60, out_features=2, bias=True)
)


### Load Word2Vec weights to embeddings

In [14]:
weights = torch.FloatTensor(w2v.wv.vectors)
weights = weights.to(device)
model.embed = model.embed.from_pretrained(weights)
#model.embed = model.embed.weight.data.copy_(weights)

### Parameters

In [18]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,710,478 trainable parameters


### Prepare accuracy function

In [13]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [14]:
def softmax_accuracy(probs,all_labels):
    def getClass(x):
        return(x.index(max(x)))
    
    all_labels = all_labels.tolist()
    probs = pd.Series(probs.tolist())
    all_predicted = probs.apply(getClass)
    all_predicted.reset_index(drop=True, inplace=True)
    vc = pd.value_counts(all_predicted == all_labels)
    try:
        acc = vc[1]/len(all_labels)
    except:
        if(vc.index[0]==False):
            acc = 0
        else:
            acc = 1
    return(acc)

### Loss functions

In [15]:
## Define optimizer
#optimizer = SGD(model.parameters(), lr = 0.01)
optimizer = Adam(model.parameters(), lr=0.01)

## Define loss function
#criterion = nn.BCELoss().to(device) ## Sigmoid activation function
#criterion = nn.NLLLoss().to(device) ### Log_softmax activation
criterion = nn.CrossEntropyLoss().to(device) ## No activation function bcs softmax included

### Training

In [None]:
epochs=100
model.train()
for e in range(epochs):
    running_acc = 0
    running_loss = 0
    timer = time.time()

    for batch in train_iterator:
        optimizer.zero_grad()
        output = model(batch.codes)
        loss = criterion(output, batch.label)
        loss.backward()
        optimizer.step()
        acc = softmax_accuracy(output,batch.label)
        running_acc += acc.item()
        running_loss += loss.item()
    else:
        with torch.no_grad():
            running_acc_val = 0
            running_loss_val = 0
            for batch in valid_iterator:
                output_val = model(batch.codes)
                loss_val = criterion(output_val,batch.label)
                acc_val = softmax_accuracy(output_val,batch.label)
                running_acc_val += acc_val.item()
                running_loss_val += loss_val.item()
        
        print("Epoch {} - Training acc: {:.6f} -Training loss: {:.6f} - Val acc: {:.6f} - Val loss: {:.6f} - Time: {:.4f}s".format(e+1, running_acc/len(train_iterator), running_loss/len(train_iterator), running_acc_val/len(valid_iterator), running_loss_val/len(valid_iterator), (time.time()-timer)))

In [18]:
import sklearn.metrics
from sklearn.metrics import confusion_matrix

model.eval()
with torch.no_grad():
    running_acc_test = 0
    running_loss_test = 0
    all_pred=[]
    all_labels=[]
    for batch in train_iterator:
        output_test = model(batch.codes).squeeze(1)
        loss_test = criterion(output_test,batch.label)
        acc_test = softmax_accuracy(output_test,batch.label)
        running_acc_test += acc_test.item()
        running_loss_test += loss_test.item()
        all_pred += output_test.tolist()
        all_labels += batch.label.tolist()


print('Train acc: ',running_acc_test/len(train_iterator))
print('Train loss: ',running_loss_test/len(train_iterator))


def getClass(x):
    return(x.index(max(x)))

probs = pd.Series(all_pred)
all_predicted = probs.apply(getClass)
all_predicted.reset_index(drop=True, inplace=True)
vc = pd.value_counts(all_predicted == all_labels)

confusion = sklearn.metrics.confusion_matrix(y_true=all_labels, y_pred=all_predicted)
print('Confusion matrix: \n',confusion)

Train acc:  1.0
Train loss:  0.002189714891406206
Confusion matrix: 
 [[400   0]
 [  0 400]]


### Testing

In [19]:
### SOFTMAX

import sklearn.metrics
from sklearn.metrics import confusion_matrix

model.eval()
with torch.no_grad():
    running_acc_test = 0
    running_loss_test = 0
    all_pred=[]
    all_labels=[]
    for batch in test_iterator:
        output_test = model(batch.codes).squeeze(1)
        loss_test = criterion(output_test,batch.label)
        acc_test = softmax_accuracy(output_test,batch.label)
        running_acc_test += acc_test.item()
        running_loss_test += loss_test.item()
        all_pred += output_test.tolist()
        all_labels += batch.label.tolist()


print('Test acc: ',running_acc_test/len(test_iterator))
print('Test loss: ',running_loss_test/len(test_iterator))


def getClass(x):
    return(x.index(max(x)))

probs = pd.Series(all_pred)
all_predicted = probs.apply(getClass)
all_predicted.reset_index(drop=True, inplace=True)
vc = pd.value_counts(all_predicted == all_labels)

confusion = sklearn.metrics.confusion_matrix(y_true=all_labels, y_pred=all_predicted)
print('Confusion matrix: \n',confusion)

Test acc:  0.46
Test loss:  2.782841444015503
Confusion matrix: 
 [[ 0  0]
 [27 23]]


In [None]:
#### BINARY

import sklearn.metrics
from sklearn.metrics import confusion_matrix

with torch.no_grad():
    running_acc_test = 0
    running_loss_test = 0
    all_pred=[]
    all_labels=[]
    for batch in test_iterator:
        batch.codes = batch.codes.to(device)
        batch.label = batch.label.to(device)
        output_test = model(batch.codes).squeeze(1)
        loss_test = criterion(output_test,batch.label)
        acc_test = binary_accuracy(output_test,batch.label)
        running_acc_test += acc_test.item()
        running_loss_test += loss_test.item()
        all_pred += torch.round(output_test).tolist()
        all_labels += batch.label.tolist()


print('Test acc: ',running_acc_test/len(test_iterator))
print('Test loss: ',running_loss_test/len(test_iterator))

confusion = sklearn.metrics.confusion_matrix(y_true=all_labels, y_pred=all_pred)
print('Confusion matrix: \n',confusion)

# RESULTS