# Playground for Transformers!

### Attention is all you need 
(https://arxiv.org/abs/1706.03762)

### For software vulnerability detection GYM

This is a minimal example of this **CRAZY** idea!

### Dataset

Import

In [1]:
import numpy as np
import pandas as pd
import os
import torch
import re
import torch.backends.cudnn as cudnn
import torchtext
import matplotlib.pyplot as plt
import time
from torch.autograd import Variable
from torch import nn, optim
from torch.optim import SGD,Adam
import torch.nn.functional as F
import random

Load playset dataset

In [2]:
dataset = pd.read_pickle('playset(0.25.2).pickle')

View dataset

In [3]:
dataset

Unnamed: 0,functionSource,combine
93792,"go_file_opener_open (GOFileOpener const *fo, g...",False
79770,updatePathMap(bool left_level) {\n\tPoint from...,False
66999,interpret_tilde(const char* path) {\n stati...,False
44284,"checkVarExp(\n Absyn *node,\n Ta...",True
49515,will_have_skip_worktree(const struct cache_ent...,True
...,...,...
96701,"AVLTree_insert(AVLTree * tree, void * data)\n{...",False
67815,"remove_hook(const char *name, hookfn fn)\n{\n\...",False
88363,"output_def(dico_stream_t str, struct gcide_db ...",False
65929,getState(\n\t\tFLMUINT\t\tuiFieldID)\n\t{\n\t\...,False


In [4]:
dataset.describe(include='all')

Unnamed: 0,functionSource,combine
count,100000,100000
unique,100000,2
top,"main(int argc, char *argv[])\n{\n\tchar *outfi...",True
freq,1,50000


In [5]:
dataset.functionSource[1]

'checkCapture2() const\n{\n    for(int i=6;i<48;i++)\n    {\n        switch(board[i])\n        {\n        case MAN2:\n            if(board[i+5]==MAN1 || board[i+5]==KING1)\n                if(board[i+10]==FREE) return true;\n            if(board[i+6]==MAN1 || board[i+6]==KING1)\n                if(board[i+12]==FREE) return true;\n            break;\n        case KING2:\n            if(board[i-6]==MAN1 || board[i-6]==KING1)\n                if(board[i-12]==FREE) return true;\n            if(board[i-5]==MAN1 || board[i-5]==KING1)\n                if(board[i-10]==FREE) return true;\n            if(board[i+5]==MAN1 || board[i+5]==KING1)\n                if(board[i+10]==FREE) return true;\n            if(board[i+6]==MAN1 || board[i+6]==KING1)\n                if(board[i+12]==FREE) return true;\n        }\n    }\n    return false;\n}'

### Prepare data (in JSON)

#### Clean & Preprocessing

In [6]:
def clean(code):
    ## Remove newlines & tabs
    code = re.sub('(\n)|(\\\\n)|(\\\\)|(\\t)|(/)','',code)
    ## Remove code comments
    code = re.sub(r'/\*(.|[\r\n])*?\*/','',code)
    ## Mix split (characters and words)
    splitter = ' +|(;)|(\()|(==)|(\))|(=)|(\+)|(\-)|(\[)|(\])|(<)|(>)|({)|(#)|(\")'
    code = re.split(splitter,code)
    ## Remove None type
    code = list(filter(None, code))
    code = list(filter(str.strip, code))
    ## Return list of tokens
    return(code)

In [7]:
## Clean the codes
dataset.functionSource = dataset.functionSource.apply(clean)

In [8]:
## Change labels boolean to 1 and 0
dataset.iloc[:,1] = np.multiply(dataset.iloc[:,1],1)

In [9]:
## Change column name
dataset = dataset.rename(columns={'functionSource':'codes', 'combine':'label'})

In [10]:
false = dataset[dataset.iloc[:,1]==0]
true = dataset[dataset.iloc[:,1]==1]

In [11]:
## Split to train,test,valid
train = false[0:40000].append(true[0:40000])
test  = false[40000:45000].append(true[40000:45000])
valid = false[45000:50000].append(true[45000:50000])

## Shuffle
train = train.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)
valid = valid.sample(frac=1).reset_index(drop=True)

In [12]:
## Save to json
train.to_json('.data/train.json', orient='records',lines=True)
test.to_json('.data/test.json', orient='records',lines=True)
valid.to_json('.data/valid.json', orient='records',lines=True)

In [2]:
## Define the field

CODES = torchtext.data.Field(batch_first=True)
LABEL = torchtext.data.LabelField(dtype=torch.int64)
fields = {'codes': ('codes', CODES), 'label': ('label', LABEL)}

In [3]:
## Import the data as TabularDataset
train_data, valid_data, test_data = torchtext.data.TabularDataset.splits(
                                        path = '.data',
                                        train = 'train.json',
                                        validation = 'valid.json',
                                        test = 'test.json',
                                        format = 'json',
                                        fields = fields)

In [4]:
## Testing
print(vars(train_data[0]))

{'codes': ['camel_mime_filter_index_set_name', '(', 'CamelMimeFilterIndex', '*filter,', 'CamelIndexName', '*name', ')', '{', 'g_return_if_fail', '(', 'CAMEL_IS_MIME_FILTER_INDEX', '(', 'filter', ')', ')', ';', 'if', '(', 'name', '!', '=', 'NULL', ')', '{', 'g_return_if_fail', '(', 'CAMEL_IS_INDEX_NAME', '(', 'name', ')', ')', ';', 'g_object_ref', '(', 'name', ')', ';', '}if', '(', 'filter', '-', '>', 'priv', '-', '>', 'name', '!', '=', 'NULL', ')', 'g_object_unref', '(', 'filter', '-', '>', 'priv', '-', '>', 'name', ')', ';', 'filter', '-', '>', 'priv', '-', '>', 'name', '=', 'name', ';', '}'], 'label': 0}


Doneeeeeeeeeeeeeeeee !

### Vocabulary-related preparation

In [5]:
##### Build the vocabulary

MAX_VOCAB_SIZE = 10000

CODES.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [6]:
print(f"Unique tokens in TEXT vocabulary: {len(CODES.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 10002
Unique tokens in LABEL vocabulary: 2


In [7]:
## Most common word
print(CODES.vocab.freqs.most_common(20))

[(')', 1376013), ('(', 1375892), (';', 1270722), ('-', 656492), ('=', 633247), ('>', 507801), ('"', 371009), ('{', 307771), ('*', 257660), ('if', 253646), ('}', 195363), ('+', 187616), ('0', 173061), ('[', 166766), (']', 166621), (',', 130888), ('return', 130564), ('i', 112319), ('==', 106834), ('1', 102418)]


In [8]:
print(CODES.vocab.itos[:10])
print(LABEL.vocab.stoi)

['<unk>', '<pad>', ')', '(', ';', '-', '=', '>', '"', '{']
defaultdict(None, {0: 0, 1: 1})


In [9]:
## place into iterators
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = 128,
    sort = False)

# Transformer

### Init all the non important stuffs

In [10]:
seed = 1234
torch.manual_seed(seed)
#device = torch.device("cpu");
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
cudnn.benchmark = True
cudnn.enabled = True

### Define Transformer class

In [11]:
class Transformer(nn.Module):
    def __init__(self):
        super(Transformer,self).__init__()
        
        self.embed = nn.Embedding(num_embeddings=10002,
                                  embedding_dim=800)
        self.encode_layer = nn.TransformerEncoderLayer(d_model=800,
                                                       nhead=8,
                                                       dim_feedforward=1024,
                                                       dropout=0.1,
                                                       activation='relu')
        self.trans_encoder = nn.TransformerEncoder(self.encode_layer,
                                                   num_layers=2)
        self.lstm1 = nn.LSTM(input_size=800,
                            hidden_size=128,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True)
        self.fc1 = nn.Linear(128*2,2)
    
    def forward(self,x):
        x = self.embed(x)
        x = self.trans_encoder(x)
        r_out, (h_n, h_c) = self.lstm1(x)
        x = r_out[:,-1,:]
        x = self.fc1(x)
        x = F.log_softmax(x,dim=1)
        return(x)
    
model = Transformer()
model.to(device)
print(model)

Transformer(
  (embed): Embedding(10002, 800)
  (encode_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): Linear(in_features=800, out_features=800, bias=True)
    )
    (linear1): Linear(in_features=800, out_features=1024, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=1024, out_features=800, bias=True)
    (norm1): LayerNorm((800,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((800,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (trans_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=800, out_features=800, bias=True)
        )
        (linear1): Linear(in_features=800, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_featur

In [12]:
class Transformer(nn.Module):
    def __init__(self):
        super(Transformer,self).__init__()
        
        self.embed = nn.Embedding(num_embeddings=10002,
                                  embedding_dim=50)
        self.lstm1 = nn.LSTM(input_size=50,
                            hidden_size=64,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=False)
        self.fc1 = nn.Linear(64,10)
        self.fc2 = nn.Linear(10,2)
        
    
    def forward(self,x):
        x = self.embed(x)
        r_out, (h_n, h_c) = self.lstm1(x,None)
        x = r_out[:,-1,:]
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        #x = F.log_softmax(x,dim=1)
        return(x)
    
model = Transformer()
model.to(device)
print(model)

Transformer(
  (embed): Embedding(10002, 50)
  (lstm1): LSTM(50, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=10, bias=True)
  (fc2): Linear(in_features=10, out_features=2, bias=True)
)


### Prepare accuracy function

In [13]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 530,468 trainable parameters


In [14]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [15]:
def softmax_accuracy(probs,all_labels):
    def getClass(x):
        return(x.index(max(x)))
    
    all_labels = all_labels.tolist()
    probs = pd.Series(probs.tolist())
    all_predicted = probs.apply(getClass)
    all_predicted.reset_index(drop=True, inplace=True)
    vc = pd.value_counts(all_predicted == all_labels)
    try:
        acc = vc[1]/len(all_labels)
    except:
        if(vc.index[0]==False):
            acc = 0
        else:
            acc = 1
    return(acc)

### Loss functions

In [16]:
## Define optimizer
#optimizer = SGD(model.parameters(), lr = 0.01, momentum = 0)
optimizer = Adam(model.parameters(), lr = 0.01, eps=1)

## Define loss function
#criterion = nn.BCELoss().to(device) ## Sigmoid activation function
#criterion = nn.NLLLoss().to(device) ### Log_softmax activation
criterion = nn.CrossEntropyLoss().to(device) ## No activation function bcs softmax included

### Training

In [17]:
epochs=50

for e in range(epochs):
    running_acc = 0
    running_loss = 0
    timer = time.time()

    for batch in train_iterator:
        batch.codes = batch.codes.to(device)
        batch.label = batch.label.to(device)
        optimizer.zero_grad()
        output = model(batch.codes)
        loss = criterion(output, batch.label)
        loss.backward()
        optimizer.step()
        acc = softmax_accuracy(output,batch.label)
        running_acc += acc.item()
        running_loss += loss.item()
    else:
        with torch.no_grad():
            running_acc_val = 0
            running_loss_val = 0
            for batch in valid_iterator:
                batch.codes = batch.codes.to(device)
                batch.label = batch.label.to(device)
                output_val = model(batch.codes)
                loss_val = criterion(output_val,batch.label)
                acc_val = softmax_accuracy(output_val,batch.label)
                running_acc_val += acc_val.item()
                running_loss_val += loss_val.item()
        
        print("Epoch {} - Training acc: {:.6f} -Training loss: {:.6f} - Val acc: {:.6f} - Val loss: {:.6f} - Time: {:.4f}s".format(e+1, running_acc/len(train_iterator), running_loss/len(train_iterator), running_acc_val/len(valid_iterator), running_loss_val/len(valid_iterator), (time.time()-timer)))

Epoch 1 - Training acc: 0.498787 -Training loss: 0.693005 - Val acc: 0.500000 - Val loss: 0.692872 - Time: 140.5435s
Epoch 2 - Training acc: 0.503212 -Training loss: 0.692962 - Val acc: 0.500000 - Val loss: 0.692848 - Time: 136.8893s
Epoch 3 - Training acc: 0.499412 -Training loss: 0.692954 - Val acc: 0.503857 - Val loss: 0.692806 - Time: 137.7879s
Epoch 4 - Training acc: 0.503587 -Training loss: 0.692957 - Val acc: 0.500000 - Val loss: 0.692897 - Time: 140.7704s
Epoch 5 - Training acc: 0.498738 -Training loss: 0.692962 - Val acc: 0.500000 - Val loss: 0.692784 - Time: 137.5566s


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "c:\users\hazim\anaconda3\envs\pytorch\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-17-75cc790ac333>", line 14, in <module>
    loss.backward()
  File "c:\users\hazim\anaconda3\envs\pytorch\lib\site-packages\torch\tensor.py", line 195, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "c:\users\hazim\anaconda3\envs\pytorch\lib\site-packages\torch\autograd\__init__.py", line 99, in backward
    allow_unreachable=True)  # allow_unreachable flag
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\hazim\anaconda3\envs\pytorch\lib\site-packages\IPython\core\interactiveshell.py", line 2034, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceb

KeyboardInterrupt: 

### Testing

In [None]:
### SOFTMAX

import sklearn.metrics
from sklearn.metrics import confusion_matrix

with torch.no_grad():
    running_acc_test = 0
    running_loss_test = 0
    all_pred=[]
    all_labels=[]
    for batch in test_iterator:
        batch.codes = batch.codes.to(device)
        batch.label = batch.label.to(device)
        output_test = model(batch.codes).squeeze(1)
        loss_test = criterion(output_test,batch.label)
        acc_test = softmax_accuracy(output_test,batch.label)
        running_acc_test += acc_test.item()
        running_loss_test += loss_test.item()
        all_pred += output_test.tolist()
        all_labels += batch.label.tolist()


print('Test acc: ',running_acc_test/len(test_iterator))
print('Test loss: ',running_loss_test/len(test_iterator))


def getClass(x):
    return(x.index(max(x)))

probs = pd.Series(all_pred)
all_predicted = probs.apply(getClass)
all_predicted.reset_index(drop=True, inplace=True)
vc = pd.value_counts(all_predicted == all_labels)

confusion = sklearn.metrics.confusion_matrix(y_true=all_labels, y_pred=all_predicted)
print('Confusion matrix: \n',confusion)

In [None]:
#### BINARY

import sklearn.metrics
from sklearn.metrics import confusion_matrix

with torch.no_grad():
    running_acc_test = 0
    running_loss_test = 0
    all_pred=[]
    all_labels=[]
    for batch in test_iterator:
        batch.codes = batch.codes.to(device)
        batch.label = batch.label.to(device)
        output_test = model(batch.codes).squeeze(1)
        loss_test = criterion(output_test,batch.label)
        acc_test = binary_accuracy(output_test,batch.label)
        running_acc_test += acc_test.item()
        running_loss_test += loss_test.item()
        all_pred += torch.round(output_test).tolist()
        all_labels += batch.label.tolist()


print('Test acc: ',running_acc_test/len(test_iterator))
print('Test loss: ',running_loss_test/len(test_iterator))

confusion = sklearn.metrics.confusion_matrix(y_true=all_labels, y_pred=all_pred)
print('Confusion matrix: \n',confusion)

# RESULTS