# Playground for Transformers!

### Attention is all you need 
(https://arxiv.org/abs/1706.03762)

### For software vulnerability detection GYM

This is a minimal example of this **CRAZY** idea!

### Dataset

Import

In [1]:
import numpy as np
import pandas as pd
import os
import torch
import re
import torchtext
import matplotlib.pyplot as plt
import time
from torch.autograd import Variable
from torch import nn, optim
from torch.optim import SGD,Adam
import torch.nn.functional as F
import random

Load playset dataset

In [2]:
dataset = pd.read_pickle('playset(0.25.2).pickle')

View dataset

In [3]:
dataset

Unnamed: 0,functionSource,combine
93792,"go_file_opener_open (GOFileOpener const *fo, g...",False
79770,updatePathMap(bool left_level) {\n\tPoint from...,False
66999,interpret_tilde(const char* path) {\n stati...,False
44284,"checkVarExp(\n Absyn *node,\n Ta...",True
49515,will_have_skip_worktree(const struct cache_ent...,True
...,...,...
96701,"AVLTree_insert(AVLTree * tree, void * data)\n{...",False
67815,"remove_hook(const char *name, hookfn fn)\n{\n\...",False
88363,"output_def(dico_stream_t str, struct gcide_db ...",False
65929,getState(\n\t\tFLMUINT\t\tuiFieldID)\n\t{\n\t\...,False


In [4]:
dataset.describe(include='all')

Unnamed: 0,functionSource,combine
count,100000,100000
unique,100000,2
top,bgrewriteaofCommand(redisClient *c) {\n if ...,True
freq,1,50000


In [5]:
dataset.functionSource[1]

'checkCapture2() const\n{\n    for(int i=6;i<48;i++)\n    {\n        switch(board[i])\n        {\n        case MAN2:\n            if(board[i+5]==MAN1 || board[i+5]==KING1)\n                if(board[i+10]==FREE) return true;\n            if(board[i+6]==MAN1 || board[i+6]==KING1)\n                if(board[i+12]==FREE) return true;\n            break;\n        case KING2:\n            if(board[i-6]==MAN1 || board[i-6]==KING1)\n                if(board[i-12]==FREE) return true;\n            if(board[i-5]==MAN1 || board[i-5]==KING1)\n                if(board[i-10]==FREE) return true;\n            if(board[i+5]==MAN1 || board[i+5]==KING1)\n                if(board[i+10]==FREE) return true;\n            if(board[i+6]==MAN1 || board[i+6]==KING1)\n                if(board[i+12]==FREE) return true;\n        }\n    }\n    return false;\n}'

### Prepare data (in JSON)

#### Clean & Preprocessing

In [6]:
def clean(code):
    ## Remove newlines & tabs
    code = re.sub('(\n)|(\\\\n)|(\\\\)|(\\t)|(/)','',code)
    ## Remove code comments
    code = re.sub(r'/\*(.|[\r\n])*?\*/','',code)
    ## Mix split (characters and words)
    splitter = ' +|(;)|(\()|(==)|(\))|(=)|(\+)|(\-)|(\[)|(\])|(<)|(>)|({)|(#)|(\")'
    code = re.split(splitter,code)
    ## Remove None type
    code = list(filter(None, code))
    ## Return list of tokens
    return(code)

In [7]:
## Clean the codes
dataset.functionSource = dataset.functionSource.apply(clean)

In [8]:
## Change labels boolean to 1 and 0
dataset.iloc[:,1] = np.multiply(dataset.iloc[:,1],1)

In [9]:
## Change column name
dataset = dataset.rename(columns={'functionSource':'codes', 'combine':'label'})

In [10]:
false = dataset[dataset.iloc[:,1]==0]
true = dataset[dataset.iloc[:,1]==1]

In [11]:
## Split to train,test,valid
train = false[0:400].append(true[0:400])
test  = false[400:450].append(true[400:450])
valid = false[450:500].append(true[450:500])

In [12]:
## Save to json
train.to_json('.data/train.json', orient='records',lines=True)
test.to_json('.data/test.json', orient='records',lines=True)
valid.to_json('.data/valid.json', orient='records',lines=True)

In [13]:
## Define the field

CODES = torchtext.data.Field(batch_first=True)
LABEL = torchtext.data.LabelField(dtype=torch.float)
fields = {'codes': ('codes', CODES), 'label': ('label', LABEL)}

In [14]:
## Import the data as TabularDataset
train_data, valid_data, test_data = torchtext.data.TabularDataset.splits(
                                        path = '.data',
                                        train = 'train.json',
                                        validation = 'valid.json',
                                        test = 'test.json',
                                        format = 'json',
                                        fields = fields)

In [15]:
## Testing
print(vars(train_data[0]))

{'codes': ['go_file_opener_open', '(', 'GOFileOpener', 'const', '*fo,', 'gchar', 'const', '*opt_enc,', 'GOIOContext', '*io_context,', 'GoView', '*view,', 'GsfInput', '*input', ')', '{', 'g_return_if_fail', '(', 'GO_IS_FILE_OPENER', '(', 'fo', ')', ')', ';', 'g_return_if_fail', '(', 'GSF_IS_INPUT', '(', 'input', ')', ')', ';', 'GO_FILE_OPENER_METHOD', '(', 'fo,', 'open', ')', '(', 'fo,', 'opt_enc,', 'io_context,', 'view,', 'input', ')', ';', '}'], 'label': 0}


Doneeeeeeeeeeeeeeeee !

### Vocabulary-related preparation

In [16]:
##### Build the vocabulary

MAX_VOCAB_SIZE = 5000

CODES.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [17]:
print(f"Unique tokens in TEXT vocabulary: {len(CODES.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 5002
Unique tokens in LABEL vocabulary: 2


In [26]:
## Most common word
print(CODES.vocab.freqs.most_common(20))

[('(', 14063), (')', 14060), (';', 13060), ('-', 6902), ('=', 6370), ('>', 4926), ('"', 4110), ('{', 3205), ('*', 2795), ('if', 2622), ('}', 2080), ('0', 1942), ('+', 1935), ('[', 1827), (']', 1824), ('return', 1446), (',', 1408), ('i', 1170), ('1', 1113), ('<', 1031)]


In [28]:
print(CODES.vocab.itos[:10])
print(LABEL.vocab.stoi)

['<unk>', '<pad>', '(', ')', ';', '-', '=', '>', '"', '{']
defaultdict(None, {0: 0, 1: 1})


In [29]:
## place into iterators
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = 128)

# Transformer

### Init all the non important stuffs

In [7]:
seed = 1234
torch.manual_seed(seed)
#torch.device("cpu");
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'