In [1]:
# Mount Google Drive (Colab only) so the dataset and model files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import os

import numpy as np
import pandas as pd

In [3]:
# Load the preprocessed dataset from the mounted Drive.
# NOTE(review): the preview of this frame shows three "Unnamed: *" columns, which look like
# index columns written out by earlier to_csv calls — confirm upstream and drop them if so.
data = pd.read_csv("/content/drive/MyDrive/data/preprocessed_data.csv")

In [4]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,vulnerable,code,len,vul_code
0,0,0,[1],PUSH1 0x80 PUSH1 0x40 MSTORE CALLVALUE DUP1 IS...,26729.0,"[0, 1, 0, 0, 0, 0]"
1,1,1,[3 2 1],PUSH1 0x80 PUSH1 0x40 MSTORE CALLVALUE DUP1 IS...,64351.0,"[0, 1, 1, 1, 0, 0]"
2,2,2,[4],PUSH1 0x80 PUSH1 0x40 MSTORE CALLVALUE DUP1 IS...,7465.0,"[0, 0, 0, 0, 1, 0]"
3,3,3,[4],PUSH1 0x80 PUSH1 0x40 MSTORE CALLVALUE DUP1 IS...,47062.0,"[0, 0, 0, 0, 1, 0]"
4,4,4,[2 5],PUSH1 0x80 PUSH1 0x40 MSTORE CALLVALUE DUP1 IS...,76061.0,"[0, 0, 1, 0, 0, 1]"


In [6]:
def encode(l):
  """Turn a stringified label vector such as "[0, 1, 0]" into a list of ints.

  The input is scanned one character at a time: brackets, commas and spaces
  are skipped and every remaining character is converted with ``int``, so
  each value is expected to be a single digit.

  Raises:
    ValueError: if any remaining character cannot be converted by ``int``.
  """
  skipped = ('[', ']', ',', ' ')
  return [int(ch) for ch in l if ch not in skipped]

In [7]:
# Parse each stringified label vector in 'vul_code' into a list of ints (one list per row).
y = data['vul_code'].map(encode)

In [8]:
# Materialise the parsed label Series as a plain list of per-row label lists.
# list() copies the elements in order, exactly like the manual append loop,
# without leaving a stray loop variable in the notebook namespace.
ohe_labels = list(y)

In [11]:
# Stack the per-row label lists into one array and display its shape (rows x labels).
labels = np.array(ohe_labels)
labels.shape

(79641, 6)

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from keras.optimizers import Adam


In [13]:
# Force the opcode column to str so the tokenizer always receives text.
# NOTE(review): astype(str) turns missing values into the literal string 'nan' — confirm there are none.
data['code']=data['code'].astype(str) # convert the stream of tokens to a string

In [14]:
# Hyper-parameters shared by the tokenizer, the padding step and the embedding layer.
MAX_WORDS = 41000  # vocabulary cap given to Tokenizer(num_words=...) and the Embedding input size
SEQ_LEN = 250  # maxlen given to pad_sequences
EMBEDDING_LAYER_DIM = 100  # output dimension of the Embedding layer


# Fit a lower-casing tokenizer on the disassembled opcode strings.
tokenizer = Tokenizer(num_words=MAX_WORDS,lower=True)
tokenizer.fit_on_texts(data['code']) #in disassembled data
# word_index lists every token seen while fitting, not only the MAX_WORDS most frequent ones
# that texts_to_sequences will keep, so the count printed below can far exceed MAX_WORDS.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 663452 unique tokens.


In [15]:
# Convert each opcode string to a list of integer token ids, then force every row to SEQ_LEN ids.
# NOTE(review): pad_sequences defaults to padding='pre' and truncating='pre', so long contracts
# keep only their last SEQ_LEN tokens — confirm that is the intended window.
X = tokenizer.texts_to_sequences(data['code']) #tokenizing sequences
X = pad_sequences(X, maxlen=SEQ_LEN)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (79641, 250)


In [21]:
# Hold out 20% of the samples for validation; random_state pins the shuffle so the split is repeatable.
X_train, X_val, Y_train, Y_val = train_test_split(X,labels, test_size = 0.2, random_state = 42) #splitting training data into train + validation
# Print the shapes of both splits as a quick sanity check.
print(X_train.shape,Y_train.shape)
print(X_val.shape,Y_val.shape)

(63712, 250) (63712, 6)
(15929, 250) (15929, 6)


In [22]:
# Adam optimizer with a fixed learning rate of 1e-4; handed to model.compile.
adam = Adam(learning_rate=0.0001)

In [23]:
# Embedding -> LSTM classifier over the padded opcode-id sequences.
model = Sequential()
model.add(Embedding(MAX_WORDS, EMBEDDING_LAYER_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# The targets are multi-hot vectors (a contract can carry several vulnerability
# labels at once, e.g. [0, 1, 1, 1, 0, 0]), so each of the 6 outputs must be an
# independent probability. Softmax + categorical cross-entropy assumes exactly
# one positive class per sample and cannot represent such targets.
model.add(Dense(6, activation='sigmoid'))
# binary_accuracy is requested explicitly so the reported metric is the
# per-label accuracy that matches the multi-label objective.
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['binary_accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 250, 100)          4100000   
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 250, 100)         0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense_1 (Dense)             (None, 6)                 606       
                                                                 
Total params: 4,181,006
Trainable params: 4,181,006
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
class TestCallback(tf.keras.callbacks.Callback):
    """Keras callback that evaluates the model on a held-out set after every epoch.

    Args:
        X_val: Inputs of the held-out set, passed to ``model.evaluate``.
        Y_val: Targets of the held-out set, passed to ``model.evaluate``.
    """

    def __init__(self, X_val, Y_val):
        # Let the base Callback initialise its own state before adding ours.
        super().__init__()
        self.X_val = X_val
        self.Y_val = Y_val

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on the stored held-out data and print the loss and first metric.

        Assumes the model was compiled with at least one metric, so that
        ``evaluate`` returns ``[loss, metric, ...]``.
        """
        print("\nEvaluating model")
        # Use the data handed to the constructor; the previous code silently
        # read same-named notebook globals and ignored the stored attributes.
        metrics = self.model.evaluate(self.X_val, self.Y_val, verbose=False)
        print(f"Model evaluation => loss : {metrics[0]}  accuracy: {metrics[1]}",)

In [27]:
epochs = 100
batch_size = 64

# Train on the training split; the custom callback evaluates the model on the
# validation split at the end of every epoch.
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[TestCallback(X_val, Y_val)],
)

In [None]:
# Persist the model architecture as JSON.
# Create the target folder first: open(..., 'w') does not create missing
# directories, and failing here would lose the result of a long training run.
os.makedirs('/content/drive/MyDrive/upd_model', exist_ok=True)
model_json = model.to_json()
with open('/content/drive/MyDrive/upd_model/model.json','w') as f:
  f.write(model_json)

In [None]:
# Save only the trained weights to an .h5 file; the architecture is written separately as JSON.
model.save_weights('/content/drive/MyDrive/upd_model/model.h5')