In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [2]:
# Colab-only step: attach Google Drive so the dataset CSV is reachable
# under /content/drive. This prompts for authorisation on first run.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# Load the XSS dataset from the mounted Drive. 'utf-8-sig' strips a leading
# byte-order mark if the file has one.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Second_Sem Projects/Database Security Project/Dataset/XSS_dataset.csv',
    encoding='utf-8-sig',
)
# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Label
0,0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,2,"\t </span> <span class=""reference-text"">Steeri...",0
3,3,"\t </span> <span class=""reference-text""><cite ...",0
4,4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0


In [4]:
def data2char_index(X, max_len):
    """Encode strings as fixed-width rows of character indices.

    Each character is replaced by its position in a fixed 70-symbol alphabet
    (space, lowercase a-z, digits and punctuation). Characters that are not in
    the alphabet are skipped. Every row is then truncated or zero-padded at
    the end so it has exactly ``max_len`` columns.

    Args:
        X: Iterable of strings. Missing entries (``None`` or float NaN, as
            pandas produces for empty cells) are encoded as an all-padding
            row instead of raising, so rows stay aligned with their labels.
        max_len: Number of columns in the result. If ``None``, the width is
            the length of the longest encoded sequence.

    Returns:
        ``numpy.ndarray`` of dtype ``int32`` and shape ``(len(X), max_len)``.

    Notes:
        * Space has index 0 and the padding value is also 0, so trailing
          padding cannot be told apart from spaces.
          NOTE(review): shifting all indices by one would fix this, but the
          embedding vocabulary size must be changed in the same step.
        * The alphabet holds lowercase letters only and the input is not
          lowercased, so uppercase letters are dropped.
          NOTE(review): confirm this is intended.
    """
    alphabet = " abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"

    # One dict lookup per character instead of two linear scans of the
    # alphabet. '-' appears twice in the alphabet; setdefault keeps the first
    # position, which is what str.index would have returned.
    char_to_index = {}
    for position, symbol in enumerate(alphabet):
        char_to_index.setdefault(symbol, position)

    encoded = []
    for data in X:
        # Treat missing values as empty text rather than crashing while
        # iterating over a float/None.
        if data is None or (isinstance(data, float) and np.isnan(data)):
            data = ""
        codes = []
        for ch in data:
            code = char_to_index.get(ch)
            if code is None:
                continue
            codes.append(code)
            # Rows are truncated at the end, so anything past max_len kept
            # characters would be discarded anyway.
            if max_len is not None and len(codes) >= max_len:
                break
        encoded.append(codes)

    width = max_len if max_len is not None else max(map(len, encoded), default=0)

    # Post-pad with zeros / post-truncate into an int32 matrix.
    X_char = np.zeros((len(encoded), width), dtype=np.int32)
    for row, codes in enumerate(encoded):
        kept = codes[:width]
        if kept:
            X_char[row, :len(kept)] = kept
    return X_char

In [5]:
# Raw text and binary labels as NumPy arrays.
data = df['Sentence'].values
label = df['Label'].values

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
trainX, testX, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=42,
)

# Encode both partitions identically: character indices, 1000 columns each.
x_train, x_test = (
    data2char_index(partition, max_len=1000) for partition in (trainX, testX)
)

In [6]:
# Sanity check: rows = training samples, columns = the max_len (1000) used when encoding.
x_train.shape

(10948, 1000)

In [7]:
# Sanity check: rows = held-out samples, columns = the max_len (1000) used when encoding.
x_test.shape

(2738, 1000)

In [8]:
def get_charcnn_model(max_len, vocab_size=70, embed_dim=80):
    """Build a three-branch character-level CNN for binary classification.

    The input is a batch of integer character-index sequences of length
    ``max_len``. The sequences are embedded, passed through three parallel
    Conv1D + MaxPooling1D branches with different kernel widths, concatenated
    along the time axis, flattened, and fed through two dense layers into a
    single sigmoid unit.

    Args:
        max_len: Length of each input sequence.
        vocab_size: Number of distinct character indices the embedding
            accepts. Every input index must be below this value. Defaults to
            70, the value previously hard-coded.
        embed_dim: Size of each character embedding vector. Defaults to 80,
            the value previously hard-coded.

    Returns:
        An uncompiled ``tf.keras.Model``.

    Notes:
        No pretrained weights are loaded and the embedding is created with
        ``trainable=False``, so its weights keep their initial values during
        training. NOTE(review): confirm freezing it is intended.
    """
    main_input = tf.keras.layers.Input(shape=(max_len,))

    # `input_length` is deliberately not passed: the Input layer already
    # fixes the sequence length, and the argument is deprecated in Keras 3.
    embedder = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embed_dim,
        trainable=False,
    )
    embed = embedder(main_input)

    # (kernel_size, pool_size) for each parallel branch; every branch reads
    # the same embedded sequence.
    branch_specs = ((5, 12), (10, 11), (15, 10))
    branches = []
    for kernel_size, pool_size in branch_specs:
        conv = tf.keras.layers.Conv1D(
            32, kernel_size, padding='same', strides=1, activation='relu'
        )(embed)
        branches.append(tf.keras.layers.MaxPooling1D(pool_size=pool_size)(conv))

    # Pooled branches have different lengths, so they are stacked along the
    # time axis (axis=1), where only the channel count has to match.
    cnn = tf.keras.layers.concatenate(branches, axis=1)
    flat = tf.keras.layers.Flatten()(cnn)
    drop = tf.keras.layers.Dropout(0.2)(flat)
    dense1 = tf.keras.layers.Dense(1024, activation='relu')(drop)
    dense2 = tf.keras.layers.Dense(128, activation='relu')(dense1)
    main_output = tf.keras.layers.Dense(1, activation='sigmoid')(dense2)
    model = tf.keras.Model(inputs=main_input, outputs=main_output)
    return model

In [9]:
# Build the network for 1000-character inputs, matching the encoding width.
model = get_charcnn_model(max_len=1000)

# Single sigmoid output -> binary cross-entropy; accuracy is tracked per epoch.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1000)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1000, 80)     5600        ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 1000, 32)     12832       ['embedding[0][0]']              
                                                                                                  
 conv1d_1 (Conv1D)              (None, 1000, 32)     25632       ['embedding[0][0]']              
                                                                                              

In [None]:
# Training hyper-parameters.
batch_size = 128
num_epoch = 15

# Train the model and keep the returned History object in `model_log`.
# NOTE(review): the held-out test split is also used as validation data
# here, so the per-epoch validation numbers are computed on the same rows as
# the final evaluation.
model_log = model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=num_epoch,
    batch_size=batch_size,
    verbose=1,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Sigmoid outputs for the held-out set.
pred = model.predict(x_test)

# Threshold at 0.5 to get hard 0/1 labels as int64.
y_pred = (pred > 0.5).astype(np.int64)

In [None]:
# Score the thresholded predictions against the true labels with each metric.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
print(" Accuracy : {0} \n Precision : {1} \n Recall : {2}".format(accuracy, precision, recall))

 Accuracy : 0.9992695398100804 
 Precision : 1.0 
 Recall : 0.9986468200270636
