In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Flatten
from tensorflow.keras.optimizers import SGD

import urllib.parse
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [3]:
## Loading data

In [4]:
def loadFile(name):
    filepath = os.path.join(str(os.getcwd()), name)
    with open(filepath,'r') as f:
        data = f.readlines()
    return [str(urllib.parse.unquote(e)) for e in list(set(data))]

In [5]:
badQueries = loadFile('dataset/badqueries.txt')

badCount = len(badQueries)

In [6]:
validQueries = loadFile('dataset/goodqueries.txt')

validCount = len(validQueries)

In [7]:
queries = badQueries + validQueries

In [8]:
print("bad: ", badCount)
print("good: ", validCount)
print("all: ", badCount + validCount)

bad:  44713
good:  1265994
all:  1310707


In [9]:
yBad = [1 for i in range(0, len(badQueries))]  #labels, 1 for malicious and 0 for clean
yGood = [0 for i in range(0, len(validQueries))]

In [10]:
y = yBad + yGood

In [11]:
## Preparing the dataset

In [12]:
tokenizer = Tokenizer(filters="", lower=False, char_level=True)

In [13]:
X = tokenizer.fit_on_texts(queries)

In [14]:
X = tokenizer.texts_to_matrix(queries)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) #splitting data

In [16]:
## Building the model

In [17]:
def gen_model(input_size, hidden_layer_count, hidden_layer_size):
    model = Sequential()
    model.add(Dense(input_size))
    for _ in range(hidden_layer_count):
        model.add(Dense(hidden_layer_size, activation='sigmoid'))
    model.add(Dense(1))
    
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    
    return model

In [18]:
model = gen_model(X.shape[-1], 6, 128)

In [19]:
## Training

In [20]:
def train_model(model, epochs, batch_size, X_train, y_train, X_test, y_test):
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))
    
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=3)

    return model, test_loss, test_acc

In [21]:
model, test_loss, test_acc = train_model(model, 5, 32, X_train, y_train, X_test, y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
## Metrics

In [23]:
print("Loss: ", test_loss)
print("Accuracy: ", test_acc)

Loss:  0.00478811701759696
Accuracy:  0.9944304823875427


In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 197)               39006     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               25344     
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_5 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_6 (Dense)              (None, 128)               1

In [25]:
## Testing

In [44]:
def url_is_bad(url):
    X_url = tokenizer.texts_to_matrix([url])
    return round(max(0., min(1., model.predict(X_url)[0][0])) * 100, 2)

In [51]:
url = "/index.php?q=../../../../../../../../../etc/passwd"
print("Is bad ?", url_is_bad(url), "%")

Is bad ? 100.0 %


In [62]:
url = "/test.php?q=data"
print("Is bad ?", url_is_bad(url), "%")

Is bad ? 24.35 %
