In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Flatten
from tensorflow.keras.optimizers import SGD

import urllib.parse
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
## Loading data

In [None]:
def loadFile(name):
    """Load a query file and return its unique, URL-decoded lines.

    Args:
        name: Path of a text file holding one query per line. A relative
            path is resolved against the current working directory.

    Returns:
        list[str]: The distinct lines of the file in first-seen order, with
        the line terminator removed and percent-escapes decoded.
    """
    filepath = os.path.join(os.getcwd(), name)
    # Decode explicitly as UTF-8 (the charset unquote() assumes by default)
    # and replace undecodable bytes instead of relying on the platform locale.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        # Strip the line terminator so it does not become part of every
        # sample; strings scored later by url_is_bad() do not carry one.
        lines = [line.rstrip('\n') for line in f]
    # dict.fromkeys de-duplicates the raw lines while keeping file order; a
    # set would yield a different order on each run (string hash randomization).
    return [urllib.parse.unquote(line) for line in dict.fromkeys(lines)]

In [None]:
# Load the malicious sample set (unique, URL-decoded lines) and record its size.
badQueries = loadFile('dataset/badqueries.txt')

badCount = len(badQueries)

In [None]:
# Load the clean sample set (unique, URL-decoded lines) and record its size.
validQueries = loadFile('dataset/goodqueries.txt')

validCount = len(validQueries)

In [None]:
# Bad queries come first, then valid ones; the label list built below must
# follow the same order.
queries = [*badQueries, *validQueries]

In [None]:
# Report the size of each class and of the combined dataset.
for label, count in (
    ("bad: ", badCount),
    ("good: ", validCount),
    ("all: ", badCount + validCount),
):
    print(label, count)

In [None]:
# One-hot labels: [0, 1] marks a malicious query, [1, 0] a clean one
# (so output index 1 is the "bad" class).
yBad = [[0, 1] for _ in badQueries]
yGood = [[1, 0] for _ in validQueries]

In [None]:
# Same bad-then-good order as `queries`, so labels line up with samples.
y = [*yBad, *yGood]

In [None]:
## Preparing the dataset

In [None]:
# Character-level tokenizer: no characters are filtered out and case is kept,
# so every distinct character seen in the queries gets its own index.
tokenizer = Tokenizer(filters="", lower=False, char_level=True)

In [None]:
# Build the character vocabulary from all queries. fit_on_texts() updates the
# tokenizer in place and returns None, so there is nothing to assign.
tokenizer.fit_on_texts(queries)

In [None]:
# Encode every query as one fixed-width row of the feature matrix, using the
# tokenizer's default matrix mode.
# NOTE(review): this looks like a bag-of-characters encoding (character order
# is not kept) — confirm that is intended.
X = tokenizer.texts_to_matrix(queries)

In [None]:
# Hold out 20% of the samples for testing. The fixed seed keeps the shuffled
# split identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
## Building the model

In [None]:
def gen_model(input_size, hidden_layer_count, hidden_layer_size):
    """Build and compile a fully connected two-class classifier.

    Args:
        input_size: Number of units of the first Dense layer (no activation);
            the notebook passes the feature-vector width here.
        hidden_layer_count: Number of sigmoid hidden layers to stack.
        hidden_layer_size: Number of units in each hidden layer.

    Returns:
        A compiled Sequential model (Adam optimizer, accuracy metric) whose
        two outputs are class probabilities.
    """
    model = Sequential()
    model.add(Dense(input_size))
    for _ in range(hidden_layer_count):
        model.add(Dense(hidden_layer_size, activation='sigmoid'))
    # The targets are one-hot pairs, so emit a probability distribution over
    # the two classes and train it with cross-entropy rather than regressing
    # unbounded linear outputs with mean squared error.
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# Six hidden layers of 128 units each; the first layer is as wide as the
# feature matrix has columns.
model = gen_model(
    input_size=X.shape[-1],
    hidden_layer_count=6,
    hidden_layer_size=128,
)

In [None]:
## Training

In [None]:
def train_model(model, epochs, batch_size, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    All four data arguments are converted to NumPy arrays first. The test
    split is also handed to ``fit`` as its validation data.

    Returns:
        tuple: ``(model, test_loss, test_acc)``, the last two being the pair
        returned by ``model.evaluate`` for the test split.
    """
    train_inputs, train_targets, test_inputs, test_targets = (
        np.asarray(part) for part in (X_train, y_train, X_test, y_test)
    )

    model.fit(
        train_inputs,
        train_targets,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(test_inputs, test_targets),
    )

    scores = model.evaluate(test_inputs, test_targets, verbose=3)
    test_loss, test_acc = scores
    return model, test_loss, test_acc

In [None]:
# Train for 5 epochs with mini-batches of 32, then score on the held-out split.
model, test_loss, test_acc = train_model(
    model,
    epochs=5,
    batch_size=32,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
## Metrics

In [None]:
# Show the loss and accuracy measured on the test split.
for label, value in (("Loss: ", test_loss), ("Accuracy: ", test_acc)):
    print(label, value)

In [None]:
# Print the layer-by-layer description of the model trained above.
model.summary()

In [None]:
## Testing

In [None]:
def url_is_bad(url):
    """Classify a single URL/query string with the trained model.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        url: The query string to score.

    Returns:
        bool: True when the highest-scoring output is index 1, the position
        that holds the 1 in the labels given to the bad queries.
    """
    X_url = tokenizer.texts_to_matrix([url])
    url_prediction = model.predict(X_url)
    # np.argmax yields the first index holding the largest score.
    return int(np.argmax(url_prediction[0])) == 1

In [None]:
# Path-traversal style query aimed at /etc/passwd; presumably this one is
# expected to be flagged as bad.
url = "/index.php?q=../../../../../../../../../etc/passwd"
print("Is bad ?", url_is_bad(url))

In [None]:
# Ordinary-looking query; presumably this one is expected not to be flagged.
url = "/test.php?q=data"
print("Is bad ?", url_is_bad(url))