In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import urllib.parse
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
%load_ext autotime

time: 147 µs (started: 2021-07-28 21:07:35 +00:00)


In [3]:
## Loading data

time: 6.2 ms (started: 2021-07-28 21:07:35 +00:00)


In [4]:
def loadFile(name):
    filepath = os.path.join(str(os.getcwd()), name)
    with open(filepath,'r') as f:
        data = f.readlines()
    return [str(urllib.parse.unquote(e)) for e in list(set(data))]

time: 1.87 ms (started: 2021-07-28 21:07:35 +00:00)


In [5]:
badQueries = loadFile('dataset/badqueries.txt')

badCount = len(badQueries)

time: 131 ms (started: 2021-07-28 21:07:35 +00:00)


In [6]:
validQueries = loadFile('dataset/goodqueries.txt')

validCount = len(validQueries)

time: 935 ms (started: 2021-07-28 21:07:35 +00:00)


In [7]:
queries = badQueries + validQueries

time: 15.8 ms (started: 2021-07-28 21:07:36 +00:00)


In [8]:
print("bad: ", badCount)
print("good: ", validCount)
print("all: ", badCount + validCount)

bad:  44713
good:  1265994
all:  1310707
time: 7.53 ms (started: 2021-07-28 21:07:36 +00:00)


In [9]:
yBad = [1 for i in range(0, len(badQueries))]  #labels, 1 for malicious and 0 for clean
yGood = [0 for i in range(0, len(validQueries))]

time: 52.1 ms (started: 2021-07-28 21:07:36 +00:00)


In [10]:
y = yBad + yGood

time: 5.02 ms (started: 2021-07-28 21:07:36 +00:00)


In [11]:
## Preparing the dataset

time: 2.12 ms (started: 2021-07-28 21:07:36 +00:00)


In [12]:
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="char", sublinear_tf=True, ngram_range=(1,3)) #converting data to vectors
X = vectorizer.fit_transform(queries)

time: 37.2 s (started: 2021-07-28 21:07:36 +00:00)


In [13]:
X

<1310707x97832 sparse matrix of type '<class 'numpy.float64'>'
	with 60129598 stored elements in Compressed Sparse Row format>

time: 3.04 ms (started: 2021-07-28 21:08:13 +00:00)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) #splitting data

time: 602 ms (started: 2021-07-28 21:08:13 +00:00)


In [15]:
## Training

time: 209 µs (started: 2021-07-28 21:08:14 +00:00)


In [16]:
lgs = LogisticRegression(class_weight={1: 2 * validCount / badCount, 0: 1.0}) # class_weight='balanced')
lgs.fit(X_train, y_train) #training our model

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(class_weight={0: 1.0, 1: 56.62755798090041})

time: 29.4 s (started: 2021-07-28 21:08:14 +00:00)


In [17]:
## Metrics

time: 349 µs (started: 2021-07-28 21:08:43 +00:00)


In [18]:
predicted = lgs.predict(X_test)

time: 26.7 ms (started: 2021-07-28 21:08:43 +00:00)


In [19]:
fpr, tpr, _ = metrics.roc_curve(y_test, (lgs.predict_proba(X_test)[:, 1]))
auc = metrics.auc(fpr, tpr)

time: 107 ms (started: 2021-07-28 21:08:43 +00:00)


In [20]:
print("Bad samples: %d" % badCount)
print("Good samples: %d" % validCount)
print("Baseline Constant negative: %.6f" % (validCount / (validCount + badCount)))
print("------------")
print("Accuracy: %f" % lgs.score(X_test, y_test))  #checking the accuracy
print("Precision: %f" % metrics.precision_score(y_test, predicted))
print("Recall: %f" % metrics.recall_score(y_test, predicted))
print("F1-Score: %f" % metrics.f1_score(y_test, predicted))
print("AUC: %f" % auc)

Bad samples: 44713
Good samples: 1265994
Baseline Constant negative: 0.965886
------------
Accuracy: 0.999371
Precision: 0.984261
Recall: 0.997799
F1-Score: 0.990984
AUC: 0.999971
time: 454 ms (started: 2021-07-28 21:08:43 +00:00)


In [21]:
## Testing

time: 228 µs (started: 2021-07-28 21:08:44 +00:00)


In [22]:
def url_is_bad(url):
    X_url = vectorizer.transform([url])
    return bool(lgs.predict(X_url)[0])

time: 4.54 ms (started: 2021-07-28 21:08:44 +00:00)


In [23]:
url = "/index.php?q=../../../../../../../../../etc/passwd"
print("Is bad ?", url_is_bad(url))

Is bad ? True
time: 3.2 ms (started: 2021-07-28 21:08:44 +00:00)


In [24]:
url = "/test.php?q=data"
print("Is bad ?", url_is_bad(url))

Is bad ? False
time: 2.37 ms (started: 2021-07-28 21:08:44 +00:00)
