In [1]:
import os
from google.colab import drive
drive.mount('/content/drive')

# Best effort: move into the project folder on the mounted drive. The path is
# relative, so re-running this cell after a successful chdir raises (the
# relative path no longer resolves from the new working directory). Only
# OS-level failures are tolerated, and they are reported instead of being
# silently discarded.
try:
  os.chdir('./drive/MyDrive/weblog')
except OSError as err:
  print('Working directory left unchanged:', err)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [3]:
def featureExtract(filepath):
  """Build one whitespace-separated feature string per HTTP request in a log file.

  A request starts at a line whose first four characters are 'GET ', 'POST'
  or 'PUT '. Its feature string is made of:
    * the second whitespace-separated token of that line, with '//' collapsed
      to '/', split on '/', and every pair of adjacent segments concatenated
      into one token;
    * for each 'Content-Length:' line inside the request, the '&'-separated
      parameters found two lines further down, contributing the text before
      and after the first '=' as two tokens.

  Args:
    filepath: path of the text log to read.

  Returns:
    A list of feature strings, one per request, in file order. Every token is
    preceded by a single space.
  """
  # Context manager so the handle is released even if parsing fails.
  with open(filepath) as f:
    lines = f.readlines()

  features = []
  feature = None

  for idx, raw in enumerate(lines):
    line = raw.rstrip()

    if line[:4] in ('GET ', 'POST', 'PUT '):
      # A new request begins: flush the one collected so far.
      if feature is not None:
        features.append(feature)

      feature = ''
      chunks = line.split()
      if len(chunks) < 2:
        # Request line without a target: keep an empty feature so the
        # request is still counted.
        continue

      segments = chunks[1].replace('//', '/').split('/')
      for left, right in zip(segments, segments[1:]):
        feature += ' ' + left + right

    elif line.startswith('Content-Length:'):
      # Skip headers that appear before any request line, or so close to the
      # end of the file that the body line (two lines below) does not exist.
      if feature is None or idx + 2 >= len(lines):
        continue

      for pair in lines[idx + 2].rstrip().split('&'):
        # At most key and value; a parameter without '=' contributes only
        # its key instead of raising IndexError.
        for token in pair.split('=')[:2]:
          feature += ' ' + token

  if feature is not None:
    features.append(feature)

  return features

In [4]:
def anomalyExtract(filepath):
  """Return one 0/1 flag per HTTP request found in a log file.

  A request starts at a line whose first four characters are 'GET ', 'POST'
  or 'PUT '. Its flag starts at 0 and becomes 1 if a later line of the same
  request starts with 'Host:' and contains 'localhost:9090'.

  Args:
    filepath: path of the text log to read.

  Returns:
    A list of ints (0 or 1), one per request, in file order.
  """
  # Context manager so the handle is released even if parsing fails.
  with open(filepath) as f:
    lines = f.readlines()

  judge = []
  pred = None

  for raw in lines:
    line = raw.rstrip()

    if line[:4] in ('GET ', 'POST', 'PUT '):
      # A new request begins: flush the flag of the previous one.
      if pred is not None:
        judge.append(pred)
      pred = 0

    elif pred is not None and line.startswith('Host:') and 'localhost:9090' in line:
      # Only flag while inside a request; a Host line ahead of the first
      # request line would otherwise emit an extra flag and shift every
      # later flag by one position.
      pred = 1

  if pred is not None:
    judge.append(pred)

  return judge

In [5]:
# Per-request feature strings for the four log files, in the order:
# normal/train, normal/test, anomalous/train, anomalous/test.
feature1, feature2, feature3, feature4 = [
  featureExtract(path)
  for path in ('norm_train.txt', 'norm_test.txt', 'anomal_train.txt', 'anomal_test.txt')
]

In [6]:
# Per-request 0/1 flags for the same four log files, in the same order as the
# feature lists.
anomal1, anomal2, anomal3, anomal4 = [
  anomalyExtract(path)
  for path in ('norm_train.txt', 'norm_test.txt', 'anomal_train.txt', 'anomal_test.txt')
]

# Concatenate normal-then-anomalous so the flags line up with trainX / testX.
anomal_train = anomal1 + anomal3
anomal_test = anomal2 + anomal4

In [7]:
# Inputs: normal requests first, anomalous requests second.
trainX, testX = feature1 + feature3, feature2 + feature4

# Labels follow the source file: 0 for feature1/feature2, 1 for feature3/feature4.
trainY = [0 for _ in feature1] + [1 for _ in feature3]
testY = [0 for _ in feature2] + [1 for _ in feature4]

In [8]:
# Learn the TF-IDF vocabulary from the training strings only, then map the
# test strings onto that same vocabulary.
# NOTE(review): trainX / testX are rebound from lists of strings to the
# vectorized output, so this cell cannot be re-run on its own without first
# re-running the cell that builds the string lists.
vectorizer = TfidfVectorizer(
  analyzer="word",
  ngram_range=(1, 1),
  min_df=0.0,
  sublinear_tf=True,
)
trainX = vectorizer.fit_transform(trainX)
testX = vectorizer.transform(testX)

In [9]:
def train_and_test(model):
  """Fit `model` on the training split and print its test-split metrics.

  Reads the notebook-level names trainX, trainY, testX, testY and anomal_test.
  After prediction, each predicted label is OR-ed with the anomal_test flag at
  the same position, so a flag of 1 forces the prediction to 1. Prints
  accuracy, precision, recall, F1 and the confusion matrix; returns nothing.

  Args:
    model: an estimator exposing fit(X, y) and predict(X).
  """
  model.fit(trainX, trainY)
  predY = model.predict(testX)

  # Combine the model output with the per-request flag, position by position.
  for idx, _ in enumerate(predY):
    predY[idx] |= anomal_test[idx]

  scorers = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1Score:", f1_score),
  )
  for label, scorer in scorers:
    print(label, scorer(testY, predY))
  print(confusion_matrix(testY, predY))

In [10]:
# Linear SVM on the TF-IDF features. random_state is fixed so that repeated
# runs of this cell produce the same fit; metrics may differ slightly from a
# previously recorded unseeded run.
linear_svm = LinearSVC(C=2, random_state=42)
train_and_test(linear_svm)

Accuracy: 0.9964791615491689
Precision: 0.9979959919839679
Recall: 0.9934171154997008
F1Score: 0.9957012896131161
[[7190   10]
 [  33 4980]]


In [11]:
# Random forest with 32 trees. random_state is fixed so that repeated runs of
# this cell grow the same forest; without it the metrics change on every run,
# and they may differ from a previously recorded unseeded run.
rfc = RandomForestClassifier(n_estimators=32, random_state=42)
train_and_test(rfc)

Accuracy: 0.970195693113895
Precision: 0.9408306466906884
Recall: 0.9896269698783163
F1Score: 0.9646120941084969
[[6888  312]
 [  52 4961]]


In [12]:
# Gradient-boosted trees (XGBoost) with row and column subsampling.
# n_jobs is the scikit-learn-style thread-count argument; it replaces the
# deprecated `nthread` alias. -1 requests all available threads.
xgb = XGBClassifier(
  learning_rate=0.3,
  n_estimators=500,
  max_depth=6,
  min_child_weight=1,
  gamma=0.3,
  subsample=0.8,
  colsample_bytree=0.9,
  objective='binary:logistic',
  n_jobs=-1,
  scale_pos_weight=1,
)

train_and_test(xgb)

Accuracy: 0.9667567346270368
Precision: 0.9398510597670422
Recall: 0.9818471972870536
F1Score: 0.960390243902439
[[6885  315]
 [  91 4922]]


In [13]:
# Gradient-boosted trees (LightGBM): 400 boosting rounds at learning rate 0.3.
lgbm = LGBMClassifier(
  learning_rate=0.3,
  n_estimators=400,
)
train_and_test(lgbm)

Accuracy: 0.9700319331859494
Precision: 0.9431623116536334
Recall: 0.9864352683024137
F1Score: 0.9643135725429017
[[6902  298]
 [  68 4945]]
