# DNS Tunneling Detection

Dataset credits: [chuayupeng/dns-tunnelling-detection](https://github.com/chuayupeng/dns-tunnelling-detection)

In [14]:
import time
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): this blanket filter presumably also hides scikit-learn's
# convergence / undefined-metric warnings, which look relevant to the models
# below that predict a single class — consider narrowing it; TODO confirm.
warnings.filterwarnings('ignore')

# Data Processing

In [15]:
# Load the feature table; the `attack` column is the class label.
data = pd.read_csv("data.csv")

# The previous `data.sample(frac=1)` call discarded its result, so it never
# shuffled anything. Shuffling is already done (reproducibly) by
# train_test_split below, which shuffles by default using `random_state`.
target = data['attack']

# Keep only the model features: drop the label itself, the first column
# (presumably a saved row index — TODO confirm) and the excluded columns.
# Reassigning instead of `inplace=True` keeps the statement chainable.
data = data.drop(columns=[data.columns[0], 'packet', 'attack', 'subdomainCount', 'queryNameLength'])

# 80/20 hold-out split with a fixed seed.
# NOTE(review): the reports below show heavily imbalanced classes; consider
# `stratify=target` — not applied here because it would change the split.
train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.2, random_state=50)

# Sanity check: the split must partition the rows without loss.
assert len(train_data) + len(test_data) == len(data)

In [16]:
def PrintStats(y_pred, test_target):
    """Print the misclassification count and the four confusion-matrix cells.

    A true label of ``-1`` is treated as the negative class and every other
    true label as positive. A prediction counts as positive only when it
    equals ``1``; any other predicted value counts as negative.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels, positionally aligned with ``test_target``.
    test_target : array-like
        Ground-truth labels (pandas Series, NumPy array or plain sequence).

    Returns
    -------
    None
        Results are only printed, so calling this as the last statement of a
        cell produces no extra ``Out[...]`` value.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    """
    import numpy as np  # local import: the notebook's import cell does not load numpy

    # Convert both inputs to plain arrays so the comparison is positional.
    # Indexing a pandas Series with an integer is label-based, which breaks on
    # the shuffled index that train_test_split leaves behind.
    truth = np.asarray(test_target)
    predicted = np.asarray(y_pred)
    if truth.shape != predicted.shape:
        raise ValueError(
            "y_pred and test_target must have the same shape, got %s and %s"
            % (predicted.shape, truth.shape)
        )

    print("Number of mislabeled points out of a total %d points : %d"
      % (truth.shape[0], (truth != predicted).sum()))

    truth_is_negative = truth == -1
    predicted_positive = predicted == 1

    # Vectorised equivalents of the four branches of the original row loop.
    tp = int(np.count_nonzero(~truth_is_negative & predicted_positive))
    tn = int(np.count_nonzero(truth_is_negative & ~predicted_positive))
    fp = int(np.count_nonzero(truth_is_negative & predicted_positive))
    fn = int(np.count_nonzero(~truth_is_negative & ~predicted_positive))

    print("True Positives :", tp)
    print("True Negatives :", tn)
    print("False Positives :", fp)
    print("False Negatives :", fn)

# Gaussian Naive Bayes Classifier

In [4]:
# Time one fit + predict round trip for Gaussian Naive Bayes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is the adjustable wall clock.
start = time.perf_counter()
gnb = GaussianNB()
gnb.fit(train_data, train_target)
y_pred = gnb.predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  8.888483047485352 millisec
Number of mislabeled points out of a total 3775 points : 88
True Positives : 3686
True Negatives : 1
False Positives : 88
False Negatives : 0


In [5]:
# Accuracy (as a percentage) of whatever `y_pred` currently holds; run this
# right after the Gaussian Naive Bayes cell.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  97.66887417218543 %


In [6]:
# Per-class precision / recall / F1 for the current `y_pred`
# (Gaussian Naive Bayes when the cells are run in order).
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       1.00      0.01      0.02        89
           1       0.98      1.00      0.99      3686

    accuracy                           0.98      3775
   macro avg       0.99      0.51      0.51      3775
weighted avg       0.98      0.98      0.97      3775



# Multinomial Naive Bayes Classifier

In [17]:
# Time one fit + predict round trip for Multinomial Naive Bayes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is the adjustable wall clock.
start = time.perf_counter()
MultiNB = MultinomialNB()
MultiNB.fit(train_data, train_target)
y_pred = MultiNB.predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  17.17972755432129 millisec
Number of mislabeled points out of a total 3775 points : 89
True Positives : 3686
True Negatives : 0
False Positives : 89
False Negatives : 0


In [18]:
# Accuracy (as a percentage) of whatever `y_pred` currently holds; run this
# right after the Multinomial Naive Bayes cell.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  97.64238410596026 %


In [19]:
# Per-class precision / recall / F1 for the current `y_pred`
# (Multinomial Naive Bayes when the cells are run in order).
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        89
           1       0.98      1.00      0.99      3686

   micro avg       0.98      0.98      0.98      3775
   macro avg       0.49      0.50      0.49      3775
weighted avg       0.95      0.98      0.96      3775



# Bernoulli Naive Bayes Classifier

In [20]:
# Time one fit + predict round trip for Bernoulli Naive Bayes.
# `binarize=0.05` thresholds every feature to 0/1 at 0.05 before fitting.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is the adjustable wall clock.
start = time.perf_counter()
BernNB = BernoulliNB(binarize = 0.05)
BernNB.fit(train_data, train_target)
y_pred = BernNB.predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  9.321928024291992 millisec
Number of mislabeled points out of a total 3775 points : 89
True Positives : 3686
True Negatives : 0
False Positives : 89
False Negatives : 0


In [21]:
# Accuracy (as a percentage) of whatever `y_pred` currently holds; run this
# right after the Bernoulli Naive Bayes cell.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  97.64238410596026 %


In [22]:
# Per-class precision / recall / F1 for the current `y_pred`
# (Bernoulli Naive Bayes when the cells are run in order).
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        89
           1       0.98      1.00      0.99      3686

   micro avg       0.98      0.98      0.98      3775
   macro avg       0.49      0.50      0.49      3775
weighted avg       0.95      0.98      0.96      3775



# Random Forest Classifier

In [13]:
# Time one fit + predict round trip for a 100-tree random forest.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is the adjustable wall clock.
start = time.perf_counter()
# Random forests bootstrap rows and sample features at random; without a seed
# the reported numbers change on every run. 50 matches the train/test split seed.
clf = RandomForestClassifier(n_estimators = 100, random_state = 50)
clf.fit(train_data, train_target)
y_pred = clf.predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  841.4759635925293 millisec
Number of mislabeled points out of a total 3775 points : 22
True Positives : 3683
True Negatives : 70
False Positives : 19
False Negatives : 3


In [14]:
# Accuracy (as a percentage) of whatever `y_pred` currently holds; run this
# right after the Random Forest cell.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  99.41721854304636 %


In [15]:
# Per-class precision / recall / F1 for the current `y_pred`
# (Random Forest when the cells are run in order).
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.96      0.79      0.86        89
           1       0.99      1.00      1.00      3686

    accuracy                           0.99      3775
   macro avg       0.98      0.89      0.93      3775
weighted avg       0.99      0.99      0.99      3775



# Decision Tree Classifier

In [16]:
# Time one fit + predict round trip for a single decision tree.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is the adjustable wall clock.
start = time.perf_counter()
# scikit-learn permutes features at each split, so equally good splits can be
# resolved differently between runs; seed the tree for reproducible output.
# 50 matches the train/test split seed.
clf = DecisionTreeClassifier(random_state = 50)
clf.fit(train_data, train_target)
y_pred = clf.predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  16.695737838745117 millisec
Number of mislabeled points out of a total 3775 points : 22
True Positives : 3683
True Negatives : 70
False Positives : 19
False Negatives : 3


In [17]:
# Accuracy (as a percentage) of whatever `y_pred` currently holds; run this
# right after the Decision Tree cell.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  99.41721854304636 %


In [18]:
# Per-class precision / recall / F1 for the current `y_pred`
# (Decision Tree when the cells are run in order).
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.96      0.79      0.86        89
           1       0.99      1.00      1.00      3686

    accuracy                           0.99      3775
   macro avg       0.98      0.89      0.93      3775
weighted avg       0.99      0.99      0.99      3775



# Multi Layer Perceptron

In [19]:
# Time one fit + predict round trip for a two-hidden-layer (15, 10) MLP
# trained with the L-BFGS solver and a fixed seed.
# NOTE(review): the features are fed in unscaled; MLPs usually train better on
# standardised inputs — consider a StandardScaler pipeline. TODO confirm.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is the adjustable wall clock.
start = time.perf_counter()
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15, 10), random_state=1)
clf.fit(train_data, train_target)
y_pred = clf.predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  3608.0780029296875 millisec
Number of mislabeled points out of a total 3775 points : 94
True Positives : 3679
True Negatives : 2
False Positives : 87
False Negatives : 7


In [20]:
# Accuracy (as a percentage) of whatever `y_pred` currently holds; run this
# right after the Multi Layer Perceptron cell.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  97.50993377483444 %


In [21]:
# Per-class precision / recall / F1 for the current `y_pred`
# (Multi Layer Perceptron when the cells are run in order).
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.22      0.02      0.04        89
           1       0.98      1.00      0.99      3686

    accuracy                           0.98      3775
   macro avg       0.60      0.51      0.51      3775
weighted avg       0.96      0.98      0.97      3775



# Linear Support Vector Machine 

In [23]:
# Time one fit + predict round trip for a linear SVM.
# NOTE(review): the features are fed in unscaled and all warnings are silenced
# at the top of the notebook, so a non-converged fit would go unnoticed —
# consider standardising the inputs. TODO confirm.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is the adjustable wall clock.
start = time.perf_counter()
# LinearSVC's solver can draw random numbers; seed it so repeated runs agree.
# 50 matches the train/test split seed.
SVML = LinearSVC(random_state = 50)
SVML.fit(train_data, train_target)
y_pred = SVML.predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  355.60011863708496 millisec
Number of mislabeled points out of a total 3775 points : 89
True Positives : 3686
True Negatives : 0
False Positives : 89
False Negatives : 0


In [24]:
# Accuracy (as a percentage) of whatever `y_pred` currently holds; run this
# right after the Linear SVM cell.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  97.64238410596026 %


In [25]:
# Per-class precision / recall / F1 for the current `y_pred`
# (Linear SVM when the cells are run in order).
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        89
           1       0.98      1.00      0.99      3686

   micro avg       0.98      0.98      0.98      3775
   macro avg       0.49      0.50      0.49      3775
weighted avg       0.95      0.98      0.96      3775



# Quadratic Support Vector Machine

In [26]:
# Time one fit + predict round trip for an SVM with a degree-2 polynomial
# kernel (gamma='scale').
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is the adjustable wall clock.
start = time.perf_counter()
SVMQ = SVC(kernel='poly', degree=2, gamma='scale')
SVMQ.fit(train_data, train_target)
y_pred = SVMQ.predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  9490.642786026001 millisec
Number of mislabeled points out of a total 3775 points : 89
True Positives : 3686
True Negatives : 0
False Positives : 89
False Negatives : 0


In [27]:
# Accuracy (as a percentage) of whatever `y_pred` currently holds; run this
# right after the Quadratic SVM cell.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  97.64238410596026 %


In [28]:
# Per-class precision / recall / F1 for the current `y_pred`
# (Quadratic SVM when the cells are run in order).
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        89
           1       0.98      1.00      0.99      3686

   micro avg       0.98      0.98      0.98      3775
   macro avg       0.49      0.50      0.49      3775
weighted avg       0.95      0.98      0.96      3775



# K Nearest Neighbours

In [28]:
# Time one fit + predict round trip for k-nearest neighbours with k = 2.
# NOTE(review): an even k allows 1-vs-1 vote ties between the two classes;
# an odd k would avoid them — left unchanged to keep the reported results.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is the adjustable wall clock.
start = time.perf_counter()
KNN = KNeighborsClassifier(n_neighbors = 2)
KNN.fit(train_data, train_target)
y_pred = KNN.predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  234.28940773010254 millisec
Number of mislabeled points out of a total 3775 points : 47
True Positives : 3646
True Negatives : 82
False Positives : 7
False Negatives : 40


In [29]:
# Accuracy (as a percentage) of whatever `y_pred` currently holds; run this
# right after the K Nearest Neighbours cell.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  98.75496688741721 %


In [30]:
# Per-class precision / recall / F1 for the current `y_pred`
# (K Nearest Neighbours when the cells are run in order).
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.67      0.92      0.78        89
           1       1.00      0.99      0.99      3686

    accuracy                           0.99      3775
   macro avg       0.84      0.96      0.89      3775
weighted avg       0.99      0.99      0.99      3775

