In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import train_test_split

In [9]:
# Read the dataset CSV into a DataFrame (path is relative to the working directory).
csv_path = 'Balanced_Dataset.csv'
df = pd.read_csv(csv_path)

In [10]:
# Drop the 'Unnamed: 0' column (presumably the row index written out when the CSV
# was saved -- TODO confirm). `drop(..., errors='ignore')` returns a new frame and
# does not raise when the column is already gone, so this cell can be re-run
# safely; the previous `del df[...]` raised KeyError on a second execution.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.shape

(28730, 21)

In [11]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,Source Port,Protocol,Flow Duration,Fwd Packet Length Min,Bwd Packet Length Min,Fwd IAT Total,Bwd IAT Total,Fwd PSH Flags,Fwd Packets/s,Min Packet Length,...,ACK Flag Count,URG Flag Count,CWE Flag Count,Init_Win_bytes_forward,min_seg_size_forward,Idle Mean,Idle Max,Idle Min,Inbound,Label
0,947,17,743,440.0,0.0,743.0,0.0,0,156123.822342,440.0,...,0,0,0,-1,8,0.0,0.0,0.0,1,DrDoS_NTP
1,526,17,1562,440.0,0.0,1562.0,0.0,0,23047.37516,440.0,...,0,0,0,-1,-1,0.0,0.0,0.0,1,DrDoS_NTP
2,516,17,1264,440.0,0.0,1264.0,0.0,0,25316.455696,440.0,...,0,0,0,-1,20,0.0,0.0,0.0,1,DrDoS_NTP
3,600,17,196,440.0,0.0,196.0,0.0,0,20408.163265,440.0,...,0,0,0,-1,20,0.0,0.0,0.0,1,DrDoS_NTP
4,803,17,1716,440.0,0.0,1716.0,0.0,0,2331.002331,440.0,...,0,0,0,-1,-1,0.0,0.0,0.0,1,DrDoS_NTP


### Verify if the dataset is balanced

In [12]:
# Count rows per class. The column name really does start with a space.
label_counts = df[' Label'].value_counts()
label_counts

BENIGN       14365
DrDoS_NTP    14365
Name:  Label, dtype: int64

In [15]:
# Shared settings: the seed passed to the split and to the seeded models, and the
# name of the target column (the leading space is part of the name).
random_state = 1
dep_var = ' Label'
num_classes = 0

# Feature matrix: every column except the target.
# NOTE(review): with its default arguments `normalize` rescales each row (sample)
# to unit L2 norm rather than scaling each feature column -- confirm this is the
# intended preprocessing.
feature_frame = df.drop(columns=[dep_var])
X = normalize(feature_frame.values)

# Integer-encode the string class labels.
label_encoder = LabelEncoder()
y = label_encoder.fit(df[dep_var]).transform(df[dep_var])

### Split the dataset: 33% for testing, the remainder for training

In [18]:
# Shuffled, label-stratified hold-out split: 33% of the rows go to the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=random_state,
    shuffle=True,
    stratify=y,
)

## Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single seeded decision tree on the training split, then report its
# accuracy on the held-out test split.
dt = DecisionTreeClassifier(random_state=random_state)
dt.fit(X_train, y_train)
dt_accuracy = dt.score(X_test, y_test)
print('Decision Tree testing accuracy: {:.2f}%'.format(100*dt_accuracy))

Decision Tree testing accuracy: 99.80%


## Random Forest

In [20]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# Fit a seeded 100-tree random forest on the training split, then report its
# accuracy on the held-out test split.
rf = RandomForestClassifier(n_estimators=100, random_state=random_state)
rf.fit(X_train, y_train)
rf_accuracy = rf.score(X_test, y_test)
print('Random Forest testing accuracy: {:.2f}%'.format(100*rf_accuracy))

Random Forest testing accuracy: 99.84%


## K-Nearest Neighbors

In [21]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbour classifier (kd-tree search) on the training split,
# then report its accuracy on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree')
knn.fit(X_train, y_train)
knn_accuracy = knn.score(X_test, y_test)
print('k-Nearest Neighbors testing accuracy: {:.2f}%'.format(100*knn_accuracy))

k-Nearest Neighbors testing accuracy: 99.64%


## AdaBoost

In [22]:
# AdaBoost ensemble (100 rounds, seeded) using the decision tree `dt` as its base
# learner; reports accuracy on the held-out test split.
# The base learner is passed positionally on purpose: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed
# in 1.4, so `base_estimator=dt` raises TypeError on current releases. It is the
# first positional parameter under both names.
adadt = AdaBoostClassifier(dt, n_estimators=100, random_state=random_state)
adadt.fit(X_train, y_train)
print('AdaBoost-Decision Tree testing accuracy: {:.2f}%'.format(100*adadt.score(X_test, y_test)))

AdaBoost-Decision Tree testing accuracy: 99.78%


## Support Vector Machine

In [23]:
from sklearn import svm

# Fit a support vector classifier with default settings on the training split,
# then report its accuracy on the held-out test split.
clf = svm.SVC()
clf.fit(X_train, y_train)
svm_accuracy = clf.score(X_test, y_test)
print('Support Vector Machine testing accuracy: {:.2f}%'.format(100*svm_accuracy))



Support Vector Machine testing accuracy: 87.81%


# Various ML Classifiers with 10-Fold Cross-Validation

In [27]:
from sklearn.model_selection import cross_val_score

# Cross-validation settings. The printed label below is derived from `cv_scoring`,
# so the report always names the metric that was actually computed (previously the
# scores were precision values but were printed as "Accuracy").
# NOTE(review): 'precision' is computed for the class encoded as 1 -- presumably
# DrDoS_NTP given the two labels shown above; confirm via label_encoder.classes_.
cv_folds = 10
cv_scoring = 'precision'

# Fresh, unfitted estimators configured the same way as in the hold-out experiments.
dt = DecisionTreeClassifier(random_state=random_state)
rf = RandomForestClassifier(n_estimators=100, random_state=random_state)
knn = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree')
# Base learner passed positionally: the keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 (old name removed in 1.4).
adadt = AdaBoostClassifier(dt, n_estimators=100, random_state=random_state)
svma = svm.SVC()

# One array of per-fold scores per model, evaluated on the full feature matrix.
scoresdt = cross_val_score(dt, X, y, cv=cv_folds, scoring=cv_scoring)
scoresrf = cross_val_score(rf, X, y, cv=cv_folds, scoring=cv_scoring)
scoresknn = cross_val_score(knn, X, y, cv=cv_folds, scoring=cv_scoring)
scoresadadt = cross_val_score(adadt, X, y, cv=cv_folds, scoring=cv_scoring)
scoressvma = cross_val_score(svma, X, y, cv=cv_folds, scoring=cv_scoring)

# Report mean +/- two standard deviations across the folds for each model.
metric_label = cv_scoring.capitalize()
cv_report = [
    ('Decision Tree', scoresdt),
    ('Random Forest', scoresrf),
    ('K-nearest Neighbor', scoresknn),
    ('Ada Boost', scoresadadt),
    ('Support Vector Machine', scoressvma),
]
for model_name, fold_scores in cv_report:
    print("%s for %s: %0.2f (+/- %0.2f)" % (metric_label, model_name, fold_scores.mean(), fold_scores.std() * 2))



Accuracy for Decision Tree: 1.00 (+/- 0.01)
Accuracy for Random Forest: 1.00 (+/- 0.00)
Accuracy for K-nearest Neighbor: 1.00 (+/- 0.01)
Accuracy for Ada Boost: 1.00 (+/- 0.01)
Accuracy for Support Vector Machine: 0.81 (+/- 0.05)
