IMPORTS

In [4]:
# Load the Pandas libraries with alias 'pd' 
import pandas as pd 
import numpy as np
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.feature_selection import VarianceThreshold
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

READING DATASET AND EXAMINING LABEL


In [5]:
dataset = pd.read_csv("c:/Darknet_all.csv" , low_memory=False) 
df = pd.DataFrame(dataset)
display(df)
print (df['Label'].value_counts(ascending=True))



Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,10.152.152.11-216.58.220.99-57158-443-6,10.152.152.11,57158,216.58.220.99,443,6,229,1,1,0,...,20,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,AUDIO-STREAMING
1,10.152.152.11-216.58.220.99-57159-443-6,10.152.152.11,57159,216.58.220.99,443,6,407,1,1,0,...,20,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,AUDIO-STREAMING
2,10.152.152.11-216.58.220.99-57160-443-6,10.152.152.11,57160,216.58.220.99,443,6,431,1,1,0,...,20,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,AUDIO-STREAMING
3,10.152.152.11-74.125.136.120-49134-443-6,10.152.152.11,49134,74.125.136.120,443,6,359,1,1,0,...,20,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,AUDIO-STREAMING
4,10.152.152.11-173.194.65.127-34697-19305-6,10.152.152.11,34697,173.194.65.127,19305,6,10778451,591,400,64530,...,20,0,0,0,0,1.440000e+15,3.117718e+06,1.440000e+15,1.440000e+15,AUDIO-STREAMING
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141525,10.8.8.246-224.0.0.252-55219-5355-17,10.8.8.246,55219,224.0.0.252,5355,17,411806,2,0,44,...,8,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,VOIP
141526,10.8.8.246-224.0.0.252-64207-5355-17,10.8.8.246,64207,224.0.0.252,5355,17,411574,2,0,44,...,8,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,VOIP
141527,10.8.8.246-224.0.0.252-61115-5355-17,10.8.8.246,61115,224.0.0.252,5355,17,422299,2,0,44,...,8,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,VOIP
141528,10.8.8.246-224.0.0.252-64790-5355-17,10.8.8.246,64790,224.0.0.252,5355,17,411855,2,0,44,...,8,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,VOIP


File-transfer         84
Video-streaming      281
AUDIO-STREAMING     1484
VOIP                3566
Email               6145
Video-Streaming     9486
File-Transfer      11098
Chat               11478
Audio-Streaming    16580
Browsing           32808
P2P                48520
Name: Label, dtype: int64


After examining the Label values , we found out some duplicate classes so we dropped them .

In [6]:



dup_values = ['Video-streaming', 'AUDIO-STREAMING','File-transfer' ]
df = df[df.Label.isin(dup_values) == False]    #returing df without duplicated classes
print (df['Label'].value_counts(ascending=True))




VOIP                3566
Email               6145
Video-Streaming     9486
File-Transfer      11098
Chat               11478
Audio-Streaming    16580
Browsing           32808
P2P                48520
Name: Label, dtype: int64


Missing data is data which is not available ( NULL) or infinite values , we will remove the rows which contain any missing data. This shall not affect the model as the dataset is big enough.

In [7]:


df.replace([np.inf, -np.inf], np.nan, inplace=True) #replace infinity values with NaN
df.dropna(inplace=True) #dropping rows with missing values  
print("Dataset size after removal : " ,df.shape)




Dataset size after removal :  (139646, 83)


Dropping unneccassory features like id ,and converting ips to binary then int..

In [8]:
df.drop('Flow ID', axis=1, inplace=True)


i=0
for ip in df['Src IP'].values:
    z = 0
    parts = ip.split('.')
    z = (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
    ip= z
    df['Src IP'].values[i]=ip
    i=i+1



i=0    
for ip in df['Dst IP'].values:
    z = 0
    parts = ip.split('.')
    z = (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
    ip= z
    df['Dst IP'].values[i]=ip
    i=i+1

Removing 0 variance features..

In [9]:



df_nolabel = df.iloc[:,:-1].values

sel = VarianceThreshold(threshold=0)    
sel.fit_transform(df_nolabel)
arr=sel.get_support()

arr1=np.argwhere(arr == 0)

print("Dataset size before removal " ,df.shape ,"\n")
features_to_remove=[]
for i in arr1 :
    features_to_remove.append((df.columns[i].values[0]))

print("0 variance features : " ,features_to_remove ,"\n")
df.drop(features_to_remove, axis=1, inplace=True)
print("Dataset size after removal " ,df.shape , "\n")

Dataset size before removal  (139646, 82) 

0 variance features :  ['Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Fwd Bytes/Bulk Avg', 'Fwd Packet/Bulk Avg', 'Fwd Bulk Rate Avg', 'Bwd Bytes/Bulk Avg', 'Subflow Bwd Packets', 'Active Mean', 'Active Std', 'Active Max', 'Active Min'] 

Dataset size after removal  (139646, 67) 



Features scaling and splitting the dataset into training and testing subsets.

In [10]:
count=df.shape[1]
x = df.iloc[:,:-1].values
y = df.iloc[:, count-1].values

scaler = StandardScaler().fit(x)
x = scaler.transform(x)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)



SMOTE technique to overcome dataset imbalance ..

In [11]:

counter = Counter(y_train) 
print('Before', counter) 
smtom = SMOTEENN () 
X_train_smtom, y_train_smtom = smtom.fit_resample (X_train, y_train)
counter = Counter(y_train_smtom) 
print('After', counter)


Before Counter({'P2P': 33960, 'Browsing': 22930, 'Audio-Streaming': 11662, 'Chat': 8042, 'File-Transfer': 7798, 'Video-Streaming': 6649, 'Email': 4244, 'VOIP': 2467})
After Counter({'P2P': 33122, 'Audio-Streaming': 30126, 'File-Transfer': 29780, 'Browsing': 28806, 'VOIP': 28453, 'Video-Streaming': 28035, 'Email': 27757, 'Chat': 26781})


In [11]:
clf = DecisionTreeClassifier()


from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
    
}

grid_search = GridSearchCV(estimator=clf, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

grid_search.fit(X_train_smtom, y_train_smtom)

grid_search.best_estimator_




Fitting 4 folds for each of 50 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  5.1min finished


DecisionTreeClassifier(criterion='entropy', max_depth=20, min_samples_leaf=5)

In [12]:
best_DT=DecisionTreeClassifier(criterion='entropy', max_depth=20, min_samples_leaf=5)


#testing
best_DT = best_DT.fit(X_train_smtom, y_train_smtom)
y_pred = best_DT.predict(X_test)
print(classification_report(y_test, y_pred))

y_train_pred = best_DT.predict(X_train_smtom)
print(classification_report(y_train_smtom, y_train_pred))



                 precision    recall  f1-score   support

Audio-Streaming       0.94      0.85      0.89      4874
       Browsing       0.96      0.91      0.93      9859
           Chat       0.81      0.69      0.74      3483
          Email       0.60      0.71      0.65      1812
  File-Transfer       0.79      0.84      0.81      3365
            P2P       1.00      0.99      0.99     14563
           VOIP       0.60      0.75      0.67      1077
Video-Streaming       0.64      0.82      0.72      2861

       accuracy                           0.89     41894
      macro avg       0.79      0.82      0.80     41894
   weighted avg       0.90      0.89      0.89     41894

                 precision    recall  f1-score   support

Audio-Streaming       0.99      0.95      0.97     30138
       Browsing       0.98      0.98      0.98     28956
           Chat       0.91      0.90      0.91     26713
          Email       0.90      0.91      0.90     27952
  File-Transfer       0.97 

In [12]:

clf=RandomForestClassifier(n_estimators=100)
clf = clf.fit(X_train_smtom, y_train_smtom)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

Audio-Streaming       0.95      0.87      0.90      4918
       Browsing       0.96      0.93      0.94      9878
           Chat       0.86      0.63      0.73      3431
          Email       0.61      0.69      0.64      1899
  File-Transfer       0.83      0.84      0.84      3291
            P2P       0.99      1.00      0.99     14560
           VOIP       0.55      0.86      0.67      1099
Video-Streaming       0.66      0.83      0.74      2818

       accuracy                           0.89     41894
      macro avg       0.80      0.83      0.81     41894
   weighted avg       0.90      0.89      0.90     41894



In [13]:
#Create KNN Classifier
knn = KNeighborsClassifier(n_neighbors=5)

#Train the model using the training sets
knn.fit(X_train_smtom, y_train_smtom)

#Predict the response for test dataset
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))


                 precision    recall  f1-score   support

Audio-Streaming       0.91      0.85      0.88      4918
       Browsing       0.93      0.86      0.89      9878
           Chat       0.80      0.68      0.73      3431
          Email       0.63      0.69      0.66      1899
  File-Transfer       0.70      0.77      0.74      3291
            P2P       0.99      0.98      0.99     14560
           VOIP       0.54      0.76      0.63      1099
Video-Streaming       0.57      0.73      0.64      2818

       accuracy                           0.86     41894
      macro avg       0.76      0.79      0.77     41894
   weighted avg       0.87      0.86      0.86     41894

