IMPORTS

In [61]:
# Load the Pandas libraries with alias 'pd' 
import pandas as pd 
import numpy as np
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.feature_selection import VarianceThreshold
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

READING DATASET AND EXAMINING LABEL


In [62]:
dataset = pd.read_csv("c:/Darknet_all.csv" , low_memory=False) 
df = pd.DataFrame(dataset)
display(df)
print (df['Label'].value_counts(ascending=True))



Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,10.152.152.11-216.58.220.99-57158-443-6,10.152.152.11,57158,216.58.220.99,443,6,229,1,1,0,...,20,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,AUDIO-STREAMING
1,10.152.152.11-216.58.220.99-57159-443-6,10.152.152.11,57159,216.58.220.99,443,6,407,1,1,0,...,20,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,AUDIO-STREAMING
2,10.152.152.11-216.58.220.99-57160-443-6,10.152.152.11,57160,216.58.220.99,443,6,431,1,1,0,...,20,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,AUDIO-STREAMING
3,10.152.152.11-74.125.136.120-49134-443-6,10.152.152.11,49134,74.125.136.120,443,6,359,1,1,0,...,20,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,AUDIO-STREAMING
4,10.152.152.11-173.194.65.127-34697-19305-6,10.152.152.11,34697,173.194.65.127,19305,6,10778451,591,400,64530,...,20,0,0,0,0,1.440000e+15,3.117718e+06,1.440000e+15,1.440000e+15,AUDIO-STREAMING
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141525,10.8.8.246-224.0.0.252-55219-5355-17,10.8.8.246,55219,224.0.0.252,5355,17,411806,2,0,44,...,8,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,VOIP
141526,10.8.8.246-224.0.0.252-64207-5355-17,10.8.8.246,64207,224.0.0.252,5355,17,411574,2,0,44,...,8,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,VOIP
141527,10.8.8.246-224.0.0.252-61115-5355-17,10.8.8.246,61115,224.0.0.252,5355,17,422299,2,0,44,...,8,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,VOIP
141528,10.8.8.246-224.0.0.252-64790-5355-17,10.8.8.246,64790,224.0.0.252,5355,17,411855,2,0,44,...,8,0,0,0,0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,VOIP


File-transfer         84
Video-streaming      281
AUDIO-STREAMING     1484
VOIP                3566
Email               6145
Video-Streaming     9486
File-Transfer      11098
Chat               11478
Audio-Streaming    16580
Browsing           32808
P2P                48520
Name: Label, dtype: int64


After examining the Label values , we found out some duplicate classes so we dropped them .

In [63]:



dup_values = ['Video-streaming', 'AUDIO-STREAMING','File-transfer' ]
df = df[df.Label.isin(dup_values) == False]    #returing df without duplicated classes
print (df['Label'].value_counts(ascending=True))




VOIP                3566
Email               6145
Video-Streaming     9486
File-Transfer      11098
Chat               11478
Audio-Streaming    16580
Browsing           32808
P2P                48520
Name: Label, dtype: int64


Missing data is data which is not available ( NULL) or infinite values , we will remove the rows which contain any missing data. This shall not affect the model as the dataset is big enough.

In [64]:


df.replace([np.inf, -np.inf], np.nan, inplace=True) #replace infinity values with NaN
df.dropna(inplace=True) #dropping rows with missing values  
print("Dataset size after removal : " ,df.shape)




Dataset size after removal :  (139646, 83)


Dropping unneccassory features like id ,and converting ips to binary then int..

In [65]:
df.drop('Flow ID', axis=1, inplace=True)


i=0
for ip in df['Src IP'].values:
    z = 0
    parts = ip.split('.')
    z = (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
    ip= z
    df['Src IP'].values[i]=ip
    i=i+1



i=0    
for ip in df['Dst IP'].values:
    z = 0
    parts = ip.split('.')
    z = (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + int(parts[3])
    ip= z
    df['Dst IP'].values[i]=ip
    i=i+1

Removing 0 variance features..

In [66]:



df_nolabel = df.iloc[:,:-1].values

sel = VarianceThreshold(threshold=0)    
sel.fit_transform(df_nolabel)
arr=sel.get_support()

arr1=np.argwhere(arr == 0)

print("Dataset size before removal " ,df.shape ,"\n")
features_to_remove=[]
for i in arr1 :
    features_to_remove.append((df.columns[i].values[0]))

print("0 variance features : " ,features_to_remove ,"\n")
df.drop(features_to_remove, axis=1, inplace=True)
print("Dataset size after removal " ,df.shape , "\n")

Dataset size before removal  (139646, 82) 

0 variance features :  ['Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Fwd Bytes/Bulk Avg', 'Fwd Packet/Bulk Avg', 'Fwd Bulk Rate Avg', 'Bwd Bytes/Bulk Avg', 'Subflow Bwd Packets', 'Active Mean', 'Active Std', 'Active Max', 'Active Min'] 

Dataset size after removal  (139646, 67) 



Features scaling and splitting the dataset into training and testing subsets.

In [67]:
count=df.shape[1]
x = df.iloc[:,:-1].values
y = df.iloc[:, count-1].values

scaler = StandardScaler().fit(x)
x = scaler.transform(x)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)



SMOTE technique to overcome dataset imbalance ..

In [68]:

counter = Counter(y_train) 
print('Before', counter) 
smtom = SMOTEENN () 
X_train_smtom, y_train_smtom = smtom.fit_resample (X_train, y_train)
counter = Counter(y_train_smtom) 
print('After', counter)


Before Counter({'P2P': 33900, 'Browsing': 23008, 'Audio-Streaming': 11571, 'Chat': 8106, 'File-Transfer': 7789, 'Video-Streaming': 6593, 'Email': 4312, 'VOIP': 2473})
After Counter({'P2P': 33060, 'Audio-Streaming': 30184, 'File-Transfer': 29642, 'Browsing': 28827, 'VOIP': 28514, 'Video-Streaming': 28160, 'Email': 27754, 'Chat': 26653})


In [69]:

# Train Decision Tree Classifer
clf = DecisionTreeClassifier()
clf = clf.fit(X_train_smtom, y_train_smtom)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))



                 precision    recall  f1-score   support

Audio-Streaming       0.94      0.86      0.90      5009
       Browsing       0.96      0.92      0.94      9800
           Chat       0.87      0.63      0.73      3367
          Email       0.59      0.70      0.64      1831
  File-Transfer       0.81      0.85      0.83      3300
            P2P       1.00      1.00      1.00     14620
           VOIP       0.56      0.83      0.67      1093
Video-Streaming       0.67      0.82      0.74      2874

       accuracy                           0.89     41894
      macro avg       0.80      0.83      0.81     41894
   weighted avg       0.90      0.89      0.90     41894



In [70]:


#Create KNN Classifier
knn = KNeighborsClassifier(n_neighbors=5)

#Train the model using the training sets
knn.fit(X_train_smtom, y_train_smtom)

#Predict the response for test dataset
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))


                 precision    recall  f1-score   support

Audio-Streaming       0.91      0.84      0.88      5009
       Browsing       0.93      0.86      0.89      9800
           Chat       0.80      0.69      0.74      3367
          Email       0.60      0.69      0.65      1831
  File-Transfer       0.69      0.79      0.73      3300
            P2P       0.99      0.98      0.99     14620
           VOIP       0.56      0.70      0.63      1093
Video-Streaming       0.58      0.72      0.64      2874

       accuracy                           0.86     41894
      macro avg       0.76      0.78      0.77     41894
   weighted avg       0.87      0.86      0.86     41894



In [71]:

clf=RandomForestClassifier(n_estimators=100)
clf = clf.fit(X_train_smtom, y_train_smtom)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

Audio-Streaming       0.94      0.86      0.90      5009
       Browsing       0.96      0.92      0.94      9800
           Chat       0.86      0.63      0.73      3367
          Email       0.58      0.70      0.64      1831
  File-Transfer       0.81      0.85      0.83      3300
            P2P       1.00      1.00      1.00     14620
           VOIP       0.57      0.83      0.68      1093
Video-Streaming       0.67      0.82      0.74      2874

       accuracy                           0.89     41894
      macro avg       0.80      0.83      0.81     41894
   weighted avg       0.90      0.89      0.90     41894

