In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import (roc_curve, auc, roc_auc_score, confusion_matrix, classification_report)
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Load the packet table from a CSV file located next to the notebook and
# preview it (pandas truncates the display of large frames).
df=pd.read_csv("./dataset.csv")
df

Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Label,Info
0,1,0.000000,131.202.240.87,173.252.100.27,TLSv1.2,1181,NON-VPN,Application Data
1,2,0.002719,131.202.240.87,131.202.244.3,DNS,76,NON-VPN,Standard query 0x5dc4 A www.facebook.com
2,3,0.078089,131.202.244.3,131.202.240.87,DNS,183,NON-VPN,Standard query response 0x5dc4 A www.facebook....
3,4,0.078691,131.202.240.87,131.202.244.3,DNS,76,NON-VPN,Standard query 0xd12d AAAA www.facebook.com
4,5,0.079078,131.202.244.3,131.202.240.87,DNS,195,NON-VPN,Standard query response 0xd12d AAAA www.facebo...
...,...,...,...,...,...,...,...,...
1323895,370478,2867.070717,10.8.0.14,173.194.76.127,UDP,143,VPN,51351 > 19305 Len=115
1323896,370479,2867.073160,173.194.76.127,10.8.0.14,UDP,74,VPN,19305 > 51351 Len=46
1323897,370480,2867.073192,173.194.76.127,10.8.0.14,UDP,66,VPN,19305 > 51351 Len=38
1323898,370481,2867.089662,173.194.76.127,10.8.0.14,UDP,143,VPN,19305 > 51351 Len=115


In [None]:
# Summarize column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1323900 entries, 0 to 1323899
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   No.          1323900 non-null  int64  
 1   Time         1323900 non-null  float64
 2   Source       1323900 non-null  object 
 3   Destination  1323900 non-null  object 
 4   Protocol     1323900 non-null  object 
 5   Length       1323900 non-null  int64  
 6   Label        1323900 non-null  object 
 7   Info         1323884 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 80.8+ MB


In [None]:
# Discard every row that has a missing value in any column, then remove the
# 'Info' and 'Destination' columns so they are not part of the feature set.
# Note the order: rows are filtered while 'Info' is still present, so a row
# whose only missing value is in 'Info' is removed as well.
df = df.dropna().drop(columns=['Info', 'Destination'])
df

Unnamed: 0,No.,Time,Source,Protocol,Length,Label
0,1,0.000000,131.202.240.87,TLSv1.2,1181,NON-VPN
1,2,0.002719,131.202.240.87,DNS,76,NON-VPN
2,3,0.078089,131.202.244.3,DNS,183,NON-VPN
3,4,0.078691,131.202.240.87,DNS,76,NON-VPN
4,5,0.079078,131.202.244.3,DNS,195,NON-VPN
...,...,...,...,...,...,...
1323895,370478,2867.070717,10.8.0.14,UDP,143,VPN
1323896,370479,2867.073160,173.194.76.127,UDP,74,VPN
1323897,370480,2867.073192,173.194.76.127,UDP,66,VPN
1323898,370481,2867.089662,173.194.76.127,UDP,143,VPN


In [None]:
# Names of the columns still stored with the generic "object" dtype; these are
# the ones that get label-encoded before modelling.
objList = df.select_dtypes(include=["object"]).columns
objList

Index(['Source', 'Protocol', 'Label'], dtype='object')

In [None]:
# Replace every object-dtype column (including the target 'Label') with integer
# codes. A separate fitted LabelEncoder is kept per column in `encoders`, so the
# original values of any column can be recovered later with
# `encoders[col].inverse_transform(...)`. Reusing a single encoder would keep
# only the mapping of the last column that was fitted.
encoders = {}
for feat in objList:
    le = LabelEncoder()
    # Cast to str first so the encoder always sees one uniform type.
    df[feat] = le.fit_transform(df[feat].astype(str))
    encoders[feat] = le
# After the loop `le` is the encoder of the last column processed.

# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
df.info()

# Class balance of the encoded target.
df['Label'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1323884 entries, 0 to 1323899
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   No.       1323884 non-null  int64  
 1   Time      1323884 non-null  float64
 2   Source    1323884 non-null  int32  
 3   Protocol  1323884 non-null  int32  
 4   Length    1323884 non-null  int64  
 5   Label     1323884 non-null  int32  
dtypes: float64(1), int32(3), int64(2)
memory usage: 55.6 MB
None


1    662185
0    661699
Name: Label, dtype: int64

In [None]:
# Preview the frame after the string columns were replaced by integer codes.
df

Unnamed: 0,No.,Time,Source,Protocol,Length,Label
0,1,0.000000,44,38,1181,0
1,2,0.002719,44,6,76,0
2,3,0.078089,103,6,183,0
3,4,0.078691,44,6,76,0
4,5,0.079078,103,6,195,0
...,...,...,...,...,...,...
1323895,370478,2867.070717,3,39,143,1
1323896,370479,2867.073160,172,39,74,1
1323897,370480,2867.073192,172,39,66,1
1323898,370481,2867.089662,172,39,143,1


In [None]:
# Feature matrix: every remaining column except the target.
X = df.drop(columns=['Label'])
X.head()

Unnamed: 0,No.,Time,Source,Protocol,Length
0,1,0.0,44,38,1181
1,2,0.002719,44,6,76
2,3,0.078089,103,6,183
3,4,0.078691,44,6,76
4,5,0.079078,103,6,195


In [None]:
# Target vector: the integer-encoded 'Label' column.
Y = df.loc[:, 'Label']
Y.head()

0    0
1    0
2    0
3    0
4    0
Name: Label, dtype: int32

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=50,
)

In [None]:
# Standardize the features. The scaling statistics are learned from the
# training split only and then reused unchanged for the test split.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
X_train

array([[-1.07578122, -0.7474238 , -0.63718752,  0.39646751, -0.8371444 ],
       [ 0.65729603,  0.96639914,  1.06924591,  0.39646751, -0.79458619],
       [-0.75354006, -0.62949601, -0.33993783,  0.39646751,  1.00951185],
       ...,
       [-1.03775425,  0.29579517,  1.79585627, -0.27876845, -0.94631546],
       [-0.40709519, -0.60873799, -0.63718752,  0.39646751,  1.18159505],
       [ 0.95424134,  1.2635523 ,  0.77199621,  0.26142032, -0.86119904]])

In [None]:
# Project the standardized features onto two principal components. The
# projection is fitted on the training split only and then applied to the
# test split.
pca = PCA(n_components=2)

X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Share of the total variance carried by each retained component; displayed so
# the information lost by the reduction is visible to the reader.
explained_variance = pca.explained_variance_ratio_
explained_variance

In [None]:
# Train an AdaBoost ensemble with a fixed seed on the PCA-reduced training data.
# NOTE: despite its name, `dtree_model` holds an AdaBoostClassifier.
dtree_model = AdaBoostClassifier(random_state=50)
dtree_model = dtree_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
dtree_predictions = dtree_model.predict(X_test)#testing

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix on the held-out test split. In scikit-learn's convention
# rows correspond to the true class and columns to the predicted class.
cm = confusion_matrix(y_test, dtree_predictions)

print(cm)
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, dtree_predictions))

[[169320  29028]
 [ 15617 183201]]
ACCURACY OF THE MODEL:  0.8875910828217924


In [None]:
# Per-class precision, recall and F1 score on the held-out test split.
print(classification_report(y_test, dtree_predictions))

# Overall accuracy on the same predictions.
print(metrics.accuracy_score(y_test, dtree_predictions))

              precision    recall  f1-score   support

           0       0.92      0.85      0.88    198348
           1       0.86      0.92      0.89    198818

    accuracy                           0.89    397166
   macro avg       0.89      0.89      0.89    397166
weighted avg       0.89      0.89      0.89    397166

0.8875910828217924
