In [1]:
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import CondensedNearestNeighbour, RandomUnderSampler

In [2]:
data = pd.read_csv("Darknet.CSV", low_memory=False, parse_dates=["Timestamp"], error_bad_lines=False)

def display_all(df): # tip: you can transpose before giving input!
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        print("The shape is: ", data.shape)
        data.info()
        data.dtypes
        display(df)
        

def display_types(df):
    print(df["Level1"].unique())
    print(df["Level2"].unique())
    print(df.groupby("Level1")["Level2"].unique())
    print(df.groupby("Level1")["Level2"].nunique())
    
# TODO drop VPN
data.drop(["Flow ID"],axis = 1,inplace = True)
data.drop(["Src IP"],axis = 1,inplace = True)
data.drop(["Dst IP"],axis = 1,inplace = True)
data.drop(["Src Port"],axis = 1,inplace = True)
data.drop(["Dst Port"],axis = 1,inplace = True)
data.drop(["Flow Duration"],axis = 1,inplace = True)
data.drop(["Timestamp"],axis = 1,inplace = True)
# data.drop_duplicates(subset=None, keep='first', inplace=True, ignore_index=False)
# data = data.loc[:, internet_ds_test.apply(pd.Series.nunique) != 1]
data.isnull().sum()

data.rename(columns = {"Label" : "Level1", "Label.1" : "Level2"}, inplace = True)
# data.rename({'Level2': {"AUDIO-STREAMING" : "Audio-Streaming", "Audio-streaming" : "Audio-Streaming", "Video-streaming" : "Video-Streaming", "File-transfer" : "File-Transfer"}}, inplace = True)
data['Level2'].loc[data['Level2'] == 'AUDIO-STREAMING'] = 'Audio-Streaming'
data['Level2'].loc[data['Level2'] == 'Audio-streaming'] = 'Audio-Streaming'
data['Level2'].loc[data['Level2'] == 'File-transfer'] = 'File-Transfer'
data['Level2'].loc[data['Level2'] == 'Video-streaming'] = 'Video-Streaming'
data["Level2"].unique()
# samples[real_columns] = samples[real_columns].replace([np.inf, -np.inf], np.nan)
# samples[real_columns] = samples[real_columns].dropna()

Skipping line 328: expected 85 fields, saw 125



array(['Audio-Streaming', 'Browsing', 'Chat', 'Email', 'File-Transfer',
       'P2P', 'Video-Streaming', 'VOIP'], dtype=object)

In [3]:
start_mem = data.memory_usage().sum() / 1024**2 # start mem for comparison later

for col in data.columns:
    col_type = data[col].dtypes
    if col_type in ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']:
        c_min = data[col].min()
        c_max = data[col].max()
        if str(col_type)[:3] == 'int': # if it's int
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: # if it can be int8, make it int8
                data[col] = data[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: # if it can be int16, make it int16
                data[col] = data[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: # so on
                data[col] = data[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                data[col] = data[col].astype(np.int64)
        else: # if it's float
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: # if it can be float16, make it float16
                data[col] = data[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: # so on
                data[col] = data[col].astype(np.float32)
            else:
                data[col] = data[col].astype(np.float64)

end_mem = data.memory_usage().sum() / 1024**2 # end mem for comparison later

print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

Memory usage after optimization is: 31.31 MB
Decreased by 62.8%


# Split

In [4]:
copied = data.copy()
copied.replace([np.inf, -np.inf], np.nan, inplace=True)
copied = copied.dropna()
copied.isnull().sum().sum()

0

In [5]:
copied = copied[(copied.Level1 == 'Tor') | (copied.Level1 == 'Non-Tor')]  

In [6]:
Y = copied[["Level1", "Level2"]]
X = copied.drop(["Level1", "Level2"], axis=1)

In [7]:
Y.replace('Tor', 1, inplace=True)               #Converting strings to binary classes 0 or 1
Y.replace('Non-Tor', 0, inplace=True)

In [8]:
from sklearn.preprocessing import LabelEncoder

def categorise(row):
    if row['Level1'] == 1:
        return f"Tor {row['Level2']}"
    else:
        return 'Non-Tor'
    

Y['Level2'] = Y.apply(lambda row: categorise(row), axis = 1)

le = LabelEncoder()
Y['Level2'] = le.fit_transform(Y['Level2'])

In [9]:
X_train__test, X_application, Y_train_test, Y_application = train_test_split(X, Y, test_size=0.30, random_state=42, stratify=Y['Level2'])
X_train, X_test, Y_train, Y_test = train_test_split(X_train__test, Y_train_test, test_size=0.30, random_state=42, stratify=Y_train_test['Level2'])

In [10]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Training and Testing Models
## Layer 1

Undersampling minority classes from layer 2

In [11]:
Y_train_l = Y_train['Level1']
Y_test_l = Y_test['Level1']

In [12]:
Y_train_2 = Y_train['Level2']
Y_test_2 = Y_test['Level2']

In [13]:
np.unique(Y_train_2, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([45721,   110,   129,    31,     6,    53,   108,   146,    99],
       dtype=int64))

In [14]:
sampless = {1: 110, 2: 129, 6:108, 7:146}
samples = [1, 2, 6, 7]
undersample = CondensedNearestNeighbour(sampling_strategy = samples, random_state = 42, n_jobs=-1, n_neighbors=3)
#undersample = RandomUnderSampler(sampling_strategy = sampless, random_state = 42)
#X_train, Y_train_2 = undersample.fit_resample(X_train, Y_train_2)

In [15]:
#Y_train['Level2'].value_counts()

In [16]:
#Creating a target Y for binary after undersampling

Y_train_l1 = Y_train_2.copy()
Y_train_l1[Y_train_l1>1] = 1
Y_train_l.equals(Y_train_l1)

False

In [17]:
#Using only the first classification for the gridsearch
#Y_train_l = Y_train['Level1']
#Y_test_l = Y_test['Level1']

In [18]:
np.info(X_train)

class:  ndarray
shape:  (46403, 76)
strides:  (8, 371224)
itemsize:  8
aligned:  True
contiguous:  False
fortran:  True
data pointer: 0x21389f3f040
byteorder:  little
byteswap:  False
type: float64


In [19]:
Y_train.dropna(inplace=True)

In [20]:
Y_train.shape

(46403, 2)

In [21]:
#importing libraries for models
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from imblearn.pipeline import Pipeline as impip
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold


#Creating classifiers to be passed to the pipeline
clf1 = RandomForestClassifier(random_state=42, criterion = 'entropy')
clf2 = ExtraTreesClassifier(random_state=42, criterion = 'entropy')
clf3 = DecisionTreeClassifier(random_state=42, criterion = 'entropy')
clf4 = GradientBoostingClassifier(random_state=42)
clf5 = AdaBoostClassifier(random_state=42)
clf6 = BaggingClassifier(random_state=42)
clf7 = SVC(random_state=42)

#Initializing parameter dictionary for models

param1 = {}
param1['classifier__n_estimators'] = [10, 50, 100]
param1['classifier__max_depth'] = [5, 10, 20]
param1['classifier__min_samples_split'] = [2,5,10]
param1['classifier'] = [clf1]

param2 = {}
param2['classifier__n_estimators'] = [10, 50, 100]
param2['classifier__max_depth'] = [5, 10, 20]
param2['classifier__min_samples_split'] = [2,5,10]
param2['classifier'] = [clf2]

param3 = {}
param3['classifier__max_depth'] = [5, 10, 20]
param3['classifier__min_samples_split'] = [2,5,10]
param3['classifier'] = [clf3]

param4 = {}
param4['classifier__n_estimators'] = [10, 50, 100]
param4['classifier__learning_rate'] = [0.01, 0.05, 0.1]
param4['classifier__max_depth'] = [5, 10, 20]
param3['classifier__min_samples_split'] = [2,5,10]
param4['classifier'] = [clf4]

param5 = {}
param5['classifier__n_estimators'] = [10, 50, 100]
param5['classifier__learning_rate'] = [0.01, 0.05, 0.1]
param5['classifier'] = [clf5]

param6 = {}
param6['classifier__n_estimators'] = [10, 50, 100]
param6['classifier__max_samples'] = [0.05, 0.1, 0.2, 0.5]
param6['classifier'] = [clf6]

param7 = {}
param7['classifier__C'] = [0.01, 0.1, 1, 5]
param7['classifier__kernel'] = ['linear', 'rbf', 'sigmoid']
param7['classifier'] = [clf7]

params = [param1, param2, param3, param4, param5, param6, param7]

In [22]:
from sklearn.metrics import accuracy_score, f1_score, classification_report,confusion_matrix

In [23]:
#Creating random_state=ne for the models
# pipeline = impip([('scaler', StandardScaler()),('classifier', clf1),])

# #implementing randomized search because gridsearch takes forever
# rs = RandomizedSearchCV(pipeline, params, cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42), n_jobs=-1, scoring='f1', error_score='raise').fit(X_train, Y_train_l)
# rs.best_params_

In [24]:
# random Forest worked best
ranForModel = DecisionTreeClassifier(max_depth=10, min_samples_split=2, random_state=42, criterion = "entropy")
ranForModel.fit(X_train, Y_train_l)
prediction_1 = ranForModel.predict(X_test)
Accuracy = accuracy_score(Y_test_l, prediction_1)
print("F1 score",f1_score(Y_test['Level1'], prediction_1, average=None))
print(classification_report(Y_test['Level1'], prediction_1, target_names=['0','1']))
print('Confusion Matrix \n' + str(confusion_matrix(Y_test['Level1'], prediction_1)))

F1 score [0.99997448 0.9982906 ]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19595
           1       1.00      1.00      1.00       292

    accuracy                           1.00     19887
   macro avg       1.00      1.00      1.00     19887
weighted avg       1.00      1.00      1.00     19887

Confusion Matrix 
[[19594     1]
 [    0   292]]


## Layer 2

In [25]:
from sklearn.ensemble import VotingClassifier

#Creating classifiers to be passed to the pipeline
models = list()
models.append(('rfc', RandomForestClassifier(random_state=42, criterion = 'entropy')))
models.append(('etc', ExtraTreesClassifier(random_state=42, criterion = 'entropy')))
models.append(('dtc', DecisionTreeClassifier(random_state=42, criterion = 'entropy')))
models.append(('gbc', GradientBoostingClassifier(random_state=42)))
models.append(('abc', AdaBoostClassifier(random_state=42)))
models.append(('bc', BaggingClassifier(random_state=42)))
models.append(('svm', SVC(random_state=42, probability=True)))

ensemble = VotingClassifier(estimators=models, voting='soft')

In [26]:
#Y_train_2 = Y_train['Level2']
#Y_test_2 = Y_test['Level2']


ensemble.fit(X_train, Y_train_2)

In [27]:
prediction_2 = ensemble.predict(X_test)
print("F1 score",f1_score(Y_test['Level2'], prediction_2, average=None))
print(classification_report(Y_test['Level2'], prediction_2, target_names=['0','1','2','3','4','5','6','7','8']))
print('Confusion Matrix \n' + str(confusion_matrix(Y_test['Level2'], prediction_2)))

F1 score [0.99994897 0.91489362 0.85185185 0.70967742 0.5        0.84
 0.95652174 0.91935484 0.79012346]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19595
           1       0.91      0.91      0.91        47
           2       0.87      0.84      0.85        55
           3       0.65      0.79      0.71        14
           4       1.00      0.33      0.50         3
           5       0.75      0.95      0.84        22
           6       0.96      0.96      0.96        46
           7       0.93      0.90      0.92        63
           8       0.82      0.76      0.79        42

    accuracy                           1.00     19887
   macro avg       0.88      0.83      0.83     19887
weighted avg       1.00      1.00      1.00     19887

Confusion Matrix 
[[19594     0     0     0     0     0     0     0     1]
 [    0    43     2     1     0     0     1     0     0]
 [    0     1    46     5     0     1     1     0     1]
 

## Critique Model

In [28]:
decTree = DecisionTreeClassifier(criterion = "entropy", max_depth=10, min_samples_split=2)
decTree.fit(X_train, Y_train_l)
xgb = xgb.XGBClassifier(n_estimators=250, n_jobs=-1, random_state=42, max_depth=10)
xgb.fit(X_train, Y_train_l)

In [29]:
predictions_3 = xgb.predict(X_test)
print("F1 score",f1_score(Y_test['Level1'], predictions_3, average=None))
print(classification_report(Y_test['Level1'], predictions_3, target_names=['0','1']))
print('Confusion Matrix \n' + str(confusion_matrix(Y_test['Level1'], predictions_3)))

F1 score [1. 1.]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19595
           1       1.00      1.00      1.00       292

    accuracy                           1.00     19887
   macro avg       1.00      1.00      1.00     19887
weighted avg       1.00      1.00      1.00     19887

Confusion Matrix 
[[19595     0]
 [    0   292]]


# Dynamic Implementation

In [30]:
# passing only the TOR entries to the second layer
all_preds = []
for index in range(X_application.shape[0]):
    
    obs = scaler.transform([X_application.iloc[index].values])
    obs = obs.flatten()
    final_pred = 0
    #first layer:
    pred1 = ranForModel.predict([obs])[0]
    final_pred = pred1
    
    if pred1 == 1:
        pred2 = ensemble.predict([obs])[0]
        final_pred = pred2
    else:
        pred2 = 'N'
        
    if pred2 == 0:
        pred3 = xgb.predict([obs])[0]
        print('Critic! ' + str([pred1, pred2, pred3]) + ' ' + str(Y_application.iloc[index, 0]))
        final_pred = pred3
    else:
        pred3 = 'N'
            
    all_preds.append([pred1, pred2, pred3, final_pred])

Critic! [1, 0, 1] 0
Critic! [1, 0, 1] 1
Critic! [1, 0, 1] 1


In [31]:
print("F1 score",f1_score(Y_application['Level2'], (np.array(all_preds)[:, 3]).astype(int), average=None))
print(classification_report(Y_application['Level2'], (np.array(all_preds)[:, 3]).astype(int), target_names=['0','1','2','3','4','5','6','7','8']))
print('Confusion Matrix \n' + str(confusion_matrix(Y_application['Level2'], (np.array(all_preds)[:, 3].astype(int)))))

F1 score [0.99989283 0.86956522 0.87417219 0.84444444 0.4        0.89552239
 0.95522388 0.92485549 0.87603306]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27993
           1       0.85      0.90      0.87        67
           2       0.92      0.84      0.87        79
           3       0.76      0.95      0.84        20
           4       1.00      0.25      0.40         4
           5       0.86      0.94      0.90        32
           6       0.94      0.97      0.96        66
           7       0.95      0.90      0.92        89
           8       0.88      0.87      0.88        61

    accuracy                           1.00     28411
   macro avg       0.91      0.85      0.85     28411
weighted avg       1.00      1.00      1.00     28411

Confusion Matrix 
[[27991     1     0     1     0     0     0     0     0]
 [    0    60     4     0     0     0     1     0     2]
 [    1     6    66     4     0     0     2     0   

In [32]:
# passing the observations for which the first layer predicts 0 and layer 2 predicts 1
all_preds = []
for index in range(X_application.shape[0]):
    
    obs = scaler.transform([X_application.iloc[index].values])
    obs = obs.flatten()
    final_pred = 0
    #first layer:
    pred1 = ranForModel.predict([obs])[0]
    if pred1 == 0:
      pred2 = ensemble.predict([obs])[0]
      if pred2 != 0:
        pred3 = xgb.predict([obs])[0]
        print('Critic! ' + str([pred1, pred2, pred3]) + ' ' + str(Y_application.iloc[index, 0]))


Critic! [0, 5, 1] 1
Critic! [0, 2, 1] 1
Critic! [0, 8, 0] 0
