In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dataset_prep

In [3]:
data = dataset_prep.import_dataset("data/DoS_0709_new_columns.csv")

In [5]:
dataset_prep.clean_dataset(data)

Nombre de lignes avant nettoyage :  4753433
Nombre de lignes après nettoyage :  4709293


In [6]:
data = dataset_prep.sample_dataset(data, 50000)

In [7]:
data.shape

(50000, 8)

In [11]:
data["label"].value_counts()

13    28202
0     21798
Name: label, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split

X = np.array(data.drop(["label"], axis=1))
y = np.array(data["label"])

for i in range(len(y)):
    if y[i] == 13:
        y[i] = 1

# Séparation en données d'entrainement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [17]:
# import Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# instantiate the classifier 
rfc = RandomForestClassifier(random_state=0)

# fit the model
rfc.fit(X_train, y_train)

# Predict the Test set results
y_pred = rfc.predict(X_test)

# Check accuracy score 
from sklearn.metrics import accuracy_score

print('Model accuracy score with 10 decision-trees : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

print(classification_report(y_test, y_pred))

Model accuracy score with 10 decision-trees : 0.8674


In [18]:
# instantiate the classifier with n_estimators = 100

rfc_100 = RandomForestClassifier(n_estimators=100, random_state=0)



# fit the model to the training set

rfc_100.fit(X_train, y_train)



# Predict on the test set results

y_pred_100 = rfc_100.predict(X_test)



# Check accuracy score 

print('Model accuracy score with 100 decision-trees : {0:0.4f}'. format(accuracy_score(y_test, y_pred_100)))

Model accuracy score with 100 decision-trees : 0.8674


In [20]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_100))

              precision    recall  f1-score   support

           0       0.81      0.90      0.86      2174
           1       0.92      0.84      0.88      2826

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



Feature selection

In [3]:
# On importe le dataset avec toutes les colonnes pour faire une feature selection

data = pd.read_csv("../data/DoS_0709_new_columns.csv",
        index_col=False,
    )

In [4]:
dataset_prep.clean_dataset(data)
data = dataset_prep.sample_dataset(data, 50000)

Nombre de lignes avant nettoyage :  4753433
Nombre de lignes après nettoyage :  4518383


In [5]:
data["label"].value_counts()

13    28468
0     21532
Name: label, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

X = np.array(data.drop(["label"], axis=1))
y = np.array(data["label"])

for i in range(len(y)):
    if y[i] == 13:
        y[i] = 1

# Séparation en données d'entrainement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [8]:
# import Random Forest classifier

from sklearn.ensemble import RandomForestClassifier

# create the classifier with n_estimators = 100

clf = RandomForestClassifier(n_estimators=100, random_state=0)



# fit the model to the training set

clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [9]:
print(data.drop(["label"], axis=1).columns)

Index(['type', 'rcvTime', 'sendTime', 'sender', 'senderPseudo', 'messageID',
       'labelRec', 'receiver', 'moduleid', 'pos_x_send', 'pos_y_send',
       'pos_noise_x', 'pos_noise_y', 'spd_x_send', 'spd_y_send', 'spd_noise_x',
       'spd_noise_y', 'acl_x_send', 'acl_y_send', 'acl_noise_x', 'acl_noise_y',
       'hed_x_send', 'hed_y_send', 'hed_noise_x', 'hed_noise_y', 'scenario',
       'pos_x_send_f', 'pos_y_send_f', 'spd_x_send_f', 'spd_y_send_f',
       'acl_x_send_f', 'acl_y_send_f', 'hed_x_send_f', 'hed_y_send_f',
       'pos_x_rec', 'pos_y_rec', 'spd_x_rec', 'spd_y_rec', 'acl_x_rec',
       'acl_y_rec', 'hed_x_rec', 'hed_y_rec', 'pos_x_rec_f', 'pos_y_rec_f',
       'spd_x_rec_f', 'spd_y_rec_f', 'acl_x_rec_f', 'acl_y_rec_f',
       'hed_x_rec_f', 'hed_y_rec_f', 'max_speed1', 'max_speed2',
       'nb_packets_sent', 'frequency1', 'frequency2', 'time_diff',
       'distRealSR1', 'distRealSR2', 'diffSpdSR1', 'diffSpdSR2', 'diffAclSR1',
       'diffAclSR2', 'diffHedSR1', 'diffHedSR2'

In [10]:
# view the feature scores

feature_scores = pd.Series(clf.feature_importances_, index=data.drop(["label"], axis=1).columns).sort_values(ascending=False)

feature_scores[:15]

time_diff          0.616775
nb_packets_sent    0.216895
max_speed2         0.010222
max_speed1         0.009850
sender             0.007070
senderPseudo       0.006823
rcvTime            0.004741
messageID          0.004704
sendTime           0.004600
hed_noise_x        0.004287
hed_noise_y        0.003947
pos_x_send_f       0.003864
moduleid           0.003610
frequency1         0.003581
hed_y_send         0.003520
dtype: float64

In [15]:
selected_columns = ["time_diff",
"nb_packets_sent",
"max_speed1",
"max_speed2",
"hed_noise_x",
"sendTime",
"rcvTime",
"hed_noise_y",
]

In [16]:
data = data[selected_columns]
data.head()

Unnamed: 0,time_diff,nb_packets_sent,max_speed1,max_speed2,hed_noise_x,sendTime,rcvTime,hed_noise_y
4303919,0.499989,3987,2.844516,8.4e-05,10.009405,31849.123586,31849.123586,10.009405
465063,0.499996,1754,15.58077,15.574555,5.146283,26602.020234,26602.020234,7.16779
2768324,1.00001,1703,10.132631,10.107281,0.25903,30035.937842,30035.937842,0.480312
1119575,0.999996,281,16.811452,13.421082,11.794527,27696.32548,27696.32548,15.402174
2819481,0.250001,3932,12.011449,4.318957,7.207966,30013.710952,30013.710952,10.472565


In [17]:
from sklearn.model_selection import train_test_split

X = np.array(data)

# Séparation en données d'entrainement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [22]:
# import Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# instantiate the classifier 
rfc = RandomForestClassifier(random_state=0)

# fit the model
rfc.fit(X_train, y_train)

# Predict the Test set results
y_pred = rfc.predict(X_test)

# Check accuracy score 
from sklearn.metrics import accuracy_score

print('Model accuracy score with 10 decision-trees : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

print(classification_report(y_test, y_pred))

Model accuracy score with 10 decision-trees : 0.9994
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2225
           1       1.00      1.00      1.00      2775

    accuracy                           1.00      5000
   macro avg       1.00      1.00      1.00      5000
weighted avg       1.00      1.00      1.00      5000



In [24]:
# instantiate the classifier with n_estimators = 100
rfc_100 = RandomForestClassifier(n_estimators=100, random_state=0)

# fit the model to the training set
rfc_100.fit(X_train, y_train)

# Predict on the test set results
y_pred_100 = rfc_100.predict(X_test)

# Check accuracy score 
print('Model accuracy score with 100 decision-trees : {0:0.4f}'. format(accuracy_score(y_test, y_pred_100)))

Model accuracy score with 100 decision-trees : 0.9994


In [25]:
print(classification_report(y_test, y_pred_100))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2225
           1       1.00      1.00      1.00      2775

    accuracy                           1.00      5000
   macro avg       1.00      1.00      1.00      5000
weighted avg       1.00      1.00      1.00      5000

