In [129]:
# Simple script that loads the dataset into a pandas DataFrame for easy visualization.
# scikit-learn is compatible with pandas, so the DataFrame can be passed to the models directly.
# pandas also provides tools for slicing and manipulating the data that plain lists do not.

In [160]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [131]:
# Load the original data-injection / normal-events dataset from its CSV file
data_path = "../dataset/Original/data_injection_and_normal_events_dataset.csv"
df = pd.read_csv(filepath_or_buffer=data_path)


In [132]:
# Preview the first five rows of the full dataset (head() defaults to 5 rows)
df.head()

Unnamed: 0,R1-PA1:VH,R1-PM1:V,R1-PA2:VH,R1-PM2:V,R1-PA3:VH,R1-PM3:V,R1-PA4:IH,R1-PM4:I,R1-PA5:IH,R1-PM5:I,...,control_panel_log4,relay1_log,relay2_log,relay3_log,relay4_log,snort_log1,snort_log2,snort_log3,snort_log4,marker
0,90.578898,130456.2238,-29.415653,129804.3188,-149.410204,130506.3704,88.986075,471.50825,-33.644082,486.15705,...,0,0,0,0,0,0,0,0,0,1
1,90.573168,130506.3704,-29.421383,129854.4653,-149.410204,130556.5169,88.951698,471.50825,-33.644082,485.79083,...,0,0,0,0,0,0,0,0,0,1
2,90.504413,130807.2496,-29.472949,130180.4178,-149.4675,130857.3961,88.367281,473.70557,-33.781592,484.69217,...,0,0,0,0,0,0,0,0,0,1
3,90.429929,130932.6159,-29.553163,130330.8575,-149.536255,131007.8358,87.782864,476.26911,-33.970668,485.05839,...,0,0,0,0,0,0,0,0,0,1
4,90.378363,130982.7625,-29.610459,130355.9307,-149.59928,131057.9823,87.43336,477.73399,-34.068071,485.2415,...,0,0,0,0,0,0,0,0,0,1


In [133]:
# (number of samples, number of columns); the column count includes the
# `marker` label, so it is one more than the number of features
df.shape


(32296, 129)

In [134]:
# Infinite values in the dataset cause problems for sklearn,
# so map both +inf and -inf to zero
df = df.replace(to_replace=[np.inf, -np.inf], value=0)

In [135]:
# Partition the rows by label to compare class sizes:
# marker == 1 -> attack rows, marker == 0 -> natural rows
is_attack = df["marker"] == 1
df_attack = df.loc[is_attack]

is_natural = df["marker"] == 0
df_natural = df.loc[is_natural]

In [136]:
# Preview the first five rows of the attack subset (marker == 1)
df_attack.head()

Unnamed: 0,R1-PA1:VH,R1-PM1:V,R1-PA2:VH,R1-PM2:V,R1-PA3:VH,R1-PM3:V,R1-PA4:IH,R1-PM4:I,R1-PA5:IH,R1-PM5:I,...,control_panel_log4,relay1_log,relay2_log,relay3_log,relay4_log,snort_log1,snort_log2,snort_log3,snort_log4,marker
0,90.578898,130456.2238,-29.415653,129804.3188,-149.410204,130506.3704,88.986075,471.50825,-33.644082,486.15705,...,0,0,0,0,0,0,0,0,0,1
1,90.573168,130506.3704,-29.421383,129854.4653,-149.410204,130556.5169,88.951698,471.50825,-33.644082,485.79083,...,0,0,0,0,0,0,0,0,0,1
2,90.504413,130807.2496,-29.472949,130180.4178,-149.4675,130857.3961,88.367281,473.70557,-33.781592,484.69217,...,0,0,0,0,0,0,0,0,0,1
3,90.429929,130932.6159,-29.553163,130330.8575,-149.536255,131007.8358,87.782864,476.26911,-33.970668,485.05839,...,0,0,0,0,0,0,0,0,0,1
4,90.378363,130982.7625,-29.610459,130355.9307,-149.59928,131057.9823,87.43336,477.73399,-34.068071,485.2415,...,0,0,0,0,0,0,0,0,0,1


In [137]:
# (rows, columns) of the attack subset
df_attack.shape

(9582, 129)

In [138]:
# Preview the first five rows of the natural subset (marker == 0)
df_natural.head()

Unnamed: 0,R1-PA1:VH,R1-PM1:V,R1-PA2:VH,R1-PM2:V,R1-PA3:VH,R1-PM3:V,R1-PA4:IH,R1-PM4:I,R1-PA5:IH,R1-PM5:I,...,control_panel_log4,relay1_log,relay2_log,relay3_log,relay4_log,snort_log1,snort_log2,snort_log3,snort_log4,marker
9582,-139.612626,131885.4002,100.410854,131835.2537,-19.572238,131960.62,-137.767702,430.49161,96.153777,462.71897,...,0,0,0,0,0,0,0,0,0,0
9583,-140.174125,131383.9348,99.855085,131358.8615,-20.133737,131459.1546,-138.655786,433.23826,95.506335,464.18385,...,0,0,0,0,0,0,0,0,0,0
9584,-140.603843,131057.9823,99.419637,131007.8358,-20.574914,131133.2021,-139.194367,435.06936,95.185479,465.28251,...,0,0,0,0,0,0,0,0,0,0
9585,-140.684057,130982.7625,99.322234,130932.6159,-20.660858,131057.9823,-139.383443,436.16802,95.145371,465.64873,...,0,0,0,0,0,0,0,0,0,0
9586,-141.990401,130130.2713,98.033079,130105.198,-21.961472,130205.4911,-141.32004,442.21065,94.131236,466.74739,...,0,0,0,0,0,0,0,0,0,0


In [139]:
# (rows, columns) of the natural subset
df_natural.shape

(22714, 129)

In [140]:
# Split original dataset into training/testing (70/30 split).
# A fixed random_state makes the shuffle reproducible, so re-running the notebook
# produces the same split and therefore comparable model metrics from run to run.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [141]:
# (rows, columns) of the training split (test_size=0.3 leaves ~70% of the rows here)
train_df.shape

(22607, 129)

In [142]:
# (rows, columns) of the testing split (~30% of the rows)
test_df.shape

(9689, 129)

In [143]:
# Check that every training column has a dtype compatible with sklearn.
# Prints the name of each column that is neither int64 nor float64;
# no output means every column is already numeric.
# The dtype is read from train_df itself (the frame that is passed to the models),
# not from the pre-split df.
for column in train_df:
    column_dtype = train_df[column].dtype
    if not (column_dtype == 'int64' or column_dtype == 'float64'):
        print(column)


In [144]:
# Break off the `marker` column to act as the label for ML models.
# DataFrame.pop returns the column as a Series and removes it from train_df
# in a single step, leaving train_df with feature columns only.
# NOTE: this mutates train_df, so the cell can only be run once per split.

label_train_df = train_df.pop('marker')

In [145]:
# Preview the training features; the `marker` column should no longer be present
train_df.head()

Unnamed: 0,R1-PA1:VH,R1-PM1:V,R1-PA2:VH,R1-PM2:V,R1-PA3:VH,R1-PM3:V,R1-PA4:IH,R1-PM4:I,R1-PA5:IH,R1-PM5:I,...,control_panel_log3,control_panel_log4,relay1_log,relay2_log,relay3_log,relay4_log,snort_log1,snort_log2,snort_log3,snort_log4
11476,58.23543,131760.0339,-61.753391,131133.2021,178.258629,131835.2537,56.379047,377.2066,-63.867605,378.48837,...,0,0,0,0,0,0,0,0,0,0
22103,3.80444,132136.1329,-116.195841,131484.2279,123.81045,132186.2794,-1.678766,502.82006,-124.263087,516.55331,...,0,0,0,0,0,0,0,0,0,0
9811,147.16421,131333.7883,27.163929,130681.8832,-92.801974,131383.9348,143.611871,444.9573,23.34803,446.23907,...,0,0,0,0,0,0,0,0,0,0
12025,-122.796315,131383.9348,117.238624,131333.7883,-2.761657,131459.1546,-124.950636,413.8286,114.259244,418.95568,...,0,0,0,0,0,0,0,0,0,0
6698,-88.464684,131057.9823,151.558796,131032.909,31.569975,131133.2021,-87.599517,391.67229,149.083618,409.61707,...,0,0,0,0,0,0,0,0,0,0


In [146]:
# Shape of the training labels: one entry per training row
label_train_df.shape

(22607,)

In [147]:
# Break off the `marker` column of the test split to use as ground-truth labels.
# DataFrame.pop returns the column and removes it from test_df in one step,
# leaving test_df with feature columns only.
test_labels = test_df.pop('marker')

In [148]:
# Fit a decision tree capped at depth 3 on all training feature columns
decision_tree_classifier = DecisionTreeClassifier(max_depth=3)
decision_tree_classifier.fit(X=train_df, y=label_train_df)

DecisionTreeClassifier(max_depth=3)

In [149]:
# Predict labels for the held-out test features with the fitted decision tree
model_prediction = decision_tree_classifier.predict(X=test_df)

In [150]:
# Per-class precision / recall / F1 of the predictions against the test labels
tree_report = classification_report(y_true=test_labels, y_pred=model_prediction)
print(tree_report)

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      6811
           1       0.77      0.02      0.05      2878

    accuracy                           0.71      9689
   macro avg       0.74      0.51      0.44      9689
weighted avg       0.73      0.71      0.60      9689



In [151]:
# Fit a linear-kernel SVM using only the first two feature columns
svm_classifier = SVC(kernel='linear')

svm_train_features = train_df.iloc[:, :2].values
svm_train_labels = label_train_df.values
svm_classifier.fit(svm_train_features, svm_train_labels)



SVC(kernel='linear')

In [152]:
# Evaluate the SVM on the test set, using the same first two feature columns
# it was trained on
svm_test_features = test_df.iloc[:, :2].values
model_prediction = svm_classifier.predict(svm_test_features)

print(classification_report(test_labels, model_prediction))

              precision    recall  f1-score   support

           0       0.70      1.00      0.83      6811
           1       0.64      0.00      0.01      2878

    accuracy                           0.70      9689
   macro avg       0.67      0.50      0.42      9689
weighted avg       0.69      0.70      0.58      9689



In [161]:
# Confusion matrix for whatever predictions model_prediction currently holds
# (rows: true class, columns: predicted class)
print(confusion_matrix(y_true=test_labels, y_pred=model_prediction))

[[6806    5]
 [2869    9]]
