In [2]:
# Simple script to format the dataset in a pandas dataframe for easy visualization
# SKlearn is compatible with pandas so can be simple to use
# Pandas also provides additional tools for splicing or utilizing data that lists do not

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

In [4]:
# Read the original dataset
data_path = "../dataset/Original/data_injection_and_normal_events_dataset.csv"
df = pd.read_csv(data_path)


In [5]:
# Print out the dataframe in viewable format
df.head()

Unnamed: 0,R1-PA1:VH,R1-PM1:V,R1-PA2:VH,R1-PM2:V,R1-PA3:VH,R1-PM3:V,R1-PA4:IH,R1-PM4:I,R1-PA5:IH,R1-PM5:I,...,control_panel_log4,relay1_log,relay2_log,relay3_log,relay4_log,snort_log1,snort_log2,snort_log3,snort_log4,marker
0,90.578898,130456.2238,-29.415653,129804.3188,-149.410204,130506.3704,88.986075,471.50825,-33.644082,486.15705,...,0,0,0,0,0,0,0,0,0,1
1,90.573168,130506.3704,-29.421383,129854.4653,-149.410204,130556.5169,88.951698,471.50825,-33.644082,485.79083,...,0,0,0,0,0,0,0,0,0,1
2,90.504413,130807.2496,-29.472949,130180.4178,-149.4675,130857.3961,88.367281,473.70557,-33.781592,484.69217,...,0,0,0,0,0,0,0,0,0,1
3,90.429929,130932.6159,-29.553163,130330.8575,-149.536255,131007.8358,87.782864,476.26911,-33.970668,485.05839,...,0,0,0,0,0,0,0,0,0,1
4,90.378363,130982.7625,-29.610459,130355.9307,-149.59928,131057.9823,87.43336,477.73399,-34.068071,485.2415,...,0,0,0,0,0,0,0,0,0,1


In [6]:
df.shape # (Number of sample, Number of features)


(32296, 129)

In [7]:
# inf values in dataset cause issue with sklearn
# Replace with zero
df.replace([np.inf, -np.inf], 0, inplace=True)

In [8]:
# Split into attack and natural to check size
df_attack =  df[df.marker == 1]

df_natural = df[df.marker == 0]

In [9]:
df_attack.head()

Unnamed: 0,R1-PA1:VH,R1-PM1:V,R1-PA2:VH,R1-PM2:V,R1-PA3:VH,R1-PM3:V,R1-PA4:IH,R1-PM4:I,R1-PA5:IH,R1-PM5:I,...,control_panel_log4,relay1_log,relay2_log,relay3_log,relay4_log,snort_log1,snort_log2,snort_log3,snort_log4,marker
0,90.578898,130456.2238,-29.415653,129804.3188,-149.410204,130506.3704,88.986075,471.50825,-33.644082,486.15705,...,0,0,0,0,0,0,0,0,0,1
1,90.573168,130506.3704,-29.421383,129854.4653,-149.410204,130556.5169,88.951698,471.50825,-33.644082,485.79083,...,0,0,0,0,0,0,0,0,0,1
2,90.504413,130807.2496,-29.472949,130180.4178,-149.4675,130857.3961,88.367281,473.70557,-33.781592,484.69217,...,0,0,0,0,0,0,0,0,0,1
3,90.429929,130932.6159,-29.553163,130330.8575,-149.536255,131007.8358,87.782864,476.26911,-33.970668,485.05839,...,0,0,0,0,0,0,0,0,0,1
4,90.378363,130982.7625,-29.610459,130355.9307,-149.59928,131057.9823,87.43336,477.73399,-34.068071,485.2415,...,0,0,0,0,0,0,0,0,0,1


In [10]:
df_attack.shape

(9582, 129)

In [11]:
df_natural.head()

Unnamed: 0,R1-PA1:VH,R1-PM1:V,R1-PA2:VH,R1-PM2:V,R1-PA3:VH,R1-PM3:V,R1-PA4:IH,R1-PM4:I,R1-PA5:IH,R1-PM5:I,...,control_panel_log4,relay1_log,relay2_log,relay3_log,relay4_log,snort_log1,snort_log2,snort_log3,snort_log4,marker
9582,-139.612626,131885.4002,100.410854,131835.2537,-19.572238,131960.62,-137.767702,430.49161,96.153777,462.71897,...,0,0,0,0,0,0,0,0,0,0
9583,-140.174125,131383.9348,99.855085,131358.8615,-20.133737,131459.1546,-138.655786,433.23826,95.506335,464.18385,...,0,0,0,0,0,0,0,0,0,0
9584,-140.603843,131057.9823,99.419637,131007.8358,-20.574914,131133.2021,-139.194367,435.06936,95.185479,465.28251,...,0,0,0,0,0,0,0,0,0,0
9585,-140.684057,130982.7625,99.322234,130932.6159,-20.660858,131057.9823,-139.383443,436.16802,95.145371,465.64873,...,0,0,0,0,0,0,0,0,0,0
9586,-141.990401,130130.2713,98.033079,130105.198,-21.961472,130205.4911,-141.32004,442.21065,94.131236,466.74739,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df_natural.shape

(22714, 129)

In [13]:
# Split original dataset into training/testing
train_df, test_df = train_test_split(df, test_size=0.3) # 70/30 split

In [14]:
train_df.shape

(22607, 129)

In [15]:
test_df.shape

(9689, 129)

In [16]:
# Check for types in dataframe to be compatible with SKlearn
for column in train_df:
    if df[column].dtype != 'int64' and df[column].dtype != 'float64':
        print(column)


In [17]:
# Break off the last column to act as the label for ML models
# There is probably a built in pandas function to do this more elegantly

label_train_df = train_df['marker']
train_df.drop('marker', axis=1, inplace=True)

In [18]:
train_df.head()

Unnamed: 0,R1-PA1:VH,R1-PM1:V,R1-PA2:VH,R1-PM2:V,R1-PA3:VH,R1-PM3:V,R1-PA4:IH,R1-PM4:I,R1-PA5:IH,R1-PM5:I,...,control_panel_log3,control_panel_log4,relay1_log,relay2_log,relay3_log,relay4_log,snort_log1,snort_log2,snort_log3,snort_log4
25305,-38.9554,132712.8181,-158.949952,132060.9131,81.073528,132762.9647,-37.225068,285.6516,-158.073326,289.13069,...,0,0,0,0,0,0,0,0,0,0
660,75.88826,131133.2021,-44.094832,130506.3704,-164.089383,131183.3486,72.811477,463.63452,-48.311801,468.7616,...,0,0,0,0,0,0,0,0,0,0
2573,37.225068,132386.8656,-82.769483,131734.9606,157.236808,132437.0121,38.817891,261.48108,-80.729753,260.74864,...,0,0,0,0,0,0,0,0,0,0
27285,-56.992112,130130.2713,-176.963745,130130.2713,63.048276,130205.4911,-59.157892,469.86026,-178.350303,466.9305,...,0,0,0,0,0,0,0,0,0,0
25377,-150.82541,131383.9348,89.209529,131383.9348,-30.785022,131484.2279,-153.575607,419.3219,86.631219,420.60367,...,0,0,0,0,0,0,0,0,0,0


In [19]:
label_train_df.shape

(22607,)

In [20]:
test_labels = test_df['marker']
test_df.drop('marker', axis=1, inplace=True)

In [21]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(train_df, label_train_df)

model_prediction = rf_classifier.predict(test_df)

print(classification_report(test_labels, model_prediction))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      6794
           1       0.92      0.76      0.84      2895

    accuracy                           0.91      9689
   macro avg       0.92      0.87      0.89      9689
weighted avg       0.91      0.91      0.91      9689



In [22]:
# Train SVM classifier
svm_classifier = SVC(kernel='linear')

svm_classifier.fit(train_df, label_train_df)

model_prediction = svm_classifier.predict(test_df)

print(classification_report(test_labels, model_prediction))

              precision    recall  f1-score   support

           0       0.70      0.98      0.82      6794
           1       0.47      0.03      0.06      2895

    accuracy                           0.70      9689
   macro avg       0.59      0.51      0.44      9689
weighted avg       0.64      0.70      0.59      9689



In [40]:
#TODO: Convert to float64 to int64 for this classifier
lr_classifier = LogisticRegression()
lr_classifier.fit(train_df, label_train_df)

model_prediction = lr_classifier.predict(test_df)
print(classification_report(test_labels, model_prediction))

              precision    recall  f1-score   support

           0       0.70      0.99      0.82      6794
           1       0.46      0.01      0.02      2895

    accuracy                           0.70      9689
   macro avg       0.58      0.50      0.42      9689
weighted avg       0.63      0.70      0.58      9689



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
nb_classifier = GaussianNB()
nb_classifier.fit(train_df, label_train_df)

model_prediction = nb_classifier.predict(test_df)
print(classification_report(test_labels, model_prediction))


              precision    recall  f1-score   support

           0       0.70      0.98      0.82      6794
           1       0.34      0.02      0.04      2895

    accuracy                           0.70      9689
   macro avg       0.52      0.50      0.43      9689
weighted avg       0.59      0.70      0.59      9689



In [25]:
knn_classifier = KNeighborsClassifier(n_neighbors=7)
knn_classifier.fit(train_df, label_train_df)

model_prediction = knn_classifier.predict(test_df)
print(classification_report(test_labels, model_prediction))


              precision    recall  f1-score   support

           0       0.83      0.90      0.86      6794
           1       0.71      0.57      0.63      2895

    accuracy                           0.80      9689
   macro avg       0.77      0.74      0.75      9689
weighted avg       0.80      0.80      0.80      9689



In [26]:
dt_classifier = DecisionTreeClassifier(max_depth=3)
dt_classifier.fit(train_df, label_train_df)

model_prediction = dt_classifier.predict(test_df)

print(classification_report(test_labels, model_prediction))

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      6794
           1       0.75      0.02      0.05      2895

    accuracy                           0.71      9689
   macro avg       0.73      0.51      0.44      9689
weighted avg       0.72      0.71      0.59      9689



In [27]:
ada_classifier = AdaBoostClassifier()
ada_classifier.fit(train_df, label_train_df)

model_prediction = ada_classifier.predict(test_df)
print(classification_report(test_labels, model_prediction))

              precision    recall  f1-score   support

           0       0.72      0.96      0.82      6794
           1       0.54      0.10      0.17      2895

    accuracy                           0.71      9689
   macro avg       0.63      0.53      0.50      9689
weighted avg       0.66      0.71      0.63      9689



In [28]:
gb_classifier = GradientBoostingClassifier()
gb_classifier.fit(train_df, label_train_df)

model_prediction = gb_classifier.predict(test_df)
print(classification_report(test_labels, model_prediction))

              precision    recall  f1-score   support

           0       0.72      0.99      0.84      6794
           1       0.81      0.12      0.21      2895

    accuracy                           0.73      9689
   macro avg       0.77      0.55      0.52      9689
weighted avg       0.75      0.73      0.65      9689



In [29]:
b_svc_classifier = BaggingClassifier(base_estimator=SVC(),
                                     n_estimators=10)
b_svc_classifier.fit(train_df, label_train_df)

model_prediction = b_svc_classifier.predict(test_df)
print(classification_report(test_labels, model_prediction))

              precision    recall  f1-score   support

           0       0.70      1.00      0.82      6794
           1       0.00      0.00      0.00      2895

    accuracy                           0.70      9689
   macro avg       0.35      0.50      0.41      9689
weighted avg       0.49      0.70      0.58      9689



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
