### Dataset creation

In [2]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

- load the data and split off the labels (the last CSV column holds the class id)

In [4]:
# Read the CSV ('?' marks a missing value, replaced by 0) and shuffle the rows.
# NOTE(review): shuffle() is called without a random_state, so the row order differs between runs.
df_data = shuffle(
    pd.read_csv("../../datasets/arythmia/arythmia.csv", na_values='?').fillna(0)
)
# Last column -> labels (kept as strings); every other column -> float features.
df_labels = df_data.iloc[:, [-1]].astype(str)
df_data = df_data.iloc[:, :-1].astype(float)

- name the label column (reading column names and types from a names file is disabled below)

In [5]:
# Disabled: the commented-out block below parsed column names and types from
# "datasets/kddcup/kddcup.names" and assigned the names to df_data. Because it
# is commented out, df_data keeps the column names it was loaded with.
# col_names=[]
# col_datatypes=[]

# with open("datasets/kddcup/kddcup.names") as file:
#     next(file)    ### skip first line
#     for line in file:
#         name, datatype = line.split(": ")
#         col_names.append(name)
#         col_datatypes.append(datatype.replace(".\n",""))

# df_data.columns = col_names
# The only active statement: give the single label column a fixed name.
df_labels.columns = ["labels"]

In [6]:
df_data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,jj,jk,jl,jm,jn,jo,jp,jq,jr,js
78,56.0,1.0,165.0,73.0,90.0,147.0,388.0,178.0,92.0,88.0,...,-1.0,-0.5,14.8,-2.4,0.0,0.0,1.4,2.4,26.8,51.2
139,51.0,1.0,160.0,80.0,85.0,167.0,355.0,151.0,127.0,36.0,...,0.2,0.0,6.4,-1.3,0.0,0.0,0.6,1.5,12.0,24.0
43,34.0,1.0,165.0,61.0,84.0,152.0,383.0,144.0,71.0,59.0,...,-0.5,-1.0,16.3,0.0,0.0,0.0,0.4,-1.1,37.9,22.1
290,60.0,1.0,158.0,75.0,91.0,137.0,387.0,188.0,97.0,-16.0,...,-0.2,0.0,7.0,-2.5,0.0,0.0,0.2,0.3,13.1,14.9
55,72.0,1.0,160.0,70.0,77.0,142.0,392.0,160.0,88.0,30.0,...,-0.2,0.0,9.0,0.0,0.0,0.0,0.5,2.6,27.0,46.7


- check integrity of the data

In [7]:
# Report whether any cell of df_data is missing (missing values were replaced by 0 at load time).
print(df_data.isnull().values.any())

# isnull() is an alias of isna(), so this prints the same result as the line above.
print(df_data.isna().values.any())

False
False


### Enumerate labels 

- ### classes 3, 4, 5, 7, 8, 9, 14 and 15 are labelled "anomaly" since they are the minority (stated in paper); all other classes are labelled "normal"

In [8]:
# Classes 3, 4, 5, 7, 8, 9, 14 and 15 form the 'anomaly' group; every other class is 'normal'.
# Compare whole label values with isin(): a substring regex such as "3|4|5|..." would
# also flag any label that merely contains one of those digits (e.g. "13" or "35").
# Assumes the labels are integer-like strings such as "3" (not "3.0"), as produced by astype(str) on an int column.
df_data["label"] = np.where(
    df_labels["labels"].isin(["3", "4", "5", "7", "8", "9", "14", "15"]),
    'anomaly',
    'normal',
)
print(df_data['label'])

78      normal
139     normal
43      normal
290     normal
55      normal
145     normal
40      normal
293     normal
347     normal
414     normal
15      normal
123     normal
205     normal
240     normal
230     normal
64      normal
357     normal
30      normal
408     normal
431     normal
132     normal
162     normal
52      normal
126     normal
281     normal
339     normal
112     normal
157     normal
194     normal
207    anomaly
        ...   
236     normal
181     normal
403    anomaly
231    anomaly
60     anomaly
446     normal
399     normal
171     normal
160     normal
129     normal
404     normal
61     anomaly
291     normal
244     normal
289     normal
200     normal
101     normal
90      normal
368     normal
164     normal
405     normal
299     normal
427     normal
265     normal
441     normal
344     normal
279     normal
303    anomaly
87     anomaly
197     normal
Name: label, Length: 452, dtype: object


In [9]:
### number of normal and abnormal
# Count the rows by filtering the frame on each label value.
print("number of normal:", len(df_data[df_data["label"] == 'normal']))
print("number of anomaly:", len(df_data[df_data["label"] == 'anomaly']))

number of normal: 386
number of anomaly: 66


In [10]:
## check in which location data is abnormal

np.where(df_data["label"] == 'anomaly')

(array([ 29,  31,  40,  52,  60,  66,  74,  77,  87,  98, 116, 124, 127,
        132, 139, 140, 145, 152, 153, 158, 160, 164, 169, 175, 188, 189,
        206, 208, 222, 229, 230, 238, 248, 256, 260, 264, 271, 278, 279,
        284, 285, 291, 292, 293, 301, 324, 325, 336, 341, 359, 360, 371,
        376, 387, 388, 399, 401, 402, 408, 415, 424, 425, 426, 433, 449,
        450]),)

In [11]:
### Spot check: rows 100-149 of the derived label next to the original class id.

# Last column of df_data is the 'label' column added above.
print(df_data.iloc[100:150, -1])
# Original class ids for the same row positions.
print(df_labels.iloc[100:150, -1])
# df_data.head()

147     normal
393     normal
451     normal
36      normal
62      normal
345     normal
365     normal
148     normal
229     normal
391     normal
380     normal
106     normal
155     normal
44      normal
159     normal
343     normal
88     anomaly
74      normal
72      normal
38      normal
366     normal
107     normal
324     normal
70      normal
45     anomaly
406     normal
17      normal
218    anomaly
336     normal
266     normal
14      normal
333     normal
376    anomaly
311     normal
51      normal
447     normal
438     normal
255     normal
177     normal
89     anomaly
285    anomaly
390     normal
7       normal
99      normal
328     normal
251    anomaly
402     normal
292     normal
73      normal
120     normal
Name: label, dtype: object
147     1
393     1
451     1
36      1
62      1
345     1
365     1
148     1
229     1
391     1
380    16
106     6
155     1
44      1
159     1
343     6
88      9
74      1
72      1
38      1
366     2
107     1
324

In [12]:
### for each object column in data

# Object-dtype columns are one-hot encoded; every other column is normalized.
cols_toOneHot = df_data.select_dtypes(include='object').columns.tolist()

cols_all = df_data.columns.tolist()
# Keep the DataFrame's own column order. A set difference returns the names in an
# arbitrary, run-dependent order, which makes the printed list non-reproducible.
cols_toNormalize = [col for col in cols_all if col not in cols_toOneHot]

In [13]:
print("columns to normalize:\n%s\n" % cols_toNormalize)
print("columns to one-hot:\n%s\n" % cols_toOneHot)

columns to normalize:
['ay', 'bu', 'jp', 'jc', 'fp', 'dx', 'cj', 'du', 'm', 'bv', 'y', 'ac', 'cr', 'as', 'dc', 'ao', 'ja', 'jf', 'ha', 'hl', 'fw', 'ck', 'bx', 'fu', 'cd', 'bz', 'ij', 'ik', 'ey', 'gd', 'ab', 'in', 'jh', 'ht', 'ct', 'jb', 'hk', 'ft', 'gm', 'bn', 'cz', 'gz', 'az', 'dp', 'fe', 'hc', 'et', 'ff', 'hy', 'cn', 'i', 'cm', 'eb', 'gw', 'hh', 'ei', 'hu', 'bj', 'fk', 'ap', 'dm', 't', 'aq', 'aa', 'ci', 'ch', 'gc', 's', 'hj', 'b', 'hd', 'gk', 'bw', 'gq', 'br', 'df', 'ee', 'aw', 'hf', 'cp', 'ia', 'ec', 'jd', 'fo', 'ix', 'ce', 'ge', 'fv', 'a', 'hq', 'iv', 'hi', 'gt', 'ah', 'gl', 'iq', 'bb', 'w', 'fl', 'f', 'gg', 'ew', 'cg', 'ho', 'cf', 'jk', 'dq', 'ex', 'gv', 'ea', 'ji', 'cx', 'ga', 'gj', 'ak', 'cy', 'p', 'em', 'io', 'fg', 'e', 'bf', 'dn', 'eq', 'di', 'hn', 'el', 'dy', 'l', 'av', 'n', 'fi', 'v', 'je', 'gx', 'u', 'gu', 'fa', 'he', 'fc', 'bi', 'jm', 'cc', 'fx', 'bl', 'es', 'fr', 'eh', 'cb', 'fj', 'bh', 'eo', 'if', 'gr', 'h', 'af', 'fq', 'ae', 'hb', 'ax', 'dg', 'bs', 'hz', 'am', 'go', 'it

In [14]:
len(cols_toNormalize)

279

### Dataset preprocessing in numerical columns

In [15]:
print(df_data.aa.dtype)

float64


In [16]:
def checkStats(df_data):
    """Print min, max and standard deviation for every numeric column.

    Args:
        df_data: DataFrame to summarise.

    Non-numeric columns are skipped based on their dtype rather than on a
    notebook-level list of names, so the function does not depend on any
    global. Skipped columns still consume an index, so the printed number is
    the column's position in ``df_data``. The standard deviation is NumPy's
    default (population, ddof=0).
    """
    for i, col in enumerate(df_data.columns):
        if not pd.api.types.is_numeric_dtype(df_data[col]):
            continue

        values = df_data[col].values
        print("column:%d min:%f max:%f std:%f" % (i, values.min(), values.max(), values.std()))

In [17]:
## before normalization
# Print per-column statistics of the raw feature values (the former
# `df_data = df_data` self-assignment was a no-op and has been dropped).
checkStats(df_data)

column:0 min:0.000000 max:83.000000 std:16.448406
column:1 min:0.000000 max:1.000000 std:0.497404
column:2 min:105.000000 max:780.000000 std:37.129200
column:3 min:6.000000 max:176.000000 std:16.572440
column:4 min:55.000000 max:188.000000 std:15.347388
column:5 min:0.000000 max:524.000000 std:44.792651
column:6 min:232.000000 max:509.000000 std:33.348470
column:7 min:108.000000 max:381.000000 std:35.593633
column:8 min:0.000000 max:205.000000 std:25.798058
column:9 min:-172.000000 max:169.000000 std:45.381150
column:10 min:-177.000000 max:179.000000 std:57.477332
column:11 min:-170.000000 max:176.000000 std:30.465943
column:12 min:-135.000000 max:166.000000 std:35.982320
column:13 min:-179.000000 max:178.000000 std:52.071135
column:14 min:0.000000 max:163.000000 std:14.275317
column:15 min:0.000000 max:88.000000 std:10.638214
column:16 min:0.000000 max:156.000000 std:18.229702
column:17 min:0.000000 max:88.000000 std:20.518993
column:18 min:0.000000 max:24.000000 std:1.567746
column:1

In [18]:
### normalization

### min-max scaling: subtract the column min, then divide by (max - min)

def normalise(df_data, cols_toNormalize):
    """Min-max scale the given columns of ``df_data`` in place.

    Each listed column is rescaled as (value - min) / (max - min). A column
    whose max equals its min is left unchanged.

    Args:
        df_data: DataFrame whose columns are rescaled (modified in place).
        cols_toNormalize: iterable of column names to rescale.

    Returns:
        The same ``df_data`` object that was passed in.
    """
    for name in cols_toNormalize:
        lo, hi = df_data[name].min(), df_data[name].max()
        span = hi - lo

        # Constant column: nothing to scale (and it would divide by zero).
        if span != 0:
            df_data[name] = (df_data[name] - lo) / span

    return df_data



"""
### mean substr and std dev div
for col in cols_toNormalize:
    mean = df_data[col].mean()
    std = df_data[col].std()
    
    if mean == 0 or std == 0:     ### columns 'num_outbound_cmds', 'is_host_login' has zero mean and std dev!!!
        continue
    
    df_data[col] = (df_data[col] - mean) / std
"""

"\n### mean substr and std dev div\nfor col in cols_toNormalize:\n    mean = df_data[col].mean()\n    std = df_data[col].std()\n    \n    if mean == 0 or std == 0:     ### columns 'num_outbound_cmds', 'is_host_login' has zero mean and std dev!!!\n        continue\n    \n    df_data[col] = (df_data[col] - mean) / std\n"

In [19]:
# Min-max normalise the numerical columns, then print the per-column stats again.
# NOTE(review): the previous comment here ("Skipping normalisation for thyroid dataset.
# Added by Jyotirmay") did not match the code: normalise() is called on the line below.
df_data=normalise(df_data, cols_toNormalize)
checkStats(df_data)


column:0 min:0.000000 max:1.000000 std:0.198174
column:1 min:0.000000 max:1.000000 std:0.497404
column:2 min:0.000000 max:1.000000 std:0.055006
column:3 min:0.000000 max:1.000000 std:0.097485
column:4 min:0.000000 max:1.000000 std:0.115394
column:5 min:0.000000 max:1.000000 std:0.085482
column:6 min:0.000000 max:1.000000 std:0.120392
column:7 min:0.000000 max:1.000000 std:0.130380
column:8 min:0.000000 max:1.000000 std:0.125844
column:9 min:0.000000 max:1.000000 std:0.133083
column:10 min:0.000000 max:1.000000 std:0.161453
column:11 min:0.000000 max:1.000000 std:0.088052
column:12 min:0.000000 max:1.000000 std:0.119543
column:13 min:0.000000 max:1.000000 std:0.145858
column:14 min:0.000000 max:1.000000 std:0.087579
column:15 min:0.000000 max:1.000000 std:0.120889
column:16 min:0.000000 max:1.000000 std:0.116857
column:17 min:0.000000 max:1.000000 std:0.233170
column:18 min:0.000000 max:1.000000 std:0.065323
column:19 min:0.000000 max:0.000000 std:0.000000
column:20 min:0.000000 max:1.0

### One hot encoding of required columns

In [20]:
### obtain one hot encoding

# Pin the indicator dtype. pandas >= 2.0 defaults to bool indicators, and mixing
# bool with the float feature columns turns `df_data.values` into an object
# array; uint8 keeps the exported array numeric on every pandas version.
df_data = pd.get_dummies(df_data, columns=cols_toOneHot, dtype=np.uint8)

- Check the resulting dataset

In [21]:
df_data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,jl,jm,jn,jo,jp,jq,jr,js,label_anomaly,label_normal
78,0.674699,1.0,0.088889,0.394118,0.263158,0.280534,0.563177,0.25641,0.44878,0.762463,...,0.627119,0.916084,0.0,0.0,0.6875,0.7,0.533835,0.58123,0,1
139,0.614458,1.0,0.081481,0.435294,0.225564,0.318702,0.444043,0.157509,0.619512,0.609971,...,0.271186,0.954545,0.0,0.0,0.4375,0.625,0.422556,0.405178,0,1
43,0.409639,1.0,0.088889,0.323529,0.218045,0.290076,0.545126,0.131868,0.346341,0.677419,...,0.690678,1.0,0.0,0.0,0.375,0.408333,0.617293,0.39288,0,1
290,0.722892,1.0,0.078519,0.405882,0.270677,0.26145,0.559567,0.29304,0.473171,0.457478,...,0.29661,0.912587,0.0,0.0,0.3125,0.525,0.430827,0.346278,0,1
55,0.86747,1.0,0.081481,0.376471,0.165414,0.270992,0.577617,0.190476,0.429268,0.592375,...,0.381356,1.0,0.0,0.0,0.40625,0.716667,0.535338,0.552104,0,1


In [22]:
### shape is correct

df_data.shape

(452, 281)

In [23]:
df_data.describe()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,jl,jm,jn,jo,jp,jq,jr,js,label_anomaly,label_normal
count,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,...,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0
mean,0.559894,0.550885,0.090649,0.365708,0.25504,0.296093,0.488115,0.22692,0.439046,0.603158,...,0.38339,0.949045,0.004978,0.0,0.410882,0.601862,0.47764,0.440603,0.146018,0.853982
std,0.198393,0.497955,0.055067,0.097593,0.115522,0.085577,0.120525,0.130524,0.125984,0.13323,...,0.147155,0.070015,0.062648,0.0,0.108603,0.118838,0.101533,0.119702,0.353515,0.353515
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.433735,0.0,0.081481,0.311765,0.18797,0.270992,0.425993,0.14652,0.385366,0.515396,...,0.279661,0.926573,0.0,0.0,0.375,0.541667,0.418421,0.36343,0.0,1.0
50%,0.566265,1.0,0.087407,0.364706,0.233083,0.299618,0.487365,0.197802,0.443902,0.621701,...,0.372881,0.961538,0.0,0.0,0.40625,0.6125,0.468421,0.430421,0.0,1.0
75%,0.698795,1.0,0.096296,0.429412,0.293233,0.333969,0.548736,0.260073,0.497561,0.697947,...,0.474576,1.0,0.0,0.0,0.46875,0.675,0.526504,0.516019,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
### check again

checkStats(df_data)

column:0 min:0.000000 max:1.000000 std:0.198174
column:1 min:0.000000 max:1.000000 std:0.497404
column:2 min:0.000000 max:1.000000 std:0.055006
column:3 min:0.000000 max:1.000000 std:0.097485
column:4 min:0.000000 max:1.000000 std:0.115394
column:5 min:0.000000 max:1.000000 std:0.085482
column:6 min:0.000000 max:1.000000 std:0.120392
column:7 min:0.000000 max:1.000000 std:0.130380
column:8 min:0.000000 max:1.000000 std:0.125844
column:9 min:0.000000 max:1.000000 std:0.133083
column:10 min:0.000000 max:1.000000 std:0.161453
column:11 min:0.000000 max:1.000000 std:0.088052
column:12 min:0.000000 max:1.000000 std:0.119543
column:13 min:0.000000 max:1.000000 std:0.145858
column:14 min:0.000000 max:1.000000 std:0.087579
column:15 min:0.000000 max:1.000000 std:0.120889
column:16 min:0.000000 max:1.000000 std:0.116857
column:17 min:0.000000 max:1.000000 std:0.233170
column:18 min:0.000000 max:1.000000 std:0.065323
column:19 min:0.000000 max:0.000000 std:0.000000
column:20 min:0.000000 max:1.0

In [25]:
## check integrity of the data again (after normalisation and one-hot encoding)

# True if any cell of df_data is missing.
print(df_data.isnull().values.any())

# isnull() is an alias of isna(), so this prints the same result as the line above.
print(df_data.isna().values.any())

False
False


In [26]:
### check again
# Last column after one-hot encoding (label_normal in the output below).
print(df_data.iloc[140:150, -1])

# Second-to-last column (label_anomaly in the output below).
print(df_data.iloc[140:150, -2])

# Original class ids at the same row positions, for comparison.
print(df_labels.iloc[140:150, -1])

285    0
390    1
7      1
99     1
328    1
251    0
402    1
292    1
73     1
120    1
Name: label_normal, dtype: uint8
285    1
390    0
7      0
99     0
328    0
251    1
402    0
292    0
73     0
120    0
Name: label_anomaly, dtype: uint8
285    3
390    1
7      1
99     1
328    1
251    4
402    1
292    1
73     1
120    1
Name: labels, dtype: object


### Save dataset

In [27]:
dataset = df_data.values

In [28]:
dataset.shape

(452, 281)

In [31]:
np.savez_compressed("../../datasets/arythmia/arythmia.npz", dataset=dataset)

### Produce train and test splits

In [32]:
# Seed handed to train_test_split. None leaves the split unseeded, so it differs
# between runs. The value is also embedded in the saved train/test file names.
random_state = None

In [33]:
from sklearn.model_selection import train_test_split

# Split the rows in half (test_size=0.5). No `stratify` argument is passed, so the
# normal/anomaly ratio is not forced to match between the two halves.
x_train, x_test = train_test_split(dataset, test_size=0.5, random_state=random_state)

In [34]:
x_train.shape

(226, 281)

In [35]:
x_test.shape

(226, 281)

-  The last column (label_normal) is 1 for normal rows and 0 for anomalies; the training rows are split on it below so that only the normal rows are kept for training

In [36]:
# Last column is the one-hot "normal" indicator: 1 -> normal row, 0 -> anomaly row.
x_train_normal = x_train[x_train[:, -1] == 1]
x_train_anomaly = x_train[x_train[:, -1] == 0]

In [37]:
x_train_normal.shape

(198, 281)

In [38]:
x_train_anomaly.shape   

(28, 281)

In [39]:
x_train = x_train_normal

- x_train consist of "normal" values

In [40]:
x_train.shape

(198, 281)

In [41]:
x_test.shape

(226, 281)

- Assign arbitrary output as labels

In [42]:
# Placeholder targets: one two-column row per training sample, every row set to [1, 0].
y_train = np.zeros((len(x_train),2))
y_train[:,0] = 1
y_train.shape

(198, 2)

In [43]:
# The random_state value is embedded in both file names.
train_name = "../../datasets/arythmia/arythmia_train-randomState_"+str(random_state)+".npz"
test_name = "../../datasets/arythmia/arythmia_test-randomState_"+str(random_state)+".npz"

# Train archive: x_train together with its placeholder targets y_train.
np.savez_compressed(train_name, x_train=x_train, y_train=y_train)
# Test archive: only x_test is stored; no separate label array is written.
np.savez_compressed(test_name, x_test=x_test)