### Dataset creation

In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

- load the data and the labels (the last CSV column holds the arrhythmia class code)

In [2]:
# Read the arrhythmia CSV; '?' marks a missing value and every missing value is replaced by 0.
df_data = pd.read_csv("../../datasets/arythmia/arythmia.csv", na_values='?')
# NOTE: the shuffle is unseeded, so the row order differs from run to run.
df_data = shuffle(df_data.fillna(0))
# The last column is the class label (kept as text); every column before it is a float feature.
df_labels = df_data.iloc[:, -1].to_frame().astype(str)
df_data = df_data.iloc[:, :-1].astype(float)

- name the label column (the feature column names are taken from the CSV header)

In [3]:
# The feature columns keep the names read from the CSV header row, so only the
# single label column needs an explicit name here.
df_labels.columns = ["labels"]

In [4]:
df_data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,jj,jk,jl,jm,jn,jo,jp,jq,jr,js
8,44.0,0.0,168.0,56.0,84.0,118.0,354.0,160.0,63.0,61.0,...,0.1,0.0,7.0,-1.3,0.0,0.0,0.6,2.1,12.5,30.9
67,26.0,1.0,160.0,65.0,71.0,150.0,350.0,165.0,81.0,66.0,...,-0.3,-0.4,12.1,0.0,0.0,0.0,-0.3,2.5,26.4,43.4
338,39.0,1.0,160.0,70.0,87.0,160.0,357.0,178.0,89.0,66.0,...,0.1,0.0,8.4,-1.4,0.0,0.0,0.4,2.6,17.6,44.1
309,40.0,0.0,176.0,74.0,92.0,216.0,362.0,161.0,86.0,83.0,...,0.1,0.0,5.4,0.0,0.0,0.0,0.0,1.4,14.0,25.7
5,13.0,0.0,169.0,51.0,100.0,167.0,321.0,174.0,91.0,107.0,...,0.0,-0.6,12.2,-2.8,0.0,0.0,0.9,2.2,13.5,31.1


- check integrity of the data

In [5]:
print(df_data.isnull().values.any())

print(df_data.isna().values.any())

False
False


### Enumerate labels 

- ### the minority classes (3, 4, 5, 7, 8, 9, 14 and 15) are labelled "anomaly"; all remaining classes are labelled "normal" (stated in paper)

In [6]:
# Arrhythmia class codes that are treated as anomalies; every other class code is treated as normal.
# Exact membership is used instead of a substring regex: a pattern such as "3|4|5" also matches any
# code that merely contains one of those digits (e.g. "13"), which would mislabel that class.
anomaly_classes = [3, 4, 5, 7, 8, 9, 14, 15]
# Parse the text labels as numbers so that "3" and "3.0" are both recognised; unparsable labels
# become NaN and therefore fall into the 'normal' branch.
is_anomaly = pd.to_numeric(df_labels["labels"], errors="coerce").isin(anomaly_classes)
df_data["label"] = np.where(is_anomaly, 'anomaly', 'normal')
print(df_data['label'])

8       normal
67      normal
338     normal
309    anomaly
5      anomaly
402     normal
175     normal
226     normal
351     normal
210     normal
173     normal
413     normal
69      normal
279     normal
418     normal
368     normal
235     normal
149     normal
288     normal
70      normal
306     normal
78      normal
177     normal
268     normal
440     normal
59      normal
3       normal
206     normal
357     normal
33      normal
        ...   
253    anomaly
347     normal
307     normal
22      normal
285    anomaly
32      normal
99      normal
411     normal
195     normal
68      normal
51      normal
60     anomaly
371     normal
172     normal
104     normal
360     normal
139     normal
393     normal
294     normal
44      normal
342     normal
246     normal
295     normal
302     normal
185    anomaly
217    anomaly
366     normal
7       normal
376    anomaly
174    anomaly
Name: label, Length: 452, dtype: object


In [7]:
### how many rows carry each label
normal_mask = df_data["label"] == 'normal'
anomaly_mask = df_data["label"] == 'anomaly'
print("number of normal:", int(normal_mask.sum()))
print("number of anomaly:", int(anomaly_mask.sum()))

number of normal: 386
number of anomaly: 66


In [8]:
## check in which location data is abnormal

np.where(df_data["label"] == 'anomaly')

(array([  3,   4,  44,  50,  64,  69,  75,  83,  84,  94,  97,  98, 102,
        106, 124, 128, 132, 136, 137, 158, 163, 165, 166, 167, 168, 170,
        186, 192, 195, 196, 197, 202, 205, 209, 223, 233, 240, 246, 259,
        264, 270, 279, 289, 294, 295, 301, 316, 326, 334, 354, 358, 369,
        379, 384, 394, 403, 413, 415, 419, 422, 426, 433, 446, 447, 450,
        451]),)

In [9]:
### looks like working

print(df_data.iloc[100:150, -1])
print(df_labels.iloc[100:150, -1])
# df_data.head()

369     normal
385     normal
403    anomaly
250     normal
389     normal
184     normal
231    anomaly
81      normal
35      normal
227     normal
230     normal
373     normal
355     normal
161     normal
245     normal
126     normal
170     normal
436     normal
346     normal
380     normal
431     normal
399     normal
116     normal
74      normal
34     anomaly
213     normal
408     normal
313     normal
83     anomaly
283     normal
345     normal
432     normal
243    anomaly
138     normal
384     normal
53      normal
361    anomaly
303    anomaly
331     normal
109     normal
273     normal
390     normal
137     normal
86      normal
406     normal
379     normal
144     normal
330     normal
58      normal
182     normal
Name: label, dtype: object
369     1
385     1
403     5
250    10
389     1
184     2
231     4
81      1
35      1
227    16
230     1
373     1
355     1
161     1
245     2
126     1
170     1
436     1
346    10
380    16
431     6
399     1
116

In [10]:
### for each object column in data

# Object (string) columns are one-hot encoded later; every other column is normalised.
cols_toOneHot = df_data.select_dtypes(include='object').columns.tolist()

cols_all = df_data.columns.tolist()
# Keep the frame's own column order. A set difference would return the names in an
# arbitrary order that can change between kernel sessions, making the printed list
# (and any order-dependent processing) non-reproducible.
cols_toNormalize = [col for col in cols_all if col not in cols_toOneHot]

In [11]:
print("columns to normalize:\n%s\n" % cols_toNormalize)
print("columns to one-hot:\n%s\n" % cols_toOneHot)

columns to normalize:
['dy', 'el', 'ij', 'bk', 'bl', 'am', 'jr', 'fd', 'bg', 'iz', 'eh', 'fu', 'fz', 'fh', 'hv', 'fe', 'id', 'gb', 'aw', 'ec', 'dx', 'ie', 'az', 'ed', 'bd', 'hg', 'dv', 'cf', 'cq', 'ek', 'cj', 'gc', 'bb', 'by', 'gx', 'fm', 'gj', 'ga', 'hz', 'fo', 'fw', 'ff', 'fv', 'i', 'hq', 'd', 'da', 'eu', 'ag', 'is', 'fl', 'eo', 'bj', 'ey', 'dl', 'as', 'df', 'hu', 'db', 'jg', 'cl', 'dq', 'bu', 'cn', 'ac', 'hh', 'ih', 'io', 'ip', 'w', 'dg', 'eg', 'ck', 'ci', 'eq', 'fp', 'ik', 'iy', 'aa', 'hn', 'jf', 'ha', 'ba', 'hj', 'm', 'gp', 'cr', 'ar', 'gz', 'iv', 'il', 'dp', 'gf', 'aj', 'em', 'gn', 'hx', 'he', 'er', 'r', 'bq', 'en', 'ew', 'au', 'af', 'ab', 'es', 'fi', 'fs', 'ji', 'gt', 'gu', 'ev', 'hw', 'hf', 'f', 'cy', 'dd', 'ct', 'hd', 'dc', 'go', 'ig', 'du', 'cd', 'dt', 'jn', 'l', 'ib', 'cg', 'k', 'et', 'p', 'gk', 'dm', 'jq', 'jp', 'o', 'bz', 'ej', 'ak', 'al', 'it', 'dr', 'cw', 'cp', 's', 'h', 'gr', 'ii', 'cs', 'a', 'u', 'bm', 'cu', 'ad', 'hl', 'gq', 'jc', 'ic', 'aq', 'ix', 'if', 'hb', 'ae', '

In [12]:
len(cols_toNormalize)

279

### Dataset preprocessing in numerical columns

In [13]:
print(df_data.aa.dtype)

float64


In [14]:
def checkStats(df_data):
    """Print the minimum, maximum and standard deviation of every numerical column.

    Non-numerical columns (for example a string label column) are skipped, but
    they still occupy a position, so the printed number is always the column's
    position in ``df_data.columns``.  The standard deviation is computed on the
    underlying NumPy array and is therefore the population value (``ddof=0``).

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    None
        One line per numerical column is written to standard output.
    """
    for position, col in enumerate(df_data.columns.tolist()):
        # Decide from the column's own dtype instead of a notebook-level list,
        # so the function works on any frame without hidden global state.
        if not pd.api.types.is_numeric_dtype(df_data[col]):
            continue

        values = df_data[col].values
        print("column:%d min:%f max:%f std:%f" % (position, values.min(), values.max(), values.std()))

In [15]:
## per-column statistics before normalization
checkStats(df_data)

column:0 min:0.000000 max:83.000000 std:16.448406
column:1 min:0.000000 max:1.000000 std:0.497404
column:2 min:105.000000 max:780.000000 std:37.129200
column:3 min:6.000000 max:176.000000 std:16.572440
column:4 min:55.000000 max:188.000000 std:15.347388
column:5 min:0.000000 max:524.000000 std:44.792651
column:6 min:232.000000 max:509.000000 std:33.348470
column:7 min:108.000000 max:381.000000 std:35.593633
column:8 min:0.000000 max:205.000000 std:25.798058
column:9 min:-172.000000 max:169.000000 std:45.381150
column:10 min:-177.000000 max:179.000000 std:57.477332
column:11 min:-170.000000 max:176.000000 std:30.465943
column:12 min:-135.000000 max:166.000000 std:35.982320
column:13 min:-179.000000 max:178.000000 std:52.071135
column:14 min:0.000000 max:163.000000 std:14.275317
column:15 min:0.000000 max:88.000000 std:10.638214
column:16 min:0.000000 max:156.000000 std:18.229702
column:17 min:0.000000 max:88.000000 std:20.518993
column:18 min:0.000000 max:24.000000 std:1.567746
column:1

column:167 min:-33.300000 max:155.200000 std:13.178315
column:168 min:-38.800000 max:74.300000 std:13.340714
column:169 min:-3.900000 max:1.900000 std:0.523002
column:170 min:-3.400000 max:0.000000 std:0.555600
column:171 min:0.000000 max:19.200000 std:3.486454
column:172 min:-16.500000 max:0.000000 std:1.809051
column:173 min:0.000000 max:3.200000 std:0.176047
column:174 min:-1.500000 max:0.000000 std:0.090156
column:175 min:-1.500000 max:3.400000 std:0.552718
column:176 min:-4.400000 max:7.200000 std:1.340608
column:177 min:-43.000000 max:64.600000 std:14.288796
column:178 min:-38.500000 max:87.100000 std:18.448700
column:179 min:-1.700000 max:5.100000 std:0.546669
column:180 min:-16.500000 max:0.000000 std:1.919355
column:181 min:0.000000 max:21.700000 std:3.445313
column:182 min:-16.300000 max:0.000000 std:2.849865
column:183 min:0.000000 max:14.900000 std:0.877878
column:184 min:-1.700000 max:0.000000 std:0.130013
column:185 min:-2.800000 max:2.200000 std:0.546235
column:186 min:-

In [16]:
### normalization

### min substr and max-min div

def normalise(df_data, cols_toNormalize):
    """Min-max scale the given columns of ``df_data`` in place.

    Each listed column is rewritten as ``(value - minimum) / (maximum - minimum)``.
    A column whose maximum equals its minimum is left untouched, which avoids a
    division by zero.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame to scale; it is modified directly.
    cols_toNormalize : iterable of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df_data`` object that was passed in.
    """
    for name in cols_toNormalize:
        column = df_data[name]
        lowest = column.min()
        highest = column.max()
        span = highest - lowest

        # Constant columns have a zero span and are skipped unchanged.
        if span != 0:
            df_data[name] = (column - lowest) / span

    return df_data



"""
### mean substr and std dev div
for col in cols_toNormalize:
    mean = df_data[col].mean()
    std = df_data[col].std()
    
    if mean == 0 or std == 0:     ### columns 'num_outbound_cmds', 'is_host_login' has zero mean and std dev!!!
        continue
    
    df_data[col] = (df_data[col] - mean) / std
"""

"\n### mean substr and std dev div\nfor col in cols_toNormalize:\n    mean = df_data[col].mean()\n    std = df_data[col].std()\n    \n    if mean == 0 or std == 0:     ### columns 'num_outbound_cmds', 'is_host_login' has zero mean and std dev!!!\n        continue\n    \n    df_data[col] = (df_data[col] - mean) / std\n"

In [17]:
# Min-max normalise every numerical column, then print the per-column statistics again
# so the result can be compared with the values printed before normalisation.
df_data=normalise(df_data, cols_toNormalize)
checkStats(df_data)


column:0 min:0.000000 max:1.000000 std:0.198174
column:1 min:0.000000 max:1.000000 std:0.497404
column:2 min:0.000000 max:1.000000 std:0.055006
column:3 min:0.000000 max:1.000000 std:0.097485
column:4 min:0.000000 max:1.000000 std:0.115394
column:5 min:0.000000 max:1.000000 std:0.085482
column:6 min:0.000000 max:1.000000 std:0.120392
column:7 min:0.000000 max:1.000000 std:0.130380
column:8 min:0.000000 max:1.000000 std:0.125844
column:9 min:0.000000 max:1.000000 std:0.133083
column:10 min:0.000000 max:1.000000 std:0.161453
column:11 min:0.000000 max:1.000000 std:0.088052
column:12 min:0.000000 max:1.000000 std:0.119543
column:13 min:0.000000 max:1.000000 std:0.145858
column:14 min:0.000000 max:1.000000 std:0.087579
column:15 min:0.000000 max:1.000000 std:0.120889
column:16 min:0.000000 max:1.000000 std:0.116857
column:17 min:0.000000 max:1.000000 std:0.233170
column:18 min:0.000000 max:1.000000 std:0.065323
column:19 min:0.000000 max:0.000000 std:0.000000
column:20 min:0.000000 max:1.0

### One hot encoding of required columns

In [18]:
### obtain one hot encoding
# Request uint8 indicator columns explicitly: newer pandas versions default to bool, and a
# frame mixing float and bool columns turns `df_data.values` into an object array.
df_data = pd.get_dummies(df_data, columns=cols_toOneHot, dtype=np.uint8)

- Check the resulting dataset

In [19]:
df_data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,jl,jm,jn,jo,jp,jq,jr,js,label_anomaly,label_normal
8,0.53012,0.0,0.093333,0.294118,0.218045,0.225191,0.440433,0.190476,0.307317,0.683284,...,0.29661,0.954545,0.0,0.0,0.4375,0.675,0.426316,0.449838,0,1
67,0.313253,1.0,0.081481,0.347059,0.120301,0.28626,0.425993,0.208791,0.395122,0.697947,...,0.512712,1.0,0.0,0.0,0.15625,0.708333,0.530827,0.530744,0,1
338,0.46988,1.0,0.081481,0.376471,0.240602,0.305344,0.451264,0.25641,0.434146,0.697947,...,0.355932,0.951049,0.0,0.0,0.375,0.716667,0.464662,0.535275,0,1
309,0.481928,0.0,0.105185,0.4,0.278195,0.412214,0.469314,0.194139,0.419512,0.747801,...,0.228814,1.0,0.0,0.0,0.25,0.616667,0.437594,0.416181,1,0
5,0.156627,0.0,0.094815,0.264706,0.338346,0.318702,0.3213,0.241758,0.443902,0.818182,...,0.516949,0.902098,0.0,0.0,0.53125,0.683333,0.433835,0.451133,1,0


In [20]:
### shape is correct

df_data.shape

(452, 281)

In [21]:
df_data.describe()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,jl,jm,jn,jo,jp,jq,jr,js,label_anomaly,label_normal
count,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,...,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0
mean,0.559894,0.550885,0.090649,0.365708,0.25504,0.296093,0.488115,0.22692,0.439046,0.603158,...,0.38339,0.949045,0.004978,0.0,0.410882,0.601862,0.47764,0.440603,0.146018,0.853982
std,0.198393,0.497955,0.055067,0.097593,0.115522,0.085577,0.120525,0.130524,0.125984,0.13323,...,0.147155,0.070015,0.062648,0.0,0.108603,0.118838,0.101533,0.119702,0.353515,0.353515
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.433735,0.0,0.081481,0.311765,0.18797,0.270992,0.425993,0.14652,0.385366,0.515396,...,0.279661,0.926573,0.0,0.0,0.375,0.541667,0.418421,0.36343,0.0,1.0
50%,0.566265,1.0,0.087407,0.364706,0.233083,0.299618,0.487365,0.197802,0.443902,0.621701,...,0.372881,0.961538,0.0,0.0,0.40625,0.6125,0.468421,0.430421,0.0,1.0
75%,0.698795,1.0,0.096296,0.429412,0.293233,0.333969,0.548736,0.260073,0.497561,0.697947,...,0.474576,1.0,0.0,0.0,0.46875,0.675,0.526504,0.516019,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
### check again

checkStats(df_data)

column:0 min:0.000000 max:1.000000 std:0.198174
column:1 min:0.000000 max:1.000000 std:0.497404
column:2 min:0.000000 max:1.000000 std:0.055006
column:3 min:0.000000 max:1.000000 std:0.097485
column:4 min:0.000000 max:1.000000 std:0.115394
column:5 min:0.000000 max:1.000000 std:0.085482
column:6 min:0.000000 max:1.000000 std:0.120392
column:7 min:0.000000 max:1.000000 std:0.130380
column:8 min:0.000000 max:1.000000 std:0.125844
column:9 min:0.000000 max:1.000000 std:0.133083
column:10 min:0.000000 max:1.000000 std:0.161453
column:11 min:0.000000 max:1.000000 std:0.088052
column:12 min:0.000000 max:1.000000 std:0.119543
column:13 min:0.000000 max:1.000000 std:0.145858
column:14 min:0.000000 max:1.000000 std:0.087579
column:15 min:0.000000 max:1.000000 std:0.120889
column:16 min:0.000000 max:1.000000 std:0.116857
column:17 min:0.000000 max:1.000000 std:0.233170
column:18 min:0.000000 max:1.000000 std:0.065323
column:19 min:0.000000 max:0.000000 std:0.000000
column:20 min:0.000000 max:1.0

In [23]:
## check integrity of the data again

print(df_data.isnull().values.any())

print(df_data.isna().values.any())

False
False


In [24]:
### check again
print(df_data.iloc[140:150, -1])

print(df_data.iloc[140:150, -2])

print(df_labels.iloc[140:150, -1])

273    1
390    1
137    1
86     1
406    1
379    1
144    1
330    1
58     1
182    1
Name: label_normal, dtype: uint8
273    0
390    0
137    0
86     0
406    0
379    0
144    0
330    0
58     0
182    0
Name: label_anomaly, dtype: uint8
273     1
390     1
137    10
86      2
406     1
379    10
144    10
330     1
58      6
182     1
Name: labels, dtype: object


### Save dataset

In [25]:
# Cast explicitly to float64 so the saved matrix has the same numeric dtype regardless of the
# dtype pandas picks for the one-hot label columns (an object array would need pickling).
dataset = df_data.values.astype(np.float64)

In [26]:
dataset.shape

(452, 281)

In [27]:
# Persist the full preprocessed matrix (features followed by the two one-hot label columns)
# under the key "dataset".
np.savez_compressed("../../datasets/arythmia/arythmia.npz", dataset=dataset)

### Produce train and test splits

In [28]:
# Seed forwarded to train_test_split and embedded in the train/test file names below.
# None means the split is not reproducible between runs.
random_state = None

In [29]:
from sklearn.model_selection import train_test_split

# Split the rows 50/50; both halves still contain normal and anomalous rows at this point.
x_train, x_test = train_test_split(dataset, test_size=0.5, random_state=random_state)

In [30]:
x_train.shape

(226, 281)

In [31]:
x_test.shape

(226, 281)

-  Split the training half by label: rows with `label_normal == 1` are the normal class, rows with `label_normal == 0` are the anomalies

In [32]:
# The final column is the one-hot `label_normal` flag: 1 marks a normal row, 0 an anomalous one.
label_normal_flag = x_train[:, -1]
x_train_normal = x_train[label_normal_flag == 1]
x_train_anomaly = x_train[label_normal_flag == 0]

In [33]:
x_train_normal.shape

(189, 281)

In [34]:
x_train_anomaly.shape   

(37, 281)

In [35]:
x_train = x_train_normal

- x_train now consists only of "normal" rows

In [36]:
x_train.shape

(189, 281)

In [37]:
x_test.shape

(226, 281)

- Assign arbitrary output as labels

In [38]:
# Placeholder targets: every training row gets the same two-value vector [1, 0].
y_train = np.tile([1.0, 0.0], (len(x_train), 1))
y_train.shape

(189, 2)

In [39]:
# Both archive names embed the split seed so that files from different splits do not collide.
seed_tag = str(random_state)
train_name = "../../datasets/arythmia/arythmia_train-randomState_" + seed_tag + ".npz"
test_name = "../../datasets/arythmia/arythmia_test-randomState_" + seed_tag + ".npz"

# The training archive holds only normal rows plus placeholder targets; the test archive
# holds the untouched test half.
np.savez_compressed(train_name, x_train=x_train, y_train=y_train)
np.savez_compressed(test_name, x_test=x_test)