### Dataset creation

In [251]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

- load data and labels (normal and different attack types)

In [252]:
df_data = pd.read_csv("../datasets/arrhythmia/arrhythmia.csv", na_values='?').fillna(0)
df_data = shuffle(df_data)
df_labels = pd.DataFrame(df_data.iloc[:,-1]).astype(str)   
df_data = pd.DataFrame(df_data.iloc[:, :-1]).astype(float) 

- read column names and types

In [253]:
# col_names=[]
# col_datatypes=[]

# with open("datasets/kddcup/kddcup.names") as file:
#     next(file)    ### skip first line
#     for line in file:
#         name, datatype = line.split(": ")
#         col_names.append(name)
#         col_datatypes.append(datatype.replace(".\n",""))
        
# df_data.columns = col_names
df_labels.columns = ["labels"]

In [255]:
df_data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,jj,jk,jl,jm,jn,jo,jp,jq,jr,js
389,70.0,0.0,178.0,80.0,93.0,118.0,258.0,155.0,82.0,-31.0,...,-1.5,0.0,15.7,-2.1,0.0,0.0,0.5,-1.3,32.6,22.5
441,37.0,1.0,160.0,50.0,74.0,143.0,374.0,146.0,75.0,68.0,...,0.0,0.0,11.4,-0.9,0.0,0.0,0.7,1.8,40.1,55.5
353,26.0,0.0,170.0,43.0,92.0,0.0,363.0,151.0,204.0,-32.0,...,-0.4,0.0,8.1,0.0,0.0,0.0,0.3,1.0,21.0,28.4
251,58.0,0.0,175.0,78.0,95.0,145.0,376.0,202.0,92.0,-5.0,...,-0.2,-2.4,9.5,0.0,0.0,0.0,0.7,0.2,17.6,18.8
171,50.0,0.0,168.0,80.0,95.0,159.0,358.0,166.0,96.0,-50.0,...,-0.2,0.0,8.1,-6.5,0.0,0.0,0.6,0.8,-5.9,0.1


- check integrity of the data

In [256]:
print(df_data.isnull().values.any())

print(df_data.isna().values.any())

False
False


### Enumerate labels 

- ### normal samples are named as "anomaly" since they are minority (stated in paper)

In [257]:
# Considering n labelled data as normal and others as anomaly in thyroid dataset. assigning labels accordingly.
# Added by Jyotirmay.
# 1 = normal, 0 = anomaly
df_data["label"] = np.where(df_labels["labels"].str.contains("3|4|5|7|8|9|14|15", regex=True), 'anomaly', 'normal')
print(df_data['label'])

389     normal
441     normal
353     normal
251    anomaly
171     normal
13      normal
1       normal
137     normal
90      normal
70      normal
379     normal
43      normal
289     normal
384     normal
24      normal
360     normal
316    anomaly
112     normal
403    anomaly
286     normal
238     normal
106     normal
429     normal
0      anomaly
65      normal
235     normal
348    anomaly
196     normal
305     normal
241     normal
        ...   
64      normal
76     anomaly
17      normal
150     normal
72      normal
44      normal
211     normal
117     normal
435     normal
266     normal
287     normal
300    anomaly
121     normal
86      normal
73      normal
61     anomaly
123     normal
140     normal
82      normal
428     normal
432     normal
253    anomaly
387    anomaly
423     normal
313     normal
381    anomaly
144     normal
160     normal
351     normal
220     normal
Name: label, Length: 452, dtype: object


In [258]:
### number of normal and abnormal
print("number of normal:", len(np.where(df_data["label"] == 'normal')[0]))
print("number of anomaly:", len(np.where(df_data["label"] == 'anomaly')[0]))

number of normal: 386
number of anomaly: 66


In [259]:
## check in which location data is abnormal

np.where(df_data["label"] == 'anomaly')

(array([  3,  16,  18,  23,  26,  42,  64,  67,  73,  90,  91,  94,  95,
        101, 103, 110, 113, 114, 136, 137, 141, 146, 155, 157, 163, 165,
        166, 179, 191, 192, 194, 205, 209, 212, 213, 221, 236, 250, 261,
        264, 266, 275, 277, 290, 308, 314, 324, 329, 349, 350, 355, 360,
        387, 389, 394, 404, 412, 413, 416, 420, 423, 433, 437, 443, 444,
        447]),)

In [261]:
### looks like working

print(df_data.iloc[100:150, -1])
print(df_labels.iloc[100:150, -1])
# df_data.head()

341     normal
189    anomaly
215     normal
356    anomaly
233     normal
165     normal
56      normal
306     normal
295     normal
155     normal
5      anomaly
273     normal
438     normal
45     anomaly
100    anomaly
162     normal
301     normal
79      normal
29      normal
261     normal
270     normal
242     normal
80      normal
157     normal
283     normal
380     normal
111     normal
448     normal
304     normal
344     normal
302     normal
131     normal
74      normal
193     normal
212     normal
161     normal
169    anomaly
26     anomaly
98      normal
349     normal
442     normal
410    anomaly
447     normal
129     normal
147     normal
115     normal
204    anomaly
182     normal
81      normal
219     normal
Name: label, dtype: object
341    10
189     9
215     1
356     3
233     1
165     1
56      1
306    16
295     2
155     1
5      14
273     1
438     1
45      4
100     4
162     1
301     1
79      1
29      2
261    16
270     1
242     1
80 

In [265]:
### for each object column in data

cols_toOneHot = df_data.select_dtypes(include='object').columns.tolist()

cols_all = df_data.columns.tolist()
cols_toNormalize = list(set(cols_all) - set(cols_toOneHot))

In [266]:
print("columns to normalize:\n%s\n" % cols_toNormalize)
print("columns to one-hot:\n%s\n" % cols_toOneHot)

columns to normalize:
['aw', 't', 'fa', 'ec', 'gi', 'hr', 'gg', 'dc', 'er', 'cx', 'io', 'jm', 'cy', 'dx', 'ef', 'bl', 'fp', 'gb', 'ds', 'ik', 'ch', 'eu', 'cu', 'ay', 'jp', 'df', 'bo', 'bs', 'al', 'bj', 'bh', 'ej', 'ar', 'gm', 'jk', 'w', 'ai', 'ir', 'at', 'ew', 'da', 'hy', 'x', 'hz', 'fx', 'ha', 'ek', 'fl', 'ga', 'dz', 'hf', 'gr', 'em', 'u', 'dg', 'jl', 'ix', 'gd', 'gk', 'cv', 'cb', 'is', 'by', 'ci', 'di', 'hp', 'ih', 'ea', 'ex', 'fz', 'fr', 'dl', 'hi', 'au', 'ey', 'hq', 'ax', 'as', 'bu', 'de', 'q', 'il', 'o', 'cg', 'dn', 'an', 'js', 'in', 'iu', 'hh', 'eo', 'id', 'p', 'jq', 'ak', 'af', 'r', 'dh', 'gy', 'j', 'hn', 'jb', 'bp', 'bq', 'ib', 'du', 'e', 'ac', 'ba', 'fe', 'gv', 'es', 'gx', 'cc', 'dq', 'ja', 'ez', 'if', 'gl', 'jd', 'eg', 'cp', 'ep', 'cj', 'iv', 'iy', 'bk', 'ff', 'ag', 'dm', 'gn', 'ic', 'jg', 'gj', 'ig', 'jn', 'hk', 'i', 'fg', 'co', 'ev', 'fn', 'dr', 'cs', 'bm', 'gt', 'gp', 'am', 'fs', 'fq', 'gs', 'l', 'fm', 'gc', 'bz', 'hg', 'dt', 'cl', 'ca', 'ee', 'bn', 'ae', 'iq', 'he', 'gw',

In [267]:
len(cols_toNormalize)

279

### Dataset preprocessing in numerical columns

In [269]:
print(df_data.aa.dtype)

float64


In [270]:
def checkStats(df_data):
    i=0
    for col in df_data.columns.tolist():
            
        if col in cols_toOneHot:    ### skip if column is not numerical  ### TODO: check column type??
            i += 1
            continue

        min_ = df_data[col].values.min()
        max_ = df_data[col].values.max()
        std_ = df_data[col].values.std()

        print("column:%d min:%f max:%f std:%f" % (i, min_, max_, std_))
        i += 1

In [271]:
## before normalization
df_data = df_data
checkStats(df_data)

column:0 min:0.000000 max:83.000000 std:16.448406
column:1 min:0.000000 max:1.000000 std:0.497404
column:2 min:105.000000 max:780.000000 std:37.129200
column:3 min:6.000000 max:176.000000 std:16.572440
column:4 min:55.000000 max:188.000000 std:15.347388
column:5 min:0.000000 max:524.000000 std:44.792651
column:6 min:232.000000 max:509.000000 std:33.348470
column:7 min:108.000000 max:381.000000 std:35.593633
column:8 min:0.000000 max:205.000000 std:25.798058
column:9 min:-172.000000 max:169.000000 std:45.381150
column:10 min:-177.000000 max:179.000000 std:57.477332
column:11 min:-170.000000 max:176.000000 std:30.465943
column:12 min:-135.000000 max:166.000000 std:35.982320
column:13 min:-179.000000 max:178.000000 std:52.071135
column:14 min:0.000000 max:163.000000 std:14.275317
column:15 min:0.000000 max:88.000000 std:10.638214
column:16 min:0.000000 max:156.000000 std:18.229702
column:17 min:0.000000 max:88.000000 std:20.518993
column:18 min:0.000000 max:24.000000 std:1.567746
column:1

In [274]:
### normalization

### min substr and max-min div

def normalise(df_data, cols_toNormalize):
    for col in cols_toNormalize:
            
        min = df_data[col].min()
        max = df_data[col].max()

        if max - min == 0:
            continue

        df_data[col] = (df_data[col] - min) / (max - min)
    
    return df_data



"""
### mean substr and std dev div
for col in cols_toNormalize:
    mean = df_data[col].mean()
    std = df_data[col].std()
    
    if mean == 0 or std == 0:     ### columns 'num_outbound_cmds', 'is_host_login' has zero mean and std dev!!!
        continue
    
    df_data[col] = (df_data[col] - mean) / std
"""

"\n### mean substr and std dev div\nfor col in cols_toNormalize:\n    mean = df_data[col].mean()\n    std = df_data[col].std()\n    \n    if mean == 0 or std == 0:     ### columns 'num_outbound_cmds', 'is_host_login' has zero mean and std dev!!!\n        continue\n    \n    df_data[col] = (df_data[col] - mean) / std\n"

In [276]:
# ### Skipping normalisation for thyroid dataset. Added by Jyotirmay
df_data=normalise(df_data, cols_toNormalize)
checkStats(df_data)


column:0 min:0.000000 max:1.000000 std:0.198174
column:1 min:0.000000 max:1.000000 std:0.497404
column:2 min:0.000000 max:1.000000 std:0.055006
column:3 min:0.000000 max:1.000000 std:0.097485
column:4 min:0.000000 max:1.000000 std:0.115394
column:5 min:0.000000 max:1.000000 std:0.085482
column:6 min:0.000000 max:1.000000 std:0.120392
column:7 min:0.000000 max:1.000000 std:0.130380
column:8 min:0.000000 max:1.000000 std:0.125844
column:9 min:0.000000 max:1.000000 std:0.133083
column:10 min:0.000000 max:1.000000 std:0.161453
column:11 min:0.000000 max:1.000000 std:0.088052
column:12 min:0.000000 max:1.000000 std:0.119543
column:13 min:0.000000 max:1.000000 std:0.145858
column:14 min:0.000000 max:1.000000 std:0.087579
column:15 min:0.000000 max:1.000000 std:0.120889
column:16 min:0.000000 max:1.000000 std:0.116857
column:17 min:0.000000 max:1.000000 std:0.233170
column:18 min:0.000000 max:1.000000 std:0.065323
column:19 min:0.000000 max:0.000000 std:0.000000
column:20 min:0.000000 max:1.0

### One hot encoding of required columns

In [277]:
### obtain one hot encoding

df_data = pd.get_dummies(df_data, columns=cols_toOneHot)

- Check the resulting dataset

In [278]:
df_data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,jl,jm,jn,jo,jp,jq,jr,js,label_anomaly,label_normal
389,0.843373,0.0,0.108148,0.435294,0.285714,0.225191,0.093863,0.172161,0.4,0.41349,...,0.665254,0.926573,0.0,0.0,0.40625,0.391667,0.577444,0.395469,0,1
441,0.445783,1.0,0.081481,0.258824,0.142857,0.272901,0.512635,0.139194,0.365854,0.703812,...,0.483051,0.968531,0.0,0.0,0.46875,0.65,0.633835,0.609061,0,1
353,0.313253,0.0,0.096296,0.217647,0.278195,0.0,0.472924,0.157509,0.995122,0.410557,...,0.34322,1.0,0.0,0.0,0.34375,0.583333,0.490226,0.433657,0,1
251,0.698795,0.0,0.103704,0.423529,0.300752,0.276718,0.519856,0.344322,0.44878,0.489736,...,0.402542,1.0,0.0,0.0,0.46875,0.516667,0.464662,0.371521,1,0
171,0.60241,0.0,0.093333,0.435294,0.300752,0.303435,0.454874,0.212454,0.468293,0.357771,...,0.34322,0.772727,0.0,0.0,0.4375,0.566667,0.28797,0.250485,0,1


In [279]:
### shape is correct

df_data.shape

(452, 281)

In [280]:
df_data.describe()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,jl,jm,jn,jo,jp,jq,jr,js,label_anomaly,label_normal
count,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,...,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0,452.0
mean,0.559894,0.550885,0.090649,0.365708,0.25504,0.296093,0.488115,0.22692,0.439046,0.603158,...,0.38339,0.949045,0.004978,0.0,0.410882,0.601862,0.47764,0.440603,0.146018,0.853982
std,0.198393,0.497955,0.055067,0.097593,0.115522,0.085577,0.120525,0.130524,0.125984,0.13323,...,0.147155,0.070015,0.062648,0.0,0.108603,0.118838,0.101533,0.119702,0.353515,0.353515
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.433735,0.0,0.081481,0.311765,0.18797,0.270992,0.425993,0.14652,0.385366,0.515396,...,0.279661,0.926573,0.0,0.0,0.375,0.541667,0.418421,0.36343,0.0,1.0
50%,0.566265,1.0,0.087407,0.364706,0.233083,0.299618,0.487365,0.197802,0.443902,0.621701,...,0.372881,0.961538,0.0,0.0,0.40625,0.6125,0.468421,0.430421,0.0,1.0
75%,0.698795,1.0,0.096296,0.429412,0.293233,0.333969,0.548736,0.260073,0.497561,0.697947,...,0.474576,1.0,0.0,0.0,0.46875,0.675,0.526504,0.516019,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


In [281]:
### check again

checkStats(df_data)

column:0 min:0.000000 max:1.000000 std:0.198174
column:1 min:0.000000 max:1.000000 std:0.497404
column:2 min:0.000000 max:1.000000 std:0.055006
column:3 min:0.000000 max:1.000000 std:0.097485
column:4 min:0.000000 max:1.000000 std:0.115394
column:5 min:0.000000 max:1.000000 std:0.085482
column:6 min:0.000000 max:1.000000 std:0.120392
column:7 min:0.000000 max:1.000000 std:0.130380
column:8 min:0.000000 max:1.000000 std:0.125844
column:9 min:0.000000 max:1.000000 std:0.133083
column:10 min:0.000000 max:1.000000 std:0.161453
column:11 min:0.000000 max:1.000000 std:0.088052
column:12 min:0.000000 max:1.000000 std:0.119543
column:13 min:0.000000 max:1.000000 std:0.145858
column:14 min:0.000000 max:1.000000 std:0.087579
column:15 min:0.000000 max:1.000000 std:0.120889
column:16 min:0.000000 max:1.000000 std:0.116857
column:17 min:0.000000 max:1.000000 std:0.233170
column:18 min:0.000000 max:1.000000 std:0.065323
column:19 min:0.000000 max:0.000000 std:0.000000
column:20 min:0.000000 max:1.0

In [282]:
## check integrity of the data again

print(df_data.isnull().values.any())

print(df_data.isna().values.any())

False
False


In [284]:
### check again
print(df_data.iloc[140:150, -1])

print(df_data.iloc[140:150, -2])

print(df_labels.iloc[140:150, -1])

442    1
410    0
447    1
129    1
147    1
115    1
204    0
182    1
81     1
219    1
Name: label_normal, dtype: uint8
442    0
410    1
447    0
129    0
147    0
115    0
204    1
182    0
81     0
219    0
Name: label_anomaly, dtype: uint8
442     1
410     4
447     1
129     1
147     1
115     1
204     3
182     1
81      1
219    16
Name: labels, dtype: object


### Save dataset

In [285]:
dataset = df_data.values

In [286]:
dataset.shape

(452, 281)

In [287]:
np.savez_compressed("arythmia.npz", dataset=dataset)

### Produce train and test splits

In [288]:
random_state = None

In [289]:
from sklearn.model_selection import train_test_split

x_train, x_test = train_test_split(dataset, test_size=0.5, random_state=random_state)

In [290]:
x_train.shape

(226, 281)

In [291]:
x_test.shape

(226, 281)

-  Normal class is anomaly (attack) in this case, because non-attack rows are minority

In [292]:
x_train_normal = x_train[np.where(x_train[:,-1] == 1)]   ##last column is the normal column
x_train_anomaly = x_train[np.where(x_train[:,-1] == 0)]   

In [293]:
x_train_normal.shape

(194, 281)

In [294]:
x_train_anomaly.shape   

(32, 281)

In [295]:
x_train = x_train_normal

- x_train consist of "normal" values

In [296]:
x_train.shape

(194, 281)

In [297]:
x_test.shape

(226, 281)

- Assign arbitrary output as labels

In [298]:
y_train = np.zeros((len(x_train),2))
y_train[:,0] = 1
y_train.shape

(194, 2)

In [299]:
train_name = "arythmia_train-randomState_"+str(random_state)+".npz"
test_name = "arythmia_test-randomState_"+str(random_state)+".npz"

np.savez_compressed(train_name, x_train=x_train, y_train=y_train)
np.savez_compressed(test_name, x_test=x_test)