### Dataset creation

In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

- load data and labels (normal and different attack types)

In [2]:
df_data = pd.read_csv("../../datasets/thyroid/thyroid.csv")
df_data = shuffle(df_data)
df_labels = pd.DataFrame(df_data.iloc[:,-1])   
df_data = pd.DataFrame(df_data.iloc[:, :-1]).astype(float) 

- read column names and types

In [3]:
# col_names=[]
# col_datatypes=[]

# with open("datasets/kddcup/kddcup.names") as file:
#     next(file)    ### skip first line
#     for line in file:
#         name, datatype = line.split(": ")
#         col_names.append(name)
#         col_datatypes.append(datatype.replace(".\n",""))
        
# df_data.columns = col_names
df_labels.columns = ["labels"]

In [4]:
df_data.head()

Unnamed: 0,age,a,b,c,d,e,f,g,h,i,...,k,l,m,n,o,TSH,T3,TT4,T4U,FTI
2776,0.66,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0026,0.0201,0.085,0.07,0.121
307,0.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,6e-05,0.022,0.137,0.102,0.135
4801,0.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.002,0.017,0.144,0.094,0.153
1894,0.41,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0016,0.031,0.061,0.093,0.066
6722,0.29,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.015,0.0206,0.113,0.113,0.1


- check integrity of the data

In [5]:
print(df_data.isnull().values.any())

print(df_data.isna().values.any())

False
False


### Enumerate labels 

- ### normal samples are named as "anomaly" since they are minority (stated in paper)

In [6]:
# Considering n labelled data as normal and others as anomaly in thyroid dataset. assigning labels accordingly.
# Added by Jyotirmay.
# 1 = normal, 0 = anomaly
df_data["label"] = np.where(df_labels["labels"].str.contains("n"), 'normal', 'anomaly')
print(type(df_data.TSH[0]))

<class 'numpy.float64'>


In [7]:
### number of normal and abnormal
print("number of normal:", len(np.where(df_data["label"] == 'normal')[0]))
print("number of anomaly:", len(np.where(df_data["label"] == 'anomaly')[0]))

number of normal: 6603
number of anomaly: 250


In [8]:
## check in which location data is abnormal

np.where(df_data["label"] == 'normal')

(array([   0,    1,    2, ..., 6850, 6851, 6852]),)

In [9]:
### looks like working

print(df_data.iloc[740:750, -1])
print(df_labels.iloc[740:750, -1])
# df_data.head()

3376    normal
427     normal
2371    normal
4946    normal
6683    normal
5805    normal
1560    normal
3240    normal
430     normal
4696    normal
Name: label, dtype: object
3376    n
427     n
2371    n
4946    n
6683    n
5805    n
1560    n
3240    n
430     n
4696    n
Name: labels, dtype: object


In [10]:
### for each object column in data

cols_toOneHot = df_data.select_dtypes(include='object').columns.tolist()

cols_all = df_data.columns.tolist()
cols_toNormalize = list(set(cols_all) - set(cols_toOneHot))

In [11]:
print("columns to normalize:\n%s\n" % cols_toNormalize)
print("columns to one-hot:\n%s\n" % cols_toOneHot)

columns to normalize:
['n', 'e', 'j', 'g', 'TT4', 'f', 'TSH', 'd', 'm', 'T4U', 'c', 'i', 'age', 'a', 'FTI', 'T3', 'k', 'l', 'o', 'h', 'b']

columns to one-hot:
['label']



In [12]:
len(cols_toNormalize)

21

### Dataset preprocessing in numerical columns

In [13]:
print(df_data.TSH.dtype)

float64


In [14]:
def checkStats(df_data):
    i=0
    for col in df_data.columns.tolist():
            
        if col in cols_toOneHot:    ### skip if column is not numerical  ### TODO: check column type??
            i += 1
            continue

        min_ = df_data[col].values.min()
        max_ = df_data[col].values.max()
        std_ = df_data[col].values.std()

        print("column:%d min:%f max:%f std:%f" % (i, min_, max_, std_))
        i += 1

In [15]:
## before normalization
df_data = df_data
checkStats(df_data)

column:0 min:0.010000 max:0.970000 std:0.189447
column:1 min:0.000000 max:1.000000 std:0.461501
column:2 min:0.000000 max:1.000000 std:0.340448
column:3 min:0.000000 max:1.000000 std:0.123975
column:4 min:0.000000 max:1.000000 std:0.113844
column:5 min:0.000000 max:1.000000 std:0.192456
column:6 min:0.000000 max:1.000000 std:0.106077
column:7 min:0.000000 max:1.000000 std:0.118127
column:8 min:0.000000 max:1.000000 std:0.128450
column:9 min:0.000000 max:1.000000 std:0.242769
column:10 min:0.000000 max:1.000000 std:0.254232
column:11 min:0.000000 max:1.000000 std:0.113218
column:12 min:0.000000 max:1.000000 std:0.092386
column:13 min:0.000000 max:1.000000 std:0.157747
column:14 min:0.000000 max:1.000000 std:0.012079
column:15 min:0.000000 max:1.000000 std:0.217447
column:16 min:0.000010 max:0.500000 std:0.016379
column:17 min:0.000500 max:0.180000 std:0.007451
column:18 min:0.002500 max:0.600000 std:0.035017
column:19 min:0.017000 max:0.233000 std:0.019091
column:20 min:0.002400 max:0.6

In [16]:
### normalization

### min substr and max-min div

def normalise(df_data, cols_toNormalize):
    for col in cols_toNormalize:
        if len(col) == 1:
            print(col)
            continue
            
        min = df_data[col].min()
        max = df_data[col].max()

        if max - min == 0:
            continue

        df_data[col] = (df_data[col] - min) / (max - min)
    
    return df_data



"""
### mean substr and std dev div
for col in cols_toNormalize:
    mean = df_data[col].mean()
    std = df_data[col].std()
    
    if mean == 0 or std == 0:     ### columns 'num_outbound_cmds', 'is_host_login' has zero mean and std dev!!!
        continue
    
    df_data[col] = (df_data[col] - mean) / std
"""

"\n### mean substr and std dev div\nfor col in cols_toNormalize:\n    mean = df_data[col].mean()\n    std = df_data[col].std()\n    \n    if mean == 0 or std == 0:     ### columns 'num_outbound_cmds', 'is_host_login' has zero mean and std dev!!!\n        continue\n    \n    df_data[col] = (df_data[col] - mean) / std\n"

In [17]:
# ### Skipping normalisation for thyroid dataset. Added by Jyotirmay
df_data=normalise(df_data, cols_toNormalize)
checkStats(df_data)
print(df_data)

n
e
j
g
f
d
m
c
i
a
k
l
o
h
b
column:0 min:0.000000 max:1.000000 std:0.197341
column:1 min:0.000000 max:1.000000 std:0.461501
column:2 min:0.000000 max:1.000000 std:0.340448
column:3 min:0.000000 max:1.000000 std:0.123975
column:4 min:0.000000 max:1.000000 std:0.113844
column:5 min:0.000000 max:1.000000 std:0.192456
column:6 min:0.000000 max:1.000000 std:0.106077
column:7 min:0.000000 max:1.000000 std:0.118127
column:8 min:0.000000 max:1.000000 std:0.128450
column:9 min:0.000000 max:1.000000 std:0.242769
column:10 min:0.000000 max:1.000000 std:0.254232
column:11 min:0.000000 max:1.000000 std:0.113218
column:12 min:0.000000 max:1.000000 std:0.092386
column:13 min:0.000000 max:1.000000 std:0.157747
column:14 min:0.000000 max:1.000000 std:0.012079
column:15 min:0.000000 max:1.000000 std:0.217447
column:16 min:0.000000 max:1.000000 std:0.032758
column:17 min:0.000000 max:1.000000 std:0.041511
column:18 min:0.000000 max:1.000000 std:0.058607
column:19 min:0.000000 max:1.000000 std:0.088386


### One hot encoding of required columns

In [18]:
### obtain one hot encoding

df_data = pd.get_dummies(df_data, columns=cols_toOneHot)

- Check the resulting dataset

In [19]:
df_data.head()

Unnamed: 0,age,a,b,c,d,e,f,g,h,i,...,m,n,o,TSH,T3,TT4,T4U,FTI,label_anomaly,label_normal
2776,0.677083,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00518,0.109192,0.138075,0.24537,0.185428,0,1
307,0.78125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0001,0.119777,0.225105,0.393519,0.207317,0,1
4801,0.59375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00398,0.091922,0.23682,0.356481,0.23546,0,1
1894,0.416667,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00318,0.169916,0.097908,0.351852,0.099437,0,1
6722,0.291667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.029981,0.111978,0.184937,0.444444,0.152595,0,1


In [20]:
### shape is correct

df_data.shape

(6853, 23)

In [21]:
df_data.describe()

Unnamed: 0,age,a,b,c,d,e,f,g,h,i,...,m,n,o,TSH,T3,TT4,T4U,FTI,label_anomaly,label_normal
count,6853.0,6853.0,6853.0,6853.0,6853.0,6853.0,6853.0,6853.0,6853.0,6853.0,...,6853.0,6853.0,6853.0,6853.0,6853.0,6853.0,6853.0,6853.0,6853.0,6853.0
mean,0.531639,0.307603,0.13381,0.015614,0.013133,0.038523,0.011382,0.014154,0.016781,0.062892,...,0.025536,0.000146,0.049759,0.006882,0.109418,0.181021,0.373549,0.175391,0.03648,0.96352
std,0.197355,0.461535,0.340473,0.123984,0.113852,0.19247,0.106085,0.118136,0.128459,0.242787,...,0.157759,0.01208,0.217463,0.03276,0.041514,0.058611,0.088393,0.05468,0.187496,0.187496
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00138,0.091922,0.146444,0.324074,0.146341,0.0,1.0
50%,0.552083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00318,0.109192,0.174895,0.365741,0.171357,0.0,1.0
75%,0.6875,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00478,0.119777,0.206695,0.402778,0.194809,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
### check again

checkStats(df_data)

column:0 min:0.000000 max:1.000000 std:0.197341
column:1 min:0.000000 max:1.000000 std:0.461501
column:2 min:0.000000 max:1.000000 std:0.340448
column:3 min:0.000000 max:1.000000 std:0.123975
column:4 min:0.000000 max:1.000000 std:0.113844
column:5 min:0.000000 max:1.000000 std:0.192456
column:6 min:0.000000 max:1.000000 std:0.106077
column:7 min:0.000000 max:1.000000 std:0.118127
column:8 min:0.000000 max:1.000000 std:0.128450
column:9 min:0.000000 max:1.000000 std:0.242769
column:10 min:0.000000 max:1.000000 std:0.254232
column:11 min:0.000000 max:1.000000 std:0.113218
column:12 min:0.000000 max:1.000000 std:0.092386
column:13 min:0.000000 max:1.000000 std:0.157747
column:14 min:0.000000 max:1.000000 std:0.012079
column:15 min:0.000000 max:1.000000 std:0.217447
column:16 min:0.000000 max:1.000000 std:0.032758
column:17 min:0.000000 max:1.000000 std:0.041511
column:18 min:0.000000 max:1.000000 std:0.058607
column:19 min:0.000000 max:1.000000 std:0.088386
column:20 min:0.000000 max:1.0

In [23]:
## check integrity of the data again

print(df_data.isnull().values.any())

print(df_data.isna().values.any())

False
False


In [24]:
### check again
print(df_data.iloc[740:750, -1])

print(df_data.iloc[740:750, -2])

print(df_labels.iloc[740:750, -1])

3376    1
427     1
2371    1
4946    1
6683    1
5805    1
1560    1
3240    1
430     1
4696    1
Name: label_normal, dtype: uint8
3376    0
427     0
2371    0
4946    0
6683    0
5805    0
1560    0
3240    0
430     0
4696    0
Name: label_anomaly, dtype: uint8
3376    n
427     n
2371    n
4946    n
6683    n
5805    n
1560    n
3240    n
430     n
4696    n
Name: labels, dtype: object


### Save dataset

In [25]:
dataset = df_data.values

In [26]:
dataset.shape

(6853, 23)

In [27]:
np.savez_compressed("../../datasets/thyroid/thyroid.npz", dataset=dataset)

### Produce train and test splits

In [28]:
random_state = None

In [29]:
from sklearn.model_selection import train_test_split

x_train, x_test = train_test_split(dataset, test_size=0.5, random_state=random_state)

In [30]:
x_train.shape

(3426, 23)

In [31]:
x_test.shape

(3427, 23)

-  Normal class is anomaly (attack) in this case, because non-attack rows are minority

In [32]:
x_train_normal = x_train[np.where(x_train[:,-1] == 1)]   ##last column is the normal column
x_train_anomaly = x_train[np.where(x_train[:,-1] == 0)]   

In [33]:
x_train_normal.shape

(3281, 23)

In [34]:
x_train_anomaly.shape   

(145, 23)

In [35]:
x_train = x_train_normal

- x_train consist of "normal" values

In [36]:
x_train.shape

(3281, 23)

In [37]:
x_test.shape

(3427, 23)

- Assign arbitrary output as labels

In [38]:
y_train = np.zeros((len(x_train),2))
y_train[:,0] = 1
y_train.shape

(3281, 2)

In [39]:
train_name = "../../datasets/thyroid/thyroid_train-randomState_"+str(random_state)+".npz"
test_name = "../../datasets/thyroid/thyroid_test-randomState_"+str(random_state)+".npz"

np.savez_compressed(train_name, x_train=x_train, y_train=y_train)
np.savez_compressed(test_name, x_test=x_test)