# Fuzziness based semi-supervised learning approach for intrusion detection system

## Imports

In [3]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn import preprocessing
import numpy as np
from math import log

from utils import load_data

## Loading and Data Preprocessing

In [4]:
datasets = load_data('20 Percent Training Set.csv', 'KDDTest+.csv')

train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]

lb = preprocessing.LabelBinarizer()

train_set_y = lb.fit_transform(train_set_y)
valid_set_y = lb.transform(valid_set_y)
test_set_y = lb.transform(test_set_y)

print(train_set_x.shape)
print(valid_set_x.shape)

(2519, 41)
(22673, 41)


## Model

### Train NNR using labelled data

In [5]:
def train_classifier(X, y):
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=41, activation='sigmoid'))
    model.add(Dense(5, activation='softmax'))

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

    # Fit the model
    model.fit(X, y)
    
    return model

print(train_set_x.shape)
print(train_set_y.shape)

model = train_classifier(train_set_x, train_set_y)

score = model.evaluate(test_set_x, test_set_y)

print("Accuracy: ", score[1])

(2519, 41)
(2519, 5)
Epoch 1/1
Accuracy:  0.833065695667


### Use the classifier to get the membership vector of each unlabelled sample

In [6]:
membershipVectors = model.predict(valid_set_x) # Membership Matrix

binarizedLabels = (membershipVectors == membershipVectors.max(axis=1, keepdims=1)).astype(float)

In [7]:
print(test_set_y)

[[1 0 0 0 0]
 [1 0 0 0 0]
 [0 1 0 0 0]
 ..., 
 [1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]]


### Calculate Fuzziness using the following equation:
$F(V)=-\frac{1}{n}\sum_{i=1}^{n}(\mu_{i}\log\mu_{i} + (1 - \mu_{i})\log(1 - \mu_{i}))$

In [8]:
def F(V):
    def inner(mu):
        return (mu * log(mu, 2)) + ((1 - mu) * log(1 - mu, 2))
    return - np.mean(list(map(inner, V)))

fuzziness = np.array(list(map(F, membershipVectors)))

print(fuzziness[0])

0.529316712226


### Add samples with low and high fuzziness to the training set

In [9]:
lowFuzzinessIndices = np.append(np.where( fuzziness >= 5/6), np.where( fuzziness <= 1/6 ) )
highFuzzinessIndices = np.logical_and(fuzziness >= 2/6, fuzziness <= 2/3)

# Fuzziness values >= 5/6
lowFuzzinessGroup = valid_set_x[ lowFuzzinessIndices ]
lowFuzzinessLabels = binarizedLabels[ lowFuzzinessIndices ]

# Fuzziness values 2/6 <= x <= 2/3
highFuzzinessGroup = valid_set_x[ highFuzzinessIndices ]
highFuzzinessLabels = binarizedLabels [ highFuzzinessIndices ]

# Append new samples to training set
train_set_x = np.concatenate((train_set_x, lowFuzzinessGroup, highFuzzinessGroup), axis=0)

train_set_y = np.concatenate((train_set_y, lowFuzzinessLabels, highFuzzinessLabels), axis=0)

### Retrain the classifier with the new training set

In [10]:
print(train_set_x.shape)
print(train_set_y.shape)
model = train_classifier(train_set_x, train_set_y)

(22135, 41)
(22135, 5)
Epoch 1/1


## Evaluation

In [11]:
score = model.evaluate(test_set_x, test_set_y)

print("Accuracy: ", score[1])

Accuracy:  0.839426866865
