In [None]:
import datetime
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from xgboost import XGBClassifier


# Load the IoTID20 CSV. low_memory=False makes pandas read the file in one
# pass instead of chunk-wise, so each column gets a single inferred dtype.
ioTInTData = pd.read_csv("data/IoTID20/IoT_Network_Intrusion_Dataset_Category.csv", low_memory=False)
# (disabled) ioTInTData = ioTInTData[ioTInTData['Cat'] != 'Mirai']
# Keep the first 81 columns and discard 'Label'.
ioTInTData = ioTInTData.iloc[:, :81].drop(columns=['Label'])
print('ioTInTData', ioTInTData.shape)

# Treat +/-inf as missing, then drop every row that contains a missing value.
# No fill step is needed afterwards: dropna() leaves no NaN behind.
ioTInTData = ioTInTData.replace([np.inf, -np.inf], np.nan).dropna()
print('ioTInTData (after cleaning)', ioTInTData.shape)




def readDataSets():
    """Read the seven ToN_IoT device CSV files and return them as one frame.

    The string columns ``temp_condition`` (fridge), ``door_state`` (garage
    door) and ``light_status`` (motion light) are stripped of surrounding
    whitespace. The frames are concatenated in the order fridge, garage
    door, GPS, Modbus, motion light, thermostat, weather; the index is not
    reset. Rows whose ``type`` is 'xss', 'normal', 'ransomware', 'injection'
    or 'backdoor' are removed.

    Returns:
        pandas.DataFrame: the concatenated, filtered rows.
    """
    folder = 'data/ToNIoT/cat/'
    file_names = (
        'IoT_Fridge.csv',
        'IoT_Garage_Door.csv',
        'IoT_GPS_Tracker.csv',
        'IoT_Modbus.csv',
        'IoT_Motion_Light.csv',
        'IoT_Thermostat.csv',
        'IoT_Weather.csv',
    )
    frames = [pd.read_csv(filepath_or_buffer=folder + file_name) for file_name in file_names]

    # (position in `frames`, text column whose values carry stray whitespace)
    for position, column in ((0, 'temp_condition'), (1, 'door_state'), (4, 'light_status')):
        frames[position][column] = frames[position][column].str.strip()

    combined = pd.concat(frames)

    excluded_types = ['xss', 'normal', 'ransomware', 'injection', 'backdoor']
    keep_mask = ~combined['type'].isin(excluded_types)
    return combined[keep_mask]

In [None]:
# Peek at the first rows of the cleaned IoTID20 frame (head() defaults to 5 rows).
ioTInTData.head()

In [None]:
# Load the combined ToN_IoT device data; readDataSets() already filters out
# the attack types that are not used, so `df` holds only the kept rows.
df = readDataSets()
print(df.shape)
df

In [None]:
# Column groups of the ToN_IoT frame, used by datapreprocessingShuffle2:
# categorical columns are label-encoded, quantitative columns are standardised.
categorical_features = ['door_state', 'sphone_signal', 'light_status', 'temp_condition']
# Each column is listed exactly once ('FC4_Read_Coil' used to appear twice,
# which scaled that column a second time for no reason).
quantitative_features = ['FC1_Read_Input_Register', 'FC2_Read_Discrete_Value', 'FC3_Read_Holding_Register', 'FC4_Read_Coil', 'current_temperature',
                        'fridge_temperature', 'humidity', 'latitude', 'longitude',
                        'motion_status', 'pressure', 'temperature', 'thermostat_status']
features = categorical_features + quantitative_features


def datapreprocessingShuffle(data):
    """Standardise every column except 'Cat', then shuffle the rows.

    Each column other than 'Cat' is overwritten in ``data`` with the output
    of ``StandardScaler.fit_transform`` for that single column, so the frame
    passed in is modified. 'Cat' is left as it is.

    Args:
        data: DataFrame to scale.

    Returns:
        The rows of ``data`` in shuffled order with a fresh 0..n-1 index.
    """
    scaler = StandardScaler()
    # Feature scaling: the scaler is refitted on each column in turn.
    for column in data.columns:
        if data[column].name == 'Cat':
            continue
        data[column] = scaler.fit_transform(data[[column]])

    shuffled = shuffle(data)
    return shuffled.reset_index(drop=True)


def datapreprocessingShuffle2(data):
    """Scale the quantitative columns, encode the categorical ones, shuffle.

    Uses the module-level ``quantitative_features`` and
    ``categorical_features`` lists. The listed columns are overwritten in
    ``data``, so the frame passed in is modified.

    Args:
        data: DataFrame containing every column named in the two lists.

    Returns:
        The rows of ``data`` in shuffled order with a fresh 0..n-1 index.
    """
    # Feature scaling: a new StandardScaler per quantitative column.
    for column in quantitative_features:
        data[column] = StandardScaler().fit_transform(data[[column]])

    # Encoding categorical features: a new LabelEncoder per column.
    for column in categorical_features:
        data[column] = LabelEncoder().fit_transform(data[column])

    shuffled = shuffle(data)
    return shuffled.reset_index(drop=True)

In [None]:
# Pre-processing dataset
# Work on copies: the preprocessing helpers overwrite columns of the frame
# they receive, and the loaded frames should stay as they were read.
datacopy1 = ioTInTData.copy()
datacopy2 = df.copy()
# Encode the category as an integer code: DoS -> 0, Scan -> 1, MITM ARP Spoofing -> 2.
# NOTE(review): column position 79 is presumably the 'Cat' column - confirm.
# Any other category string left in that column makes astype('int') raise.
datacopy1['Cat'] = datacopy1.iloc[:,79:80].replace('DoS', '0').replace('Scan', '1').replace('MITM ARP Spoofing', '2').astype('int')
data1 = datapreprocessingShuffle(datacopy1)

# Rename 'type' to 'Cat' so both datasets use the same label column name.
data2 = datacopy2.rename(columns={"type": "Cat"})
# ddos -> 0, scanning -> 1, password -> 3; ddos and scanning reuse the codes
# given to DoS and Scan above, password gets a code of its own.
# NOTE(review): column position 17 is presumably the 'Cat' column - confirm.
data2['Cat'] = data2.iloc[:,17:18].replace('ddos', '0').replace('scanning', '1').replace('password', '3').astype('int')
data2 = datapreprocessingShuffle2(data2) 
print(data1.shape)
print(data2.shape)

In [None]:
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply, Concatenate
from tensorflow.keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D, LeakyReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.initializers import RandomNormal
import tensorflow.keras.backend as K
from sklearn.utils import shuffle

class cGAN():
    """Conditional GAN built from fully connected layers.

    The generator maps (noise, class label) to a sample of length
    ``out_shape``; the discriminator scores (sample, class label) pairs with
    a sigmoid output. In both networks the label passes through an
    ``Embedding`` layer and is multiplied element-wise into the other input.
    """

    def __init__(self, latent_dim=120, out_shape=120, num_classes=2):
        """Build and compile the discriminator, the generator and the stacked model.

        Args:
            latent_dim: Length of the noise vector fed to the generator.
            out_shape: Length of one generated sample. It is also the input
                width of the discriminator, so it has to equal the number of
                feature columns of the data passed to ``train``.
            num_classes: Number of integer labels covered by the embeddings.
        """
        self.latent_dim = latent_dim
        self.out_shape = out_shape
        self.num_classes = num_classes
        self.clip_value = 0.01
        optimizer = Adam(0.0002, 0.5)

        # build discriminator
        self.discriminator = self.build_discriminator()
        self.discriminator.compile(loss=['binary_crossentropy'],
                                   optimizer=optimizer,
                                   metrics=['accuracy'])

        # build generator
        self.generator = self.build_generator()

        # generating new data samples
        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,))
        gen_samples = self.generator([noise, label])

        # The stacked model below is compiled with the discriminator frozen.
        self.discriminator.trainable = False

        # passing gen samples through disc.
        valid = self.discriminator([gen_samples, label])

        # combining both models
        self.combined = Model([noise, label], valid)
        self.combined.compile(loss=['binary_crossentropy'],
                              optimizer=optimizer,
                              metrics=['accuracy'])
        self.combined.summary()

    def wasserstein_loss(self, y_true, y_pred):
        """Return the mean of ``y_true * y_pred`` (not used by the compiled models)."""
        return K.mean(y_true * y_pred)

    def build_generator(self):
        """Return the generator: (noise, label) -> sample of length ``out_shape``.

        Three Dense blocks (128, 256, 512 units), each followed by LeakyReLU
        and BatchNormalization, then a tanh output layer.
        """
        model = Sequential()

        model.add(Dense(128, input_dim=self.latent_dim))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))

        model.add(Dense(256))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))

        model.add(Dense(512))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))

        model.add(Dense(self.out_shape, activation='tanh'))
        model.summary()

        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        # Embed the label to the noise width so the two can be multiplied.
        label_embedding = Flatten()(Embedding(self.num_classes, self.latent_dim)(label))

        model_input = multiply([noise, label_embedding])
        gen_sample = model(model_input)

        return Model([noise, label], gen_sample, name="Generator")

    def build_discriminator(self):
        """Return the discriminator: (sample, label) -> sigmoid validity score.

        Three Dense blocks (512, 256, 128 units) with LeakyReLU, dropout of
        0.4 after the second and third, then a single sigmoid unit.
        """
        init = RandomNormal(mean=0.0, stddev=0.02)
        model = Sequential()

        model.add(Dense(512, input_dim=self.out_shape, kernel_initializer=init))
        model.add(LeakyReLU(alpha=0.2))

        model.add(Dense(256, kernel_initializer=init))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.4))

        model.add(Dense(128, kernel_initializer=init))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.4))

        model.add(Dense(1, activation='sigmoid'))
        model.summary()

        gen_sample = Input(shape=(self.out_shape,))
        label = Input(shape=(1,), dtype='int32')
        # Embed the label to the sample width so the two can be multiplied.
        label_embedding = Flatten()(Embedding(self.num_classes, self.out_shape)(label))

        model_input = multiply([gen_sample, label_embedding])
        validity = model(model_input)

        return Model(inputs=[gen_sample, label], outputs=validity, name="Discriminator")

    def train(self, X_train, y_train, pos_index, neg_index, epochs, batch_size=32, sample_interval=50, n_pos=8):
        """Alternate one discriminator step and one generator step per epoch.

        Args:
            X_train: Feature rows; an array or anything ``np.asarray`` accepts
                (a DataFrame included). Rows are addressed by integer position.
            y_train: Labels aligned with ``X_train``; reshaped to one column.
            pos_index: Row positions of the positive class.
            neg_index: Row positions of the negative class.
            epochs: Number of training iterations (one batch each).
            batch_size: Rows per batch.
            sample_interval: Print the losses every this many epochs.
            n_pos: How many rows of each batch are drawn from ``pos_index``;
                the remaining ``batch_size - n_pos`` come from ``neg_index``.

        Raises:
            ValueError: If ``n_pos`` is negative or larger than ``batch_size``.
        """
        if not 0 <= n_pos <= batch_size:
            raise ValueError("n_pos must be between 0 and batch_size")

        # Plain arrays make `X_train[idx]` select rows; on a DataFrame the
        # same expression would look the positions up as column labels.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train).reshape(-1, 1)

        # Adversarial ground truths
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        for epoch in range(epochs):

            # Draw n_pos positive rows and fill the rest of the batch with
            # negative rows (both with replacement), then mix their order.
            idx1 = np.random.choice(pos_index, n_pos)
            idx0 = np.random.choice(neg_index, batch_size - n_pos)
            idx = np.random.permutation(np.concatenate((idx1, idx0)))
            samples, labels = X_train[idx], y_train[idx]

            # Sample noise as generator input
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))

            # Generate one batch of synthetic samples for the real labels
            gen_samples = self.generator.predict([noise, labels])

            # Noisy targets for the first part of training, exact ones after.
            if epoch < epochs//1.5:
                valid_smooth = (valid+0.1)-(np.random.random(valid.shape)*0.1)
                fake_smooth = (fake-0.1)+(np.random.random(fake.shape)*0.1)
            else:
                valid_smooth = valid
                fake_smooth = fake

            # Train the discriminator
            self.discriminator.trainable = True
            d_loss_real = self.discriminator.train_on_batch([samples, labels], valid_smooth)
            d_loss_fake = self.discriminator.train_on_batch([gen_samples, labels], fake_smooth)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train Generator
            # Condition on labels drawn uniformly from the known classes
            self.discriminator.trainable = False
            sampled_labels = np.random.randint(0, self.num_classes, batch_size).reshape(-1, 1)
            # The generator is rewarded when the discriminator says "valid"
            g_loss = self.combined.train_on_batch([noise, sampled_labels], valid)

            # Plot the progress
            if (epoch+1)%sample_interval==0:
                print (f"{epoch} [D loss: {d_loss[0]}, acc.: {100*d_loss[1]}] [G loss: {g_loss}]")

In [43]:
# Split the pre-processed ToN_IoT frame into features and a label column
# vector for the cGAN. `columns=` replaces the positional axis argument,
# which pandas 2.0 no longer accepts.
data2_xtrain = data2.drop(columns="Cat")
data2_ytrain = data2['Cat'].to_numpy().reshape(-1, 1)
# Row positions of the two classes the cGAN is trained on.
pos_index = np.where(data2_ytrain == 1)[0]
neg_index = np.where(data2_ytrain == 0)[0]

#(92172, 80)
#(63973, 18)

In [23]:
# Run 1000 training iterations on the ToN_IoT features and labels.
# NOTE(review): `cgan` is not created in any cell visible in this notebook;
# presumably it is a cGAN instance from a cell that was removed - confirm,
# and check that its sample width matches the number of feature columns.
cgan.train(data2_xtrain, data2_ytrain, pos_index, neg_index, epochs=1000)

In [30]:
# generating new samples
n_samples = 63973
# The generator's noise input is `cgan.latent_dim` wide, so the noise matrix
# takes its width from the model instead of a hard-coded number.
noise = np.random.normal(0, 1, (n_samples, cgan.latent_dim))
# Every synthetic row is conditioned on label 1.
sampled_labels = np.ones(n_samples).reshape(-1, 1)

gen_samples = cgan.generator.predict([noise, sampled_labels])
# NOTE(review): `x_scaler` is not defined in any cell visible in this
# notebook - confirm where the fitted scaler is supposed to come from.
gen_samples = x_scaler.inverse_transform(gen_samples)
print(gen_samples.shape)

In [None]:
# Fill every missing value of the ToN_IoT frame with the mean of its column
# (data2.mean() gives one mean per column; fillna matches them by column name).
data2 = data2.fillna(data2.mean())

In [None]:
# Stack the IoTID20 frame (data1) and the ToN_IoT frame (data2) row-wise,
# then fill every missing value of the merged frame with its column mean.
# NOTE(review): presumably the two frames share only the 'Cat' column, so
# concat leaves NaN wherever a row's source lacks a column - confirm.
fData = pd.concat([data1, data2])
fData = fData.fillna(fData.mean())
fData.shape

In [7]:
# Number of rows per category code in the merged frame, shown as a table.
fData['Cat'].value_counts().to_frame()

In [None]:
# Bar chart of the number of rows per category code in the merged frame.
import seaborn as sns
import matplotlib.pyplot as plt

carrier_count = fData['Cat'].value_counts()
sns.set(style="darkgrid")
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# as positional arguments.
sns.barplot(x=carrier_count.index, y=carrier_count.values, alpha=0.9)
plt.title('Frequency Distribution of Cat')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Cat', fontsize=12)
plt.show()

In [None]:
# Separate the label column from the features.
y = fData['Cat']
X = fData.drop(['Cat'], axis=1)

print(y.shape)
print(X.shape)

from sklearn.preprocessing import MinMaxScaler
# Rescale every feature with MinMaxScaler before computing the covariance.
sc = MinMaxScaler()
X_std = sc.fit_transform(X)

cov_matrix = np.cov(X_std.T)
# A covariance matrix is symmetric, so eigh applies: it returns real
# eigenvalues in ascending order (eig may return complex values here).
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
# Column positions ordered from the largest eigenvalue to the smallest.
descending = np.argsort(eigenvalues)[::-1]
# (eigenvalue, eigenvector) pairs, highest eigenvalue first.
eig_pairs = [(eigenvalues[index], eigenvectors[:, index]) for index in descending]
eigvalues_sorted = [pair[0] for pair in eig_pairs]
eigvectors_sorted = [pair[1] for pair in eig_pairs]

tot = sum(eigenvalues)
# Share of the total variance carried by each component, largest first;
# one entry per feature column.
var_explained = [(i / tot) for i in eigvalues_sorted]
# Running total of the shares; the last entry is (up to rounding) 1.
cum_var_exp = np.cumsum(var_explained)

# One tick per component, however many feature columns X has.
component_numbers = range(1, len(var_explained) + 1)
plt.bar(component_numbers, var_explained, alpha=0.5, align='center', label='individual explained variance')
plt.step(component_numbers, cum_var_exp, where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2

# prepare target
def prepare_targets(input):
    """Return ``input`` converted to integer codes by a newly fitted LabelEncoder."""
    encoder = LabelEncoder()
    return encoder.fit(input).transform(input)

# feature selection
def select_featuresMutual(X_train, y_train, X_test, k=4):
    """Keep the ``k`` features with the highest mutual information with the target.

    The selector is fitted on the training data only and then applied to
    both sets, so the test data does not influence which features are kept.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features with the same columns as ``X_train``.
        k: Number of features to keep (default 4).

    Returns:
        Tuple ``(X_train_fs, X_test_fs)`` holding only the selected features.
    """
    fs = SelectKBest(score_func=mutual_info_classif, k=k)
    fs.fit(X_train, y_train)
    return fs.transform(X_train), fs.transform(X_test)

# feature selection
def select_featuresChi(X_train, y_train, X_test, k=4):
    """Keep the ``k`` features with the highest chi-squared score.

    The selector is fitted on the training data only and then applied to
    both sets. Note that scikit-learn's ``chi2`` rejects negative feature
    values, so the inputs must be non-negative.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features with the same columns as ``X_train``.
        k: Number of features to keep (default 4).

    Returns:
        Tuple ``(X_train_fs, X_test_fs)`` holding only the selected features.
    """
    fs = SelectKBest(score_func=chi2, k=k)
    fs.fit(X_train, y_train)
    return fs.transform(X_train), fs.transform(X_test)

# feature selection
def select_featuresPCA(X_train, y_train, X_test, n_comp):
    """Project both sets onto the first ``n_comp`` principal components.

    The projection is fitted on the training data only and then applied to
    both sets.

    Args:
        X_train: Training features.
        y_train: Unused; PCA is unsupervised. Kept so the signature matches
            the other ``select_features*`` helpers.
        X_test: Test features with the same columns as ``X_train``.
        n_comp: Number of principal components to keep.

    Returns:
        Tuple ``(X_train_fs, X_test_fs)`` of the projected data.
    """
    fs = PCA(n_components=n_comp)
    fs.fit(X_train)
    return fs.transform(X_train), fs.transform(X_test)

In [None]:
def _base_models(svm_probability=False):
    """Return new (name, classifier) pairs for the individual models.

    Args:
        svm_probability: Passed to ``SVC(probability=...)``. Soft voting
            needs class probabilities from every member, so the ensemble
            asks for True.
    """
    return [
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('NB', GaussianNB()),
        ('MLP', MLPClassifier()),
        ('DT', DecisionTreeClassifier()),
        ('RF', RandomForestClassifier()),
        ('LR', LogisticRegression()),
        ('SVM', svm.SVC(probability=svm_probability)),
        ('AdaBoost', AdaBoostClassifier()),
        ('GradientBoosting', GradientBoostingClassifier()),
        ('XGB', XGBClassifier()),
    ]


# get a voting ensemble of models
def get_voting():
    """Return a soft-voting ensemble of all base classifiers.

    Only classifiers are included (a regressor cannot be a member of a
    VotingClassifier), and the SVM is created with probability estimates
    because soft voting averages predicted class probabilities.
    """
    return VotingClassifier(estimators=_base_models(svm_probability=True), voting='soft')


# get a list of models to evaluate
def get_models():
    """Return (name, model) pairs: each base classifier plus the soft-voting ensemble."""
    models = _base_models()
    models.append(('soft_voting', get_voting()))
    return models


def evaluate_model(model, X, y, scoring='accuracy'):
    """Cross-validate ``model`` with a seeded, shuffled stratified 10-fold split.

    Args:
        model: Estimator to evaluate.
        X: Features.
        y: Class labels.
        scoring: scikit-learn scoring name. Defaults to accuracy: the labels
            are nominal class codes, so an absolute-error score would depend
            on how the classes happen to be numbered.

    Returns:
        Array with one score per fold. A failing fold raises instead of
        being recorded as NaN (``error_score='raise'``).
    """
    cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    scores = cross_val_score(model, X, y, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')
    return scores


# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models:
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

In [None]:
# evaluation of a model fit using mutual information input features
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Spot Check Algorithms
models = [
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('MLP', MLPClassifier()),
    ('DT', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier()),
    ('LR', LogisticRegression()),
    ('SVM', svm.SVC()),
    ('AdaBoost', AdaBoostClassifier()),
    ('GradientBoosting', GradientBoostingClassifier()),
    ('XGB', XGBClassifier()),
]

# Report names in the order of the integer class codes 0, 1, 2, 3.
target_names = ['DoS', 'scanning', 'MITM ARP Spoofing', 'password']

# Hold-out split: the first fold of a seeded stratified 10-fold split.
# It does not depend on the model, so it is computed once for all of them.
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
train_idx, test_idx = next(kfold.split(X, y))
x_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Feature selection is fitted once on the training part, so every model is
# compared on exactly the same inputs and the costly scoring is not repeated.
X_train_mutual, X_test_mutual = select_featuresMutual(x_train, y_train, x_test)
X_train_pca, X_test_pca = select_featuresPCA(x_train, y_train, x_test, 20)

for name, model in models:
    print(model)
    # mutual-information features
    model.fit(X_train_mutual, y_train)
    y_pred = model.predict(X_test_mutual)
    print('Mutual', metrics.classification_report(y_test, y_pred, target_names=target_names))
    # PCA features (fit() retrains the same estimator from scratch)
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    print('PCA', metrics.classification_report(y_test, y_pred, target_names=target_names))