In [24]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import datetime
from sklearn.preprocessing import LabelEncoder
import time
from sklearn.utils import shuffle

def readDataSets(data_dir='data/ToNIoT/binary'):
    """Load the seven ToN-IoT device CSV files into one numerically encoded DataFrame.

    Each file is read with ``pandas.read_csv``; the text columns
    ``temp_condition``, ``door_state`` and ``light_status`` have surrounding
    whitespace stripped; the frames are stacked row-wise; finally every
    object-dtype column is replaced by integer codes from ``LabelEncoder``.

    Parameters
    ----------
    data_dir : str, default 'data/ToNIoT/binary'
        Directory that contains the ``IoT_*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        The stacked data with a fresh 0..n-1 index. Object columns hold
        integer codes; all other columns are passed through unchanged.
    """
    # (file name, text column to strip) -- None when the file has no such column.
    sources = [
        ('IoT_Fridge.csv', 'temp_condition'),
        ('IoT_Garage_Door.csv', 'door_state'),
        ('IoT_GPS_Tracker.csv', None),
        ('IoT_Modbus.csv', None),
        ('IoT_Motion_Light.csv', 'light_status'),
        ('IoT_Thermostat.csv', None),
        ('IoT_Weather.csv', None),
    ]

    frames = []
    for file_name, text_column in sources:
        frame = pd.read_csv(filepath_or_buffer=data_dir + '/' + file_name)
        if text_column is not None:
            frame[text_column] = frame[text_column].str.strip()
        frames.append(frame)

    # ignore_index=True gives the stacked frame unique row labels. Without it
    # every file contributes its own 0..n labels, and column-wise assignment
    # into a new frame cannot align on the duplicated index.
    dataSetRawLoad = pd.concat(frames, ignore_index=True)

    # NOTE(review): columns that exist in only one file are NaN for the rows
    # of the other files; for text columns those NaN are handed to
    # LabelEncoder as-is -- confirm that is the intended treatment.
    encoded_columns = {}
    for c in dataSetRawLoad.columns:
        if dataSetRawLoad[c].dtype == 'object':
            # A fresh encoder per column keeps the code spaces independent.
            encoded_columns[c] = LabelEncoder().fit_transform(dataSetRawLoad[c])
        else:
            encoded_columns[c] = dataSetRawLoad[c]
    dataSetRawLoad1 = pd.DataFrame(encoded_columns, index=dataSetRawLoad.index)

    print('dataSetRawLoad: ', dataSetRawLoad1.shape)
    return dataSetRawLoad1

In [None]:
# Load the combined dataset and take a first look at it.
df = readDataSets()
print(df.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line.
df.info()
# An expression in the middle of a cell is never displayed; print it instead.
print(df.head())

# Share of each value of the 'label' column.
a = pd.DataFrame(df['label'].value_counts())
a.plot(kind='pie', subplots=True, figsize=(5, 5))
plt.title('ToN-IoT Dataset Attacks')
# 'left' is not a legend location matplotlib accepts; 'center left' is.
plt.legend(loc='center left')
plt.show()

In [None]:
# Tabulate how many rows carry each value of the 'label' column.
pd.DataFrame(data=df.loc[:, 'label'].value_counts()).iloc[:]

In [None]:
# Column groups used by the preprocessing step.
# Categorical columns are label-encoded ('label' is the prediction target).
categorical_features = ['label', 'door_state', 'sphone_signal', 'light_status', 'temp_condition']
# Quantitative columns are standardised. Each name must appear only once:
# 'FC4_Read_Coil' used to be listed twice, which scaled it twice and
# duplicated it in `features`.
quantitative_features = ['FC1_Read_Input_Register', 'FC2_Read_Discrete_Value', 'FC3_Read_Holding_Register',
                         'FC4_Read_Coil', 'current_temperature',
                         'fridge_temperature', 'humidity', 'latitude', 'longitude',
                         'motion_status', 'pressure', 'temperature', 'thermostat_status']
features = categorical_features + quantitative_features

def datapreprocessingShuffle(data, random_state=None):
    """Standardise, label-encode and shuffle a dataset, then split off the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named in the module-level
        ``quantitative_features`` and ``categorical_features`` lists
        (the latter includes ``'label'``). The frame is not modified.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``sklearn.utils.shuffle``. Pass an integer to get the
        same row order on every run; ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``Y`` is the ``'label'`` column and ``X`` is every
        other column, both in shuffled row order with a fresh 0..n-1 index.

    Notes
    -----
    The scalers are fitted on all rows passed in, i.e. before any
    train/test or cross-validation split is made.
    """
    # Work on a private copy so the caller's frame is left untouched.
    data = data.copy()

    # Feature scaling, one independent scaler per column. dict.fromkeys()
    # drops repeated names (keeping order) so no column is scaled twice.
    for column in dict.fromkeys(quantitative_features):
        data[column] = StandardScaler().fit_transform(data[[column]])

    # Encoding categorical features as integer codes.
    for column in categorical_features:
        data[column] = LabelEncoder().fit_transform(data[column])

    data = shuffle(data, random_state=random_state).reset_index(drop=True)

    Y = data.loc[:, 'label']
    X = data.drop(['label'], axis=1)

    return (X, Y)

In [27]:
# Pre-process a copy of the raw frame so `df` itself stays as loaded.
datacopy = df.copy()
X, y = datapreprocessingShuffle(datacopy)

# Replace missing feature values with the mean of their column.
X = X.fillna(value=X.mean())

In [18]:
from sklearn import preprocessing

# Rescale every sample (row) of X to unit L2 norm; norm='l2' and axis=1 are
# normalize()'s defaults, spelled out here for the reader.
normalized_arr = preprocessing.normalize(X, norm='l2', axis=1)
normalized_arr

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
from matplotlib import pyplot
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate
import time

In [None]:
# get a voting ensemble of models
def get_voting():
    """Build a soft-voting ensemble over the notebook's base classifiers.

    Soft voting combines the members' predicted class probabilities, so every
    member has to be a classifier that exposes ``predict_proba``.

    Returns
    -------
    sklearn.ensemble.VotingClassifier
        Unfitted ensemble configured with ``voting='soft'``.
    """
    # define the base models
    models = [
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        # Was DecisionTreeRegressor: a regressor has no predict_proba and does
        # not belong in a classifier ensemble.
        # NOTE(review): 'CART' and 'DT' are now the same estimator type, so the
        # tree effectively gets two votes -- consider dropping one of them.
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('MLP', MLPClassifier()),
        ('DT', DecisionTreeClassifier()),
        ('RF', RandomForestClassifier()),
        ('LR', LogisticRegression()),
        # probability=True is required for SVC to provide predict_proba.
        ('SVM', svm.SVC(probability=True)),
        ('AdaBoost', AdaBoostClassifier()),
        ('GradientBoosting', GradientBoostingClassifier()),
        ('XGB', XGBClassifier()),
    ]
    # define the voting ensemble
    ensemble = VotingClassifier(estimators=models, voting='soft')
    return ensemble

# get a list of models to evaluate
def get_models():
    """Return the (name, estimator) pairs to compare, ending with the soft-voting ensemble.

    Returns
    -------
    list of tuple
        ``(name, unfitted classifier)`` pairs; the last entry,
        ``'soft_voting'``, is the ensemble produced by ``get_voting()``.
    """
    models = [
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        # Was DecisionTreeRegressor; every entry here is evaluated as a
        # classifier on the class labels, so use the classification tree.
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('MLP', MLPClassifier()),
        ('DT', DecisionTreeClassifier()),
        ('RF', RandomForestClassifier()),
        ('LR', LogisticRegression()),
        ('SVM', svm.SVC()),
        ('AdaBoost', AdaBoostClassifier()),
        ('GradientBoosting', GradientBoostingClassifier()),
        ('XGB', XGBClassifier()),
    ]
    models.append(('soft_voting', get_voting()))
    return models

def evaluate_model(model, X, y, scoring='neg_mean_absolute_error', n_splits=10, random_state=1):
    """Cross-validate ``model`` on ``(X, y)`` with shuffled, stratified folds.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator.
    X, y : array-like
        Feature matrix and target labels.
    scoring : str or callable, default 'neg_mean_absolute_error'
        Metric passed to ``cross_val_score``. The default is kept for
        backward compatibility. It is a regression metric: for 0/1 labels and
        hard class predictions it equals minus the misclassification rate.
        Pass ``'accuracy'`` to match the other evaluation cells.
    n_splits : int, default 10
        Number of stratified folds.
    random_state : int, default 1
        Seed for the fold shuffling.

    Returns
    -------
    numpy.ndarray
        One score per fold.

    Raises
    ------
    Exception
        Any error raised while fitting or scoring a fold is propagated
        (``error_score='raise'``).
    """
    cv = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    scores = cross_val_score(model, X, y, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models:
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    # Bare `mean`/`std` are never imported in this notebook (NameError);
    # use the NumPy functions through the existing `np` alias.
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import xgboost as xgb

# Spot Check Algorithms
models = [
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('MLP', MLPClassifier()),
    ('DT', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier()),
    ('LR', LogisticRegression()),
    ('AdaBoost', AdaBoostClassifier()),
    ('GradientBoosting', GradientBoostingClassifier()),
    ('XGB', XGBClassifier()),
    ('SVM', svm.SVC()),
]

# The splitter is identical for every model (fixed seed), so build it once.
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# evaluate each model in turn
results = []
names = []
for name, model in models:
    # (the leftover debug print 'asd' was removed)
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

In [None]:
# Candidate classifiers, evaluated on the row-normalised matrix `normalized_arr`.
models = [
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('MLP', MLPClassifier()),
    ('DT', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier()),
    ('LR', LogisticRegression()),
    ('AdaBoost', AdaBoostClassifier()),
    ('GradientBoosting', GradientBoostingClassifier()),
    ('XGB', XGBClassifier()),
    ('SVM', svm.SVC()),
]

# Metrics collected on every fold (an 'auc-score' entry is intentionally not included).
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
}

# Evaluate each model in turn: 10 stratified, shuffled folds with a fixed seed.
results = []
names = []
for name, model in models:
    start = time.time()
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    cv_results = cross_validate(model, normalized_arr, y, cv=kfold, scoring=scoring)
    end = time.time()
    results.append(cv_results)
    names.append(name)
    # Report: model name, wall-clock time, then the fold-mean of each metric
    # in the order accuracy, precision, recall, f1_score.
    print(name)
    print(end - start, "seconds")
    for metric_name in scoring:
        print(np.mean(cv_results['test_' + metric_name]))

In [None]:
# Same comparison as the 10-fold run, but with 5 folds on `normalized_arr`.
models = [
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('MLP', MLPClassifier()),
    ('DT', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier()),
    ('LR', LogisticRegression()),
    ('AdaBoost', AdaBoostClassifier()),
    ('GradientBoosting', GradientBoostingClassifier()),
    ('XGB', XGBClassifier()),
    ('SVM', svm.SVC()),
]

# Per-fold metrics (an 'auc-score' entry is intentionally not included).
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
    f1_score=make_scorer(f1_score),
)

# Evaluate each model in turn: 5 stratified, shuffled folds with a fixed seed.
results = []
names = []
for name, model in models:
    start = time.time()
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    cv_results = cross_validate(model, normalized_arr, y, cv=kfold, scoring=scoring)
    end = time.time()
    results.append(cv_results)
    names.append(name)
    # Report: model name, wall-clock time, then the fold-mean of each metric
    # in the order accuracy, precision, recall, f1_score.
    print(name)
    print(end - start, "seconds")
    for metric_name in scoring:
        print(np.mean(cv_results['test_' + metric_name]))

In [None]:
# Candidate classifiers, evaluated on the (non-normalised) feature frame `X`.
models = [
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('MLP', MLPClassifier()),
    ('DT', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier()),
    ('LR', LogisticRegression()),
    ('AdaBoost', AdaBoostClassifier()),
    ('GradientBoosting', GradientBoostingClassifier()),
    ('XGB', XGBClassifier()),
    ('SVM', svm.SVC()),
]

# Metrics collected on every fold.
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
}

# Evaluate each model in turn: 10 stratified, shuffled folds with a fixed seed.
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    cv_results = cross_validate(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report: model name, then the fold-mean of each metric in the order
    # accuracy, precision, recall, f1_score.
    print(name)
    for metric_name in scoring:
        print(np.mean(cv_results['test_' + metric_name]))