<a href="https://colab.research.google.com/github/baharababah/Botnet-Attack-Detection-Using-Machine-Learning/blob/main/IoT4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive 
drive.mount('/content/drive/')
import pandas as pd

import glob
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, recall_score, classification_report, f1_score
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers import Dense, ReLU
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from keras.layers import GRU, LSTM


def showData(data):
    '''
    Print a quick overview of the dataset held in ``data``.

    Four things are written to standard output, in this order: the shape,
    the first rows (``head()``), the descriptive statistics (``describe()``)
    and the dtype of every column. Nothing is returned.
    '''
    dimensions = data.shape
    print(dimensions)

    preview = data.head()
    print(preview)

    # Basic quantitative information about the features of the dataset.
    summary = data.describe()
    print("Descriptive statistics of the dataset: ", summary)

    column_types = data.dtypes
    print("data types: ", column_types)



def dataPrepare(path=r'/content/drive/My Drive/fourth dataset',
                output_csv='/content/drive/My Drive/result4.csv',
                keyword='leg', nrows=500):
    '''
    Read every CSV file of the dataset into one labelled dataframe.

    Each file found directly under ``path`` is read (its first two lines are
    skipped and at most ``nrows`` rows are kept) and receives a ``Class``
    column: 0 when ``keyword`` occurs in the file name, 1 otherwise.
    The frames are concatenated, shown with ``showData``, stripped of rows
    containing missing values, written to ``output_csv`` and finally passed
    through ``pd.get_dummies``.

    Parameters:
        path: directory that contains the ``*.csv`` files.
        output_csv: where the merged, NaN-free dataframe is saved.
        keyword: substring of the file name that marks a class-0 file.
        nrows: maximum number of rows read from each file.

    Returns:
        The prepared dataframe; all column labels are strings.

    Raises:
        ValueError: if ``path`` contains no CSV file.
    '''
    import os  # local import: the file's import section does not provide os

    # Sorted so that the row order (and therefore any seeded split made
    # later) does not depend on the order in which glob lists the files.
    all_files = sorted(glob.glob(path + "/*.csv"))
    if not all_files:
        raise ValueError("No CSV files found in: " + path)

    frames = []
    for filename in all_files:
        # Read as text first; whole columns are converted to numbers below.
        df = pd.read_csv(filename, header=None, dtype=str, skiprows=2, nrows=nrows)
        # Match on the file name only, so a directory whose name happens to
        # contain the keyword cannot relabel every file.
        df['Class'] = 0 if keyword in os.path.basename(filename) else 1
        frames.append(df)

    result = pd.concat(frames, ignore_index=True)

    # Columns left as text would each be one-hot encoded value by value by
    # get_dummies. Convert a column to numbers when that loses no value;
    # columns holding real text are left alone and still become dummies.
    for column in [c for c in result.columns if c != 'Class']:
        as_number = pd.to_numeric(result[column], errors='coerce')
        if as_number.notna().sum() == result[column].notna().sum():
            result[column] = as_number

    showData(result)
    result = result.dropna()
    result.to_csv(output_csv)
    # Transform the remaining categorical features into dummy variables.
    result = pd.get_dummies(result, drop_first=True)
    # Uniform string labels: a mix of int and str column names is rejected
    # by scikit-learn estimators.
    result.columns = result.columns.astype(str)
    return result


def dataPreprocessing(data):
    '''
    Split the dataset into a training part and a test part.

    The ``Class`` column is the target and every other column is a feature.
    40% of the rows go to the test part; the split is seeded with
    ``random_state=0``.

    Returns the tuple ``(x_train, x_test, y_train, y_test)``.
    '''
    features = data.drop('Class', axis=1)
    target = data['Class']
    pieces = train_test_split(features, target, test_size=.4, random_state=0)
    x_train, x_test, y_train, y_test = pieces
    return x_train, x_test, y_train, y_test


def feature_selection(x_train, y_train, k=20):
    '''
    Select the best ``k`` features with the chi-squared test.

    The features are first rescaled with ``MinMaxScaler`` (chi2 needs
    non-negative input), then ranked by ``SelectKBest(chi2)``.

    Parameters:
        x_train: dataframe of training features.
        y_train: training labels.
        k: number of features to keep (default 20). It is capped at the
           number of columns of ``x_train`` so that a narrow dataframe does
           not make the selector fail.

    Returns:
        The list of selected column labels, in the column order of
        ``x_train``. The list is also printed.
    '''
    n_selected = min(k, x_train.shape[1])
    x_norm = MinMaxScaler().fit_transform(x_train)
    chi_selector = SelectKBest(chi2, k=n_selected)
    chi_selector.fit(x_norm, y_train)
    chi_support = chi_selector.get_support()  # boolean mask over the columns
    chi_feature = x_train.columns[chi_support].tolist()
    print("selected features: ", chi_feature)
    return chi_feature


def _report_metrics(name, y_true, y_pred):
    '''
    Print the accuracy and the confusion matrix of one model.

    ``labels=[0, 1]`` pins the matrix to 2x2 even when one class is absent
    from the test data. With that ordering scikit-learn lays the matrix out
    as ``[[TN, FP], [FN, TP]]`` (rows = actual class, columns = predicted
    class, class 1 being the positive class).

    Returns the confusion matrix.
    '''
    conf_mat = confusion_matrix(y_true, y_pred, labels=[0, 1])
    total = conf_mat.sum()
    accuracy = (conf_mat[0, 0] + conf_mat[1, 1]) / total if total else float('nan')
    print(name + " Accuracy: ", accuracy)
    print(name + " confusion matrix: ", conf_mat)
    print(name + " TP: ", conf_mat[1, 1])
    print(name + " FP: ", conf_mat[0, 1])
    print(name + " FN: ", conf_mat[1, 0])
    print(name + " TN: ", conf_mat[0, 0])
    return conf_mat


def compareML(x_train, x_test, y_train, y_test):
    '''
    Train a set of ML algorithms and print the test results of each one.

    Six scikit-learn classifiers, a multi layer perceptron and an LSTM
    network are fitted on the training data. For every model the accuracy
    on the test data and the confusion matrix (with its TP / FP / FN / TN
    cells) are printed.

    Parameters:
        x_train, x_test: feature dataframes (``.values`` is used, so they
            must be pandas objects) with the same columns.
        y_train, y_test: the matching 0/1 labels.
    '''
    classical_models = [
        ("Linear Model", LogisticRegression(solver='lbfgs')),
        ("K Nearest Neighbor Model", KNeighborsClassifier()),
        ("Naive Bayes Model", GaussianNB()),
        ("Decision Tree Model", DecisionTreeClassifier()),
        ("Random Forest Model", RandomForestClassifier(n_estimators=100)),
        ("Support Vector Machine Model", SVC(gamma='auto')),
    ]
    for name, model in classical_models:
        model.fit(x_train, y_train)
        # predict() already returns class labels, no thresholding is needed
        _report_metrics(name, y_test, model.predict(x_test))

    # The networks take their input width from the data instead of a fixed 20.
    n_inputs = x_train.shape[1]
    # Plain float arrays: dummy columns may be boolean.
    x_train_arr = x_train.values.astype('float32')
    x_test_arr = x_test.values.astype('float32')

    # Multi layer perceptron
    nn_model = Sequential()
    nn_model.add(Dense(9, input_shape=(n_inputs,), activation='sigmoid'))
    nn_model.add(Dense(1, activation='sigmoid'))
    nn_model.compile(optimizer='Adam', loss='binary_crossentropy')
    nn_model.fit(x_train_arr, y_train, epochs=3, verbose=0, batch_size=5, validation_split=0.2)
    # predict() returns an (n, 1) array of scores; flatten it to 0/1 labels
    nn_y_predicted = (nn_model.predict(x_test_arr) > 0.5).astype(int).ravel()
    _report_metrics("MLPN Model", y_test, nn_y_predicted)

    # Long short-term memory (LSTM) network: every feature is one time step
    # carrying a single value.
    n_features = 1
    x_train_seq = x_train_arr.reshape((x_train_arr.shape[0], n_inputs, n_features))
    x_test_seq = x_test_arr.reshape((x_test_arr.shape[0], n_inputs, n_features))
    LSTM_model = Sequential()
    LSTM_model.add(LSTM(50, activation='sigmoid', input_shape=(n_inputs, n_features)))
    LSTM_model.add(Dense(1))
    LSTM_model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    LSTM_model.fit(x_train_seq, y_train, epochs=3, verbose=0, batch_size=5, validation_split=0.2)
    # NOTE: this figure is measured on the training data; the test accuracy
    # is the one printed with the confusion matrix below.
    LSTM_scores = LSTM_model.evaluate(x_train_seq, y_train, verbose=0)
    print("%s: %.2f%%" % (LSTM_model.metrics_names[1], LSTM_scores[1] * 100))

    LSTM_y_predicted = (LSTM_model.predict(x_test_seq) > 0.5).astype(int).ravel()
    _report_metrics("LSTM Model", y_test, LSTM_y_predicted)



# Run the pipeline: load the data, split it, pick the features, compare models.
result = dataPrepare()
x_train, x_test, y_train, y_test = dataPreprocessing(result)
feat_cols = feature_selection(x_train, y_train)
# Both partitions are restricted to the same selected columns.
selected_columns = x_train.columns.intersection(feat_cols)
compareML(x_train[selected_columns], x_test[selected_columns], y_train, y_test)






Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
(6000, 101)
                    0                  1                   2  ...   98   99 Class
0                 2.0               74.0                 0.0  ...  0.0  0.0     1
1                 1.0               74.0                 0.0  ...  0.0  0.0     1
2                 2.0               74.0                 0.0  ...  0.0  0.0     1
3  2.9730265057629177  71.30913936202964   14.28615413078387  ...  0.0  0.0     1
4  3.9730265057629177  69.97284337851967  15.999262517908392  ...  0.0  0.0     1

[5 rows x 101 columns]
Descriptive statistics of the dataset:               Class
count  6000.000000
mean      0.666667
std       0.471444
min       0.000000
25%       0.000000
50%       1.000000
75%       1.000000
max       1.000000
data types:  0        object
1        object
2        object
3        object
4        object
          ...  
96       object
97   