In [1]:
import os
from google.colab import userdata
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
!pip install kaggle



In [2]:
os.environ["KAGGLE_USERNAME"] = userdata.get("KAGGLE_USERNAME")
os.environ["KAGGLE_KEY"] = userdata.get("KAGGLE_API_TOKEN")

In [3]:
!kaggle datasets download solarmainframe/ids-intrusion-csv

Dataset URL: https://www.kaggle.com/datasets/solarmainframe/ids-intrusion-csv
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading ids-intrusion-csv.zip to /content
100% 1.60G/1.60G [00:25<00:00, 37.8MB/s]
100% 1.60G/1.60G [00:25<00:00, 67.3MB/s]


In [4]:
!unzip ids-intrusion-csv.zip -d ids-datasets

Archive:  ids-intrusion-csv.zip
  inflating: ids-datasets/02-14-2018.csv  
  inflating: ids-datasets/02-15-2018.csv  
  inflating: ids-datasets/02-16-2018.csv  
  inflating: ids-datasets/02-20-2018.csv  
  inflating: ids-datasets/02-21-2018.csv  
  inflating: ids-datasets/02-22-2018.csv  
  inflating: ids-datasets/02-23-2018.csv  
  inflating: ids-datasets/02-28-2018.csv  
  inflating: ids-datasets/03-01-2018.csv  
  inflating: ids-datasets/03-02-2018.csv  


In [5]:
df = pd.read_csv("ids-datasets/02-14-2018.csv")

In [6]:
cleaned_data = df.dropna()


In [7]:
label_encoder = LabelEncoder()

cleaned_data['Label'] = label_encoder.fit_transform(cleaned_data['Label'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['Label'] = label_encoder.fit_transform(cleaned_data['Label'])


In [8]:
cleaned_data['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,665355
1,193354
2,187589


In [9]:
data_1 = cleaned_data[cleaned_data['Label'] == 0]
data_2 = cleaned_data[cleaned_data['Label'] == 1]
data_3 = cleaned_data[cleaned_data['Label'] == 2]

y_1 = np.zeros(data_1.shape[0])
y_benign = pd.DataFrame(y_1)


y_2 = np.ones(data_2.shape[0])
y_bf = pd.DataFrame(y_2)


y_3 = np.full(data_3.shape[0], 2)
y_ssh = pd.DataFrame(y_3)


X = pd.concat([data_1, data_2, data_3], sort=True)
y = pd.concat([y_benign, y_bf, y_ssh], sort=True)


In [10]:
X.isnull().sum().to_numpy()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Resampling data to make it proportionate

In [11]:
data_1_resample = resample(data_1, n_samples=20000, random_state=123, replace= True)
data_2_resample = resample(data_2, n_samples=20000, random_state=123, replace= True)
data_3_resample = resample(data_3, n_samples=20000, random_state=123, replace= True)

In [12]:
dataset = pd.concat([data_1_resample, data_2_resample, data_3_resample])

In [13]:
import re
def normalize_col(col):
    col = col.lower()
    col = col.replace('/', '_')
    col = col.replace(' ', '_')
    col = re.sub(r'[^a-z0-9_]', '', col)
    col = re.sub(r'_+', '_', col)
    return col.strip('_')

dataset.columns = dataset.columns.map(normalize_col)

In [14]:
X = dataset.drop(columns=["label"])
y = dataset["label"]

In [15]:
X = X.drop(columns=["timestamp", "protocol",
                                      "psh_flag_cnt","init_fwd_win_byts","flow_byts_s","flow_pkts_s",
                                      ], axis=1)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y ,test_size=0.1, random_state=123)

In [17]:
y_train.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,18073
2,18024
1,17903


Creating Labels One-Hot-Encoding

In [18]:
!pip install tensorflow
import keras
from tensorflow.keras.utils import to_categorical



In [19]:
y_train = to_categorical(y_train)

In [20]:
y_train

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [21]:
y_test = to_categorical(y_test)

In [22]:
X_test = X_test.to_numpy()
X_train = X_train.to_numpy()

In [23]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(54000, 73)
(6000, 73)
(54000, 3)
(6000, 3)


In [24]:
X_train = X_train.reshape(len(X_train), X_train.shape[1], 1)
X_test = X_test.reshape(len(X_test), X_test.shape[1], 1)
X_train.shape, X_test.shape

((54000, 73, 1), (6000, 73, 1))

In [25]:
!pip install tensorflow
from keras.models import Sequential
from keras.callbacks import CSVLogger, ModelCheckpoint
from keras.layers import Conv2D, Conv1D, MaxPooling2D, MaxPooling1D, Flatten, BatchNormalization, Dense

# from tensorflow.keras



In [35]:
def model():
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=6, activation='relu',
                    padding='same', input_shape=(73, 1)))
    model.add(BatchNormalization())

    # MaxPolling to reduce dimension
    model.add(MaxPooling1D(pool_size=(3), strides=2, padding='valid'))

    model.add(Conv1D(filters=64, kernel_size=6, activation='relu',
                    padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=(3), strides=2, padding='valid'))

    model.add(Conv1D(filters=64, kernel_size=6, activation='relu',
                    padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=(3), strides=2, padding='valid'))

    # model.add(Conv1D(filters=64, kernel_size=6, activation='relu',
    #                 padding='same'))
    # model.add(BatchNormalization())
    # model.add(MaxPooling1D(pool_size=(3), strides=2, padding='valid'))

    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [36]:
model = model()
model.summary()

In [37]:
logger = CSVLogger('logs.csv', append=True)
his = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), callbacks=[logger])

Epoch 1/50
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 6ms/step - accuracy: 0.9189 - loss: 0.2188 - val_accuracy: 0.5200 - val_loss: 1.1998
Epoch 2/50
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 5ms/step - accuracy: 0.9737 - loss: 0.0852 - val_accuracy: 0.6277 - val_loss: 0.6604
Epoch 3/50
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9737 - loss: 0.0883 - val_accuracy: 0.5273 - val_loss: 0.8785
Epoch 4/50
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9737 - loss: 0.0863 - val_accuracy: 0.7595 - val_loss: 0.7953
Epoch 5/50
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9769 - loss: 0.0739 - val_accuracy: 0.7622 - val_loss: 0.8211
Epoch 6/50
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9768 - loss: 0.0724 - val_accuracy: 0.6017 - val_loss: 0.9076
Epoch 7/50
[1

In [38]:
scores = model.evaluate(X_test, y_test)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8280 - loss: 0.4570
compile_metrics: 82.73%
