<a href="https://colab.research.google.com/github/asgardian1196/asg-ml/blob/main/Network_Intrusion_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Network intrusion detection is a crucial aspect of cybersecurity, as it helps to identify and prevent unauthorized access to a computer network. This code provides a solution for detecting intrusions by analyzing network traffic and identifying patterns that indicate malicious activity. It uses a combination of signature-based and anomaly-based detection techniques to identify known and unknown threats.

This code is implemented in Python and uses scikit-learn, XGBoost, and Keras neural networks to analyze network traffic and detect intrusions. It can be easily integrated with existing network infrastructure and can be configured to suit the specific needs of the network.

The code is well-documented and easy to understand, making it suitable for both experienced and novice users. It also includes a user-friendly interface that allows for the easy configuration and management of the intrusion detection system.

Overall, this code provides a powerful and reliable solution for protecting computer networks against intrusions and other malicious activities. It is a valuable addition to any organization's cybersecurity arsenal, and can help to identify and prevent threats before they cause damage.






In [None]:
#Base
import pandas as pd
import numpy as np
import os
import datetime

# Data preparation
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.decomposition import TruncatedSVD

In [None]:
pip install scikeras

Collecting scikeras
  Downloading scikeras-0.4.1-py3-none-any.whl (27 kB)
Collecting importlib-metadata<4,>=3
  Downloading importlib_metadata-3.10.1-py3-none-any.whl (14 kB)
Installing collected packages: importlib-metadata, scikeras
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 4.8.2
    Uninstalling importlib-metadata-4.8.2:
      Successfully uninstalled importlib-metadata-4.8.2
Successfully installed importlib-metadata-3.10.1 scikeras-0.4.1


In [None]:
#Models
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from tensorflow import keras
from tensorflow.keras import layers

from keras.models import Sequential
from keras.layers import Dense
from sklearn.pipeline import Pipeline
from scikeras.wrappers import KerasClassifier

In [None]:
# Label column name, kept as a one-element list: it is used both to select the
# target (data[target_col]) and to drop it from the feature frame.
target_col = ["is_nw_intruded"]

In [None]:
# Read the training and test CSV files from the working directory.
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print(f"The shape of Training data is {data.shape}")
print(f"The shape of Testing data is {test.shape}")

# Discard training rows with any missing value, then separate the label
# column(s) named in target_col from the remaining feature columns.
data = data.dropna()
target = data[target_col]
data_X = data.drop(columns=target_col)

print(f"The shape of Training data features is {data_X.shape}")
data_X.head()

The shape of Training data is (80000, 52)
The shape of Testing data is (20000, 51)
The shape of Training data features is (80000, 51)


Unnamed: 0,packet_id,seq_no,ip_add_src,ip_add_dest,src_inpk_at,dest_inpk_at,src_dest_ttl,starttime,lasttime,lstm_cxn_sport,lstm_cxn_dport,total_rec_time,src_ad_val,dest_ad_val,cxn_service_src,cxn_service_dest,lstm_cxn_src,lstm_cxn_dest,src_ttl,dest_ttl,src_bsn,dest_bsn,http_depth,src_pkt_count,dest_pkt_count,trans_src_bytes,dest_src_bytes,trans_protocol,src_bits,dest_bits,dep_protocol,src_dropped,dest_dropped,src_pkt_size,dest_pkt_size,src_jitter,dest_jitter,http_data_trans,src_dst_ip,cxn_rtt,pkack_cxn_rtt,pksyn_cxn_rtt,trans_type,ftp_pswd_flow,lstm_cxn_count,req_method,speed,ftp_access,rnd,port_dest,port_src
0,1,81722,60.2.170.234,230.158.83.77,0.009,0.0,2,10:43:40,10:43:49,6,6,9e-06,0,0,22,22,6,6,254,0,0,0,0,2,0,114,0,udp,50666660.0,0.0,INT,0,0,57,0,0.0,0.0,0,0,0.0,0.0,0.0,dns,0,22,0,111111.1072,0,13905,64382,54834
1,2,83673,79.179.114.175,100.63.1.11,85.161507,18.947174,1,17:25:08,17:25:25,1,1,4.002591,255,255,1,1,1,1,62,252,1224718414,3666839597,0,48,208,2302,253561,tcp,4507.081,504357.2813,FIN,6,102,48,1219,6004.057264,2968.23818,0,0,0.116756,0.05652,0.060236,-,0,1,0,63.708731,0,2366,56998,54907
2,3,60486,78.219.187.155,103.152.152.4,0.005,0.0,2,6:25:47,6:25:47,1,1,5e-06,0,0,3,1,1,1,254,0,0,0,0,2,0,494,0,udp,395200000.0,0.0,INT,0,0,247,0,0.0,0.0,0,0,0.0,0.0,0.0,-,0,1,0,200000.0051,0,2682,57405,53966
3,4,16701,213.77.245.114,121.246.173.153,90.890543,80.888273,1,19:12:45,19:12:49,1,1,0.999796,255,255,2,2,3,1,62,252,1607997394,3339783268,1,12,12,1262,3234,tcp,9257.889,23724.83984,FIN,3,3,105,270,5156.094532,144.096828,761,0,0.261904,0.153774,0.10813,http,0,1,1,23.004694,0,13608,53962,64901
4,5,80069,62.148.203.79,242.88.4.212,92.1978,0.001,3,14:04:20,14:04:23,1,2,0.460989,255,255,7,4,3,2,62,252,1024827879,3673219255,0,6,2,1012,86,tcp,14646.77,746.221741,CON,2,1,169,43,5109.490527,0.0,0,0,0.340783,0.174539,0.166244,-,0,4,0,15.184744,0,12481,49609,62846


In [None]:
# Divisor applied to the digits that follow the decimal point of a fractional
# timestamp.
# NOTE(review): 11574 looks like 10**9 / 86400, i.e. it would map a 9-digit
# fraction of a day onto seconds -- confirm against the dataset description.
_FRACTION_DIVISOR = 11574

# Fractional timestamps shorter than this many characters are right-padded
# with zeros before their digits are read.
_FRACTION_WIDTH = 11


def _timestamp_to_seconds(value):
    """Convert one raw timestamp cell into an integer.

    Three input shapes are recognised:

    * contains ``":"``  -- parsed as ``%H:%M:%S`` and returned as
      ``hours * 3600 + minutes * 60 + seconds``;
    * contains ``"."``  -- the text is zero-padded on the right to
      ``_FRACTION_WIDTH`` characters, the digits after the first ``"."`` are
      read as an integer and divided by ``_FRACTION_DIVISOR``;
    * anything else, including a missing cell (``None`` / NaN) -- ``0``.

    Args:
        value: the cell content; normally a string, but non-string values are
            converted with ``str()`` instead of raising ``TypeError``.

    Returns:
        int: the converted value, or ``0`` when the format is not recognised.

    Raises:
        ValueError: if a ``":"`` value is not a valid ``%H:%M:%S`` time, or the
            text after the ``"."`` is not an integer.
    """
    # A missing cell arrives as None or as a float NaN (NaN != NaN); the
    # original substring test would raise TypeError on it.
    if value is None or (isinstance(value, float) and value != value):
        return 0

    text = value if isinstance(value, str) else str(value)

    if ":" in text:
        parsed = datetime.datetime.strptime(text, '%H:%M:%S')
        return parsed.hour * 3600 + parsed.minute * 60 + parsed.second

    if "." in text:
        # ljust leaves strings that are already long enough untouched.
        padded = text.ljust(_FRACTION_WIDTH, "0")
        digits = int(padded.split(".")[1])
        return int(digits / _FRACTION_DIVISOR)

    return 0


def starttime_calc(row):
    """Return ``row['starttime']`` converted by ``_timestamp_to_seconds``.

    Args:
        row: mapping or pandas row exposing a ``'starttime'`` entry.

    Returns:
        int: converted start time, ``0`` for missing or unrecognised values.
    """
    return _timestamp_to_seconds(row['starttime'])


def lasttime_calc(row):
    """Return ``row['lasttime']`` converted by ``_timestamp_to_seconds``.

    Args:
        row: mapping or pandas row exposing a ``'lasttime'`` entry.

    Returns:
        int: converted last time, ``0`` for missing or unrecognised values.
    """
    return _timestamp_to_seconds(row['lasttime'])

def duration(row):
    """Return ``row['last_sec'] - row['start_sec']`` as an int.

    Both entries are truncated with ``int()`` before subtracting, so the
    result is negative whenever ``last_sec`` is smaller than ``start_sec``.
    """
    last_sec, start_sec = int(row['last_sec']), int(row['start_sec'])
    return int(last_sec - start_sec)

def n_length(x):
    """Right-pad a dotted string with zeros to at least 11 characters.

    Returns ``x`` padded with ``"0"`` up to 11 characters when it contains a
    ``"."`` (unchanged if it is already that long), and ``None`` when it
    contains no ``"."``.
    """
    if "." not in x:
        return None
    return x.ljust(11, "0")

In [None]:
# Row-wise time features for the training frame: converted start and last
# times, then their difference (duration reads the two columns added first).
data_X["start_sec"] = data_X.apply(starttime_calc, axis="columns")
data_X["last_sec"] = data_X.apply(lasttime_calc, axis="columns")
data_X["total_time"] = data_X.apply(duration, axis="columns")

# Same three columns, in the same order, for the test frame.
test["start_sec"] = test.apply(starttime_calc, axis="columns")
test["last_sec"] = test.apply(lasttime_calc, axis="columns")
test["total_time"] = test.apply(duration, axis="columns")

In [None]:
# Categorical columns that get one-hot encoded.
dummy_fields = ["trans_protocol", "dep_protocol", "trans_type"]
# Columns removed before modelling; start_sec / last_sec go too, so only the
# derived total_time column is kept from the time features.
fields_to_drop = ["packet_id", "seq_no", "src_bsn", "dest_bsn", "port_dest", "port_src",
                   "ip_add_src", "ip_add_dest", "starttime", "lasttime","start_sec", "last_sec"]

data_X = pd.get_dummies(data=data_X, columns=dummy_fields)
data_X = data_X.drop(columns=fields_to_drop)
print(f"The shape of Training data features is {data_X.shape}")


test = pd.get_dummies(data=test, columns=dummy_fields)
test = test.drop(columns=fields_to_drop)
print(f"The shape of test data features is {test.shape}")

# Walk the training columns in order rather than diffing two sets: set
# iteration order for strings changes between interpreter runs, which made
# the printed list (and the column insertion order) non-reproducible.
missing_values_test = [col for col in data_X.columns if col not in test.columns]
print(f"missing values in the test set {missing_values_test}" )

# Dummy columns that never occur in the test data are added as all-zero.
for missing in missing_values_test:
    test[missing] = 0
# Reorder to the training layout. Any dummy column that exists only in the
# test data is dropped here without a message.
test = test[data_X.columns]
print(f"The shape of test data features after importing missing values is {test.shape}")

The shape of Training data features is (80000, 61)
The shape of test data features is (20000, 59)
missing values in the test set ['dep_protocol_CLO', 'dep_protocol_ACC']
The shape of test data features after importing missing values is (20000, 61)


In [None]:
# Names of the feature columns that still contain NaN.
# NOTE: this expression is neither assigned nor the last line of the cell, so
# its result is never displayed.
data_X.columns[data_X.isna().any()].tolist()
# Only the shape below is shown as the cell output.
data_X.shape


(80000, 61)

In [None]:
# Hold out 25% of the labelled rows, stratified on the target, with a fixed
# seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data_X,
    target,
    test_size=0.25,
    stratify=target,
    random_state=42,
)

In [None]:
# baseline model
def create_baseline():
    """Build and compile the baseline network.

    Layout: 61 inputs -> Dense(61, relu) -> Dense(1, sigmoid), compiled with
    binary cross-entropy loss, the Adam optimizer and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential([
        Dense(61, input_dim=61, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

# Wrap the Keras builder so it can be used as a scikit-learn estimator.
# No scaling is applied to the inputs at this stage.
estimator = KerasClassifier(model=create_baseline, epochs=100, batch_size=5, verbose=0)
# Seed the shuffle so the five stratified folds are identical on every run;
# without random_state the fold membership changes each time the cell runs.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Cross-validate the unscaled baseline estimator on the training split, using
# the folds held in `kfold`; the single-column target is flattened to 1-D.
results = cross_val_score(
    estimator,
    X_train,
    y_train.to_numpy().ravel(),
    cv=kfold,
)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 68.19% (0.21%)


In [None]:
# evaluate baseline model with standardized dataset
# The scaler is a pipeline step, so it is refit together with the network on
# the training part of every cross-validation split.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(model=create_baseline, epochs=100, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
# Fixed seed: the shuffled folds are identical on every run of this cell.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = cross_val_score(pipeline, X_train, y_train.values.flatten(), cv=kfold, n_jobs=-1)
print("Standardized: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Standardized: 84.67% (0.32%)


In [None]:
def create_smaller(input_dim=61):
    """Build and compile a network with a single 30-unit hidden layer.

    Layout: ``input_dim`` inputs -> Dense(30, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric.

    Args:
        input_dim: number of input features. Defaults to 61, the width of the
            prepared feature frame; the previous hard-coded 60 did not match
            it, so fitting on the 61-column data failed with a shape error.

    Returns:
        The compiled ``Sequential`` model.
    """
    # create model
    model = Sequential()
    model.add(Dense(30, input_dim=input_dim, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# Standardise inside the pipeline so the scaler is refit on the training part
# of every cross-validation split.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(model=create_smaller, epochs=100, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
# Fixed seed: the shuffled folds are identical on every run of this cell.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = cross_val_score(pipeline, X_train, y_train.values.flatten(), cv=kfold, n_jobs=-1)
print("Smaller: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
# larger model
def create_larger():
    """Build and compile a network with two hidden layers (60 then 30 units).

    Layout: 61 inputs -> Dense(60, relu) -> Dense(30, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential([
        Dense(60, input_dim=61, activation='relu'),
        Dense(30, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network
# Standardise inside the pipeline so the scaler is refit on the training part
# of every cross-validation split.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(model=create_larger, epochs=100, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
# Fixed seed: the shuffled folds are identical on every run of this cell.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = cross_val_score(pipeline, X_train, y_train.values.flatten(), cv=kfold, n_jobs=-1)
print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
# deeper but narrower variant: two hidden layers of 30 and 10 units
def create_larger(input_dim=61):
    """Build and compile a network with two hidden layers (30 then 10 units).

    Layout: ``input_dim`` inputs -> Dense(30, relu) -> Dense(10, relu) ->
    Dense(1, sigmoid), compiled with binary cross-entropy loss, the Adam
    optimizer and an accuracy metric.

    Args:
        input_dim: number of input features. Defaults to 61, the width of the
            prepared feature frame; the previous hard-coded 60 did not match
            it, so fitting on the 61-column data failed with a shape error.

    Returns:
        The compiled ``Sequential`` model.
    """
    # create model
    model = Sequential()
    model.add(Dense(30, input_dim=input_dim, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# Standardise inside the pipeline so the scaler is refit on the training part
# of every cross-validation split.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(model=create_larger, epochs=100, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
# Fixed seed: the shuffled folds are identical on every run of this cell.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = cross_val_score(pipeline, X_train, y_train.values.flatten(), cv=kfold, n_jobs=-1)
print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
# larger model
def create_larger(input_dim=61):
    """Build and compile a network with two hidden layers (60 then 30 units).

    Layout: ``input_dim`` inputs -> Dense(60, relu) -> Dense(30, relu) ->
    Dense(1, sigmoid), compiled with binary cross-entropy loss, the Adam
    optimizer and an accuracy metric.

    Args:
        input_dim: number of input features. Defaults to 61, the width of the
            prepared feature frame; the previous hard-coded 60 did not match
            it, so fitting on the 61-column data failed with a shape error.

    Returns:
        The compiled ``Sequential`` model.
    """
    # create model
    model = Sequential()
    model.add(Dense(60, input_dim=input_dim, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Learn the scaling statistics from the full labelled feature frame, then apply
# that same fitted scaler to both the training and the test features.
std = StandardScaler().fit(data_X)
data_X_1 = std.transform(data_X)
test_1 = std.transform(test)

# Train the final classifier on all scaled rows; the single-column target is
# flattened to 1-D before fitting.
clf = KerasClassifier(model=create_larger, epochs=100, batch_size=5, verbose=0)
clf.fit(data_X_1, target.to_numpy().ravel())

In [None]:
# Re-read the raw test file for packet_id, which was dropped from `test`
# during feature preparation.
test1 = pd.read_csv("test.csv")
# `pred` is not assigned in any visible cell, so this cell raised NameError on
# a fresh kernel. Predict on the scaled test features with the fitted model,
# flattened to 1-D so it can be used as a single column.
pred = np.asarray(clf.predict(test_1)).ravel()
res = pd.DataFrame({"packet_id": test1["packet_id"], "is_nw_intruded": pred})

In [None]:
# Show the packet_id / is_nw_intruded frame as the cell output.
res

In [None]:
# index=False keeps the file to the two named columns; by default to_csv also
# writes the positional index as an extra, unnamed first column.
# NOTE(review): assumes the consumer expects exactly packet_id,is_nw_intruded -- confirm.
res.to_csv("result.csv", index=False)