## Data Collection & Preprocessing

In [1]:
# Checking the packages
! pip install numpy
! pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Importing the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

In [3]:
# Loading the dataset

headers = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 
           'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 
           'num_failed_logins', 'logged_in', 'lnum_compromised', 'lroot_shell',
           'lsu_attempted', 'lnum_root', 'lnum_file_creations', 'lnum_shells',
           'lnum_access_files', 'lnum_outbound_cmds', 'is_host_login',
           'is_guest_login', 'count', 'srv_count', 'serror_rate',
           'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
           'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
           'dst_host_srv_count', 'dst_host_same_srv_rate', 
           'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
           'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
           'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 
           'dst_host_srv_rerror_rate', 'label']

df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data_10_percent.gz",
                  header=None, names=headers, na_values="?" )
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [4]:
df.shape

(494021, 42)

In [5]:
# Checking missing values
df.isnull().sum()

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
lnum_compromised               0
lroot_shell                    0
lsu_attempted                  0
lnum_root                      0
lnum_file_creations            0
lnum_shells                    0
lnum_access_files              0
lnum_outbound_cmds             0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494021 entries, 0 to 494020
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     494021 non-null  int64  
 1   protocol_type                494021 non-null  object 
 2   service                      494021 non-null  object 
 3   flag                         494021 non-null  object 
 4   src_bytes                    494021 non-null  int64  
 5   dst_bytes                    494021 non-null  int64  
 6   land                         494021 non-null  int64  
 7   wrong_fragment               494021 non-null  int64  
 8   urgent                       494021 non-null  int64  
 9   hot                          494021 non-null  int64  
 10  num_failed_logins            494021 non-null  int64  
 11  logged_in                    494021 non-null  int64  
 12  lnum_compromised             494021 non-null  int64  
 13 

In [7]:
df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,lnum_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,...,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0
mean,47.979302,3025.61,868.5324,4.5e-05,0.006433,1.4e-05,0.034519,0.000152,0.148247,0.010212,...,232.470778,188.66567,0.75378,0.030906,0.601935,0.006684,0.176754,0.176443,0.058118,0.057412
std,707.746472,988218.1,33040.0,0.006673,0.134805,0.00551,0.782103,0.01552,0.355345,1.798326,...,64.74538,106.040437,0.410781,0.109259,0.481309,0.042133,0.380593,0.380919,0.23059,0.23014
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,46.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,58329.0,693375600.0,5155468.0,1.0,3.0,3.0,30.0,5.0,1.0,884.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Preprocessing the dataset
from sklearn import preprocessing

# Min max scaling
num_features =  df.select_dtypes(include=np.number).columns.tolist()[:-1]
df[num_features] = preprocessing.MinMaxScaler().fit_transform(df[num_features])

# Label encoding 
LE = preprocessing.LabelEncoder()
df['label'] = LE.fit_transform(df['label'])
df['protocol_type'] = LE.fit_transform(df['protocol_type'])
df['service'] = LE.fit_transform(df['service'])
df['flag'] = LE.fit_transform(df['flag'])

In [9]:
# Train Test Split the dataset
from sklearn.model_selection import train_test_split

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.3, random_state=42) 

print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (345814, 41) (345814,)
Test set: (148207, 41) (148207,)


In [10]:
# PCA
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

input = [('scaler', StandardScaler()), ('pca', PCA())]
PIPE = Pipeline(input)
PIPE.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA())])

## Machine Learning Algorithms

In [11]:
## Decision tree
from sklearn.tree import DecisionTreeClassifier
from time import time

# Fitting into model
DT = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
start = time()
DT.fit(X_train, y_train)
yhat_dt = DT.predict(X_test)
print(f'Time taken to run: {time() - start} seconds')

print('Train score: %.2f' %  DT.score(X_train, y_train))
print('Test score: %.2f' % DT.score(X_test, y_test))

# Evaluating the model
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score
print('Accuracy: %.2f' % accuracy_score(yhat_dt, y_test))
print('Precision: %.2f' % precision_score(yhat_dt, y_test, average="weighted"))
print('Recall: %.2f' % recall_score(y_test, yhat_dt, average="weighted"))
print('F1-score: %.2f' % f1_score(y_test, yhat_dt, average='weighted'))

Time taken to run: 1.100856065750122 seconds
Train score: 0.99
Test score: 0.99
Accuracy: 0.99
Precision: 1.00
Recall: 0.99
F1-score: 0.98


In [12]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from time import time

RFC = RandomForestClassifier(n_estimators = 5, random_state=42)
start = time()
RFC.fit(X_train, y_train)
yhat_rfc = RFC.predict(X_test)
print(f'Time taken to run: {time() - start} seconds')

print('Train score: %.2f' % RFC.score(X_train, y_train))
print('Test score: %.2f' % RFC.score(X_test, y_test))

# Evaluating the model
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score
print('Accuracy: %.2f' % accuracy_score(yhat_rfc, y_test))
print('Precision: %.2f' % precision_score(yhat_rfc, y_test, average="weighted"))
print('Recall: %.2f' % recall_score(y_test, yhat_rfc, average="weighted"))
print('F1-score: %.2f' % f1_score(y_test, yhat_rfc, average='weighted'))

Time taken to run: 1.7626044750213623 seconds
Train score: 1.00
Test score: 1.00
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-score: 1.00


In [13]:
## SVM
from sklearn.svm import SVC
from time import time

# Fitting into model
SVM = SVC(kernel = 'rbf')
start = time()
SVM.fit(X_train, y_train) 
yhat_svm = SVM.predict(X_test)
print(f'Time taken to run: {time() - start} seconds')

# Evaluating the model
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score
print('Accuracy: %.2f' % accuracy_score(yhat_svm, y_test))
print('Precision: %.2f' % precision_score(yhat_svm, y_test, average="weighted"))
print('Recall: %.2f' % recall_score(y_test, yhat_svm, average="weighted"))
print('F1-score: %.2f' % f1_score(y_test, yhat_svm, average='weighted'))

Time taken to run: 475.83001947402954 seconds
Accuracy: 0.99
Precision: 1.00
Recall: 0.99
F1-score: 0.99


In [14]:
## Gaussian Naive bayes
from sklearn.naive_bayes import GaussianNB
from time import time

NB = GaussianNB()
start = time()
NB.fit(X_train, y_train)
yhat_nb = NB.predict(X_test)
print(f'Time taken to run: {time() - start} seconds')

# Evaluating the model
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score
print('Accuracy: %.2f' % accuracy_score(yhat_nb, y_test))
print('Precision: %.2f' % precision_score(yhat_nb, y_test, average="weighted"))
print('Recall: %.2f' % recall_score(y_test, yhat_nb, average="weighted"))
print('F1-score: %.2f' % f1_score(yhat_nb, y_test, average='weighted'))

Time taken to run: 1.4099063873291016 seconds
Accuracy: 0.90
Precision: 0.92
Recall: 0.90
F1-score: 0.87


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## KNN
from sklearn.neighbors import KNeighborsClassifier 
from time import time

# Building model with k=7
k = 7
NEIGH = KNeighborsClassifier(n_neighbors = k)
start = time()
NEIGH.fit(X_train,y_train)
yhat_knn = NEIGH.predict(X_test)
print(f'Time taken to run: {time() - start} seconds')

print('Train score: %.2f' % NEIGH.score(X_train, y_train))
print('Test score: %.2f' % NEIGH.score(X_test, y_test))

# Evaluating the model
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score
print('Accuracy: %.2f' % accuracy_score(yhat_knn, y_test))
print('Precision: %.2f' % precision_score(yhat_knn, y_test, average="weighted"))
print('Recall: %.2f' % recall_score(y_test, yhat_knn, average="weighted"))
print('F1-score: %.2f' % f1_score(y_test, yhat_knn, average='weighted'))

Time taken to run: 1074.5991854667664 seconds
Train score: 1.00
Test score: 1.00
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-score: 1.00
