Importing Libraries

In [1]:

import pandas as pd
import urllib
import csv

Reading the Dataset

In [2]:
# Load the dataset from the CSV file in the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="main_data.csv")

# Print the first five rows as plain text to check the column layout.
print(data.head(n=5))

   Duration  src_bytes  dst_bytes  logged_in  Count    Class
0         0        181       5450          1      8  normal.
1         0        239        486          1      8  normal.
2         0        235       1337          1      8  normal.
3         0        219       1337          1      6  normal.
4         0        217       2032          1      6  normal.


### Total number of distinct output classes

In [3]:
# Collect the distinct labels that occur in the "Class" column.
l = data['Class'].unique()
# One print call: the message and the label count are joined by "=>".
print("The total number of output classes possible: ", len(l), sep="=>")

The total number of output classes possible: =>40


In [4]:
# List every distinct class label collected in `l`.
print(
    "The classes are: ",
    l,
)

The classes are:  ['normal.' 'buffer_overflow.' 'loadmodule.' 'perl.' 'neptune.' 'smurf.'
 'guess_passwd.' 'pod.' 'teardrop.' 'portsweep.' 'ipsweep.' 'land.'
 'ftp_write.' 'back.' 'imap.' 'satan.' 'phf.' 'nmap.' 'multihop.'
 'warezmaster.' 'warezclient.' 'spy.' 'rootkit.' 'snmpgetattack.' 'named.'
 'xlock.' 'xsnoop.' 'sendmail.' 'saint.' 'apache2.' 'udpstorm.' 'xterm.'
 'mscan.' 'processtable.' 'ps.' 'httptunnel.' 'worm.' 'mailbomb.'
 'sqlattack.' 'snmpguess.']


Dependent and Independent columns

In [5]:
# Columns used as model inputs (independent variables).
feature_columns = ["Duration", "src_bytes", "dst_bytes", "logged_in", "Count"]
X = data[feature_columns]
# Column holding the label to predict (dependent variable).
Y = data["Class"]

Splitting the dataset

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=20,
)

Scaling the independent variables

In [7]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test data does not influence them.
SS = StandardScaler()
SS.fit(X_train)
X_train_sc = SS.transform(X_train)
X_test_sc = SS.transform(X_test)

## Training the model using the Naive Bayes classifier (Conditional Probability)

In [8]:
from sklearn.naive_bayes import GaussianNB
# Instantiate a Gaussian Naive Bayes classifier with its default settings
GNB = GaussianNB()
# Fit the model on the scaled training features and the training labels.
# This is the last expression of the cell, so its return value is displayed.
GNB.fit(X_train_sc, Y_train)

In [9]:
# Predict a class label for every row of the scaled test features.
Y_pred_gnb=GNB.predict(X_test_sc)
# Bare expression: shows the prediction array as the cell output.
Y_pred_gnb

array(['warezclient.', 'smurf.', 'smurf.', ..., 'neptune.', 'smurf.',
       'neptune.'], dtype='<U16')

In [10]:
# Show the true test labels for comparison with the Naive Bayes predictions.
Y_test

451879     normal.
563239      smurf.
155016      smurf.
350585    neptune.
414137      smurf.
            ...   
285643      smurf.
784407     normal.
351729    neptune.
548782      smurf.
110054    neptune.
Name: Class, Length: 241515, dtype: object

Classification Metrics

In [11]:
from sklearn.metrics import classification_report,accuracy_score

In [12]:
# Fraction of test rows whose predicted label equals the true label.
acc_gnb = accuracy_score(y_true=Y_test, y_pred=Y_pred_gnb)

In [13]:
# Report the Naive Bayes accuracy on the test split.
print(
    "The accuracy score for Naive Bayes Classifier is ",
    acc_gnb,
)

The accuracy score for Naive Bayes Classifier is  0.8774775893836821


## Training the model using the Decision Tree Classifier

In [14]:
from sklearn.tree import DecisionTreeClassifier

# Instantiate the model. Tree construction involves randomness, so a fixed
# random_state is set to make the fitted tree and its accuracy repeatable
# from run to run.
DTC = DecisionTreeClassifier(random_state=20)
# Fit the model on the scaled training features and the training labels.
# This is the last expression of the cell, so its return value is displayed.
DTC.fit(X_train_sc, Y_train)

In [15]:
# Predict a class label for every row of the scaled test features.
Y_pred_dtc=DTC.predict(X_test_sc)

In [16]:
# Unscaled test features as a NumPy array, so rows can be indexed by position.
X_test_arr = X_test.to_numpy()

### Number of "normal" outputs among the first 1 lakh (100,000) inputs

In [17]:
# Count how many of the first 100,000 rows are labelled 'normal.'.
# The comparison is vectorised and positional (.iloc), so it does not depend on
# the index labels and does not fail if the frame has fewer rows than that.
# The result is stored under its own name so the built-in sum() is not
# overwritten for the rest of the notebook.
normal_count = int((data['Class'].iloc[:100000] == 'normal.').sum())
print(normal_count)

56237


### Outputs for the first 100 inputs...

In [18]:
# Header row printed above the sample predictions: the feature column names.
a = [
    'Duration',
    'src_bytes',
    'dst_bytes',
    'logged_in',
    'Count',
]

In [19]:
# Header first, then one line per test row: unscaled features "=> " predicted label.
print(a)
for idx in range(100):
    print(X_test_arr[idx], Y_pred_dtc[idx], sep="=> ")

['Duration', 'src_bytes', 'dst_bytes', 'logged_in', 'Count']
[   0 1376    0    1    1]=> normal.
[   0 1032    0    0  511]=> smurf.
[   0 1032    0    0  507]=> smurf.
[ 0  0  0  0 77]=> neptune.
[  0 520   0   0 468]=> smurf.
[2634  147  105    0    1]=> normal.
[   0 1032    0    0  511]=> smurf.
[   0 1032    0    0  510]=> smurf.
[  0 520   0   0 511]=> smurf.
[  0   0   0   0 246]=> neptune.
[  0 311 810   0   1]=> normal.
[   0 1032    0    0  511]=> smurf.
[  0 520   0   0 510]=> smurf.
[  0 105 146   0   1]=> normal.
[   0 1032    0    0  511]=> smurf.
[   0  211 3642    1    9]=> normal.
[   0 1032    0    0  511]=> smurf.
[   0 1032    0    0  511]=> smurf.
[  0   0   0   0 282]=> neptune.
[   0 1032    0    0  511]=> smurf.
[    0   252 22727     0     1]=> normal.
[   0 1032    0    0  509]=> smurf.
[  0   0   0   0 265]=> neptune.
[   0 1032    0    0  511]=> smurf.
[   0 1032    0    0  511]=> smurf.
[  0 273 566   0   7]=> normal.
[ 0 46  0  0  4]=> snmpguess.
[   0 10

In [20]:
# Show the true test labels for comparison with the Decision Tree predictions.
Y_test

451879     normal.
563239      smurf.
155016      smurf.
350585    neptune.
414137      smurf.
            ...   
285643      smurf.
784407     normal.
351729    neptune.
548782      smurf.
110054    neptune.
Name: Class, Length: 241515, dtype: object

Classification Metrics

In [21]:
# Fraction of test rows whose predicted label equals the true label.
acc_dtc = accuracy_score(y_true=Y_test, y_pred=Y_pred_dtc)

In [22]:
# Report the Decision Tree accuracy on the test split.
print(
    "The accuracy score for Decision Tree Classifier is ",
    acc_dtc,
)

The accuracy score for Decision Tree Classifier is  0.9857193135001967


## Training the model using the Random Forest Classifier (Ensemble of classifiers (Bagging))

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate the forest. Bootstrap sampling and feature selection are random,
# so a fixed random_state is set to make the fitted forest and its accuracy
# repeatable from run to run.
rfc = RandomForestClassifier(random_state=20)

In [24]:
# Fit the forest on the scaled training features and the training labels.
# As the only expression in the cell, its return value is displayed.
rfc.fit(X_train_sc,Y_train)

In [25]:
# Predict a class label for every row of the scaled test features.
Y_pred_rfc=rfc.predict(X_test_sc)

In [27]:
# Bare expression: shows the Random Forest prediction array as the cell output.
Y_pred_rfc

array(['normal.', 'smurf.', 'smurf.', ..., 'neptune.', 'smurf.',
       'neptune.'], dtype=object)

In [28]:
# Show the true test labels for comparison with the Random Forest predictions.
Y_test

451879     normal.
563239      smurf.
155016      smurf.
350585    neptune.
414137      smurf.
            ...   
285643      smurf.
784407     normal.
351729    neptune.
548782      smurf.
110054    neptune.
Name: Class, Length: 241515, dtype: object

In [30]:
# accuracy_score takes (y_true, y_pred): pass the true labels first so the
# call matches the documented signature. The accuracy value itself does not
# depend on the order, but other metrics would.
acc_rfc = accuracy_score(Y_test, Y_pred_rfc)
print("The accuracy score for Random Forest Classifier is")
print(acc_rfc)

The accuracy score for Random Forest Classifier is
0.9859511831563257
