## INTRUSION DETECTION SYSTEM USING KDD DATASET

In [44]:
import numpy as np
import pandas as pd
# Let pandas render up to 50 columns before it elides the middle ones.
pd.set_option('display.max_columns', 50)

In [7]:
# Column labels passed to read_csv via `names=`, in file order.
# The final entry, "attack", is the label column.
datacols = """
    duration protocol_type service flag src_bytes dst_bytes land
    wrong_fragment urgent hot num_failed_logins logged_in num_compromised
    root_shell su_attempted num_root num_file_creations num_shells
    num_access_files num_outbound_cmds is_host_login is_guest_login
    count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate
    same_srv_rate diff_srv_rate srv_diff_host_rate
    dst_host_count dst_host_srv_count dst_host_same_srv_rate
    dst_host_diff_srv_rate dst_host_same_src_port_rate
    dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate
    attack
""".split()

In [14]:
# Read only the first 100000 rows of the file; column labels come from `datacols`.
data = pd.read_csv(
    './kddcup.data.corrected',
    names=datacols,
    nrows=100000,
)

###  <font color = 'red'> NOTE: The actual data file contains about 48 lakh (4.8 million) data points, but using that much data would make computation and processing very slow. Hence, we load only the first 1 lakh (100,000) rows.</font>

In [15]:
# Preview the first rows to check that the supplied column names line up with the values.
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [16]:
# Per-column dtype and non-null count for the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 42 columns):
duration                       100000 non-null int64
protocol_type                  100000 non-null object
service                        100000 non-null object
flag                           100000 non-null object
src_bytes                      100000 non-null int64
dst_bytes                      100000 non-null int64
land                           100000 non-null int64
wrong_fragment                 100000 non-null int64
urgent                         100000 non-null int64
hot                            100000 non-null int64
num_failed_logins              100000 non-null int64
logged_in                      100000 non-null int64
num_compromised                100000 non-null int64
root_shell                     100000 non-null int64
su_attempted                   100000 non-null int64
num_root                       100000 non-null int64
num_file_creations             100000 n

In [29]:
# How many records carry each raw attack label in the loaded rows.
data['attack'].value_counts()

normal.             77888
smurf.              22093
neptune.               15
buffer_overflow.        2
loadmodule.             1
perl.                   1
Name: attack, dtype: int64

#### Map attack field to attack class
Each connection record in the KDD dataset has 42 attributes, including the class label that holds the attack type. The attack types are grouped into four attack classes, as described by Mahbod Tavallaee et al. in <i>A Detailed Analysis of the KDD CUP 99 Data Set</i>:
<br>
<p>
    <b>Denial of Service (DoS)</b>: an attack in which an adversary directs a deluge of traffic requests at a system in order to make its computing or memory resources too busy or too full to handle legitimate requests, thereby denying legitimate users access to the machine.
</p>
<br>
<p>
    <b>Probing Attack (Probe)</b>: probing a network of computers to gather information that can be used to compromise its security controls.
</p>
<br>
<p>
    <b>User to Root Attack (U2R)</b>: a class of exploit in which the adversary starts out with access to a normal user account on the system (gained either by sniffing passwords, a dictionary attack, or social engineering) and is able to exploit some vulnerability to gain root access to the system.
</p>
<br>
<p>
    <b>Remote to Local Attack (R2L)</b>: occurs when an attacker who has the ability to send packets to a machine over a network but who does not have an account on that machine exploits some vulnerability to gain local access as a user of that machine.
</p>


In [33]:
# Raw attack labels grouped under the attack class they are assigned to.
# Every label keeps its trailing '.', matching the values in the 'attack' column.
_labels_by_class = {
    'Probe': ['ipsweep.', 'satan.', 'nmap.', 'portsweep.', 'saint.', 'mscan.'],
    'DoS': [
        'teardrop.', 'pod.', 'land.', 'back.', 'neptune.', 'smurf.', 'mailbomb.',
        'udpstorm.', 'apache2.', 'processtable.',
    ],
    'U2R': [
        'perl.', 'loadmodule.', 'rootkit.', 'buffer_overflow.', 'xterm.', 'ps.',
        'sqlattack.', 'httptunnel.',
    ],
    'R2L': [
        'ftp_write.', 'phf.', 'guess_passwd.', 'warezmaster.', 'warezclient.', 'imap.',
        'spy.', 'multihop.', 'named.', 'snmpguess.', 'worm.', 'snmpgetattack.',
        'xsnoop.', 'xlock.', 'sendmail.',
    ],
    'Normal': ['normal.'],
}

# Invert the grouping into the label -> class lookup used to relabel records.
mapping = {
    label: class_name
    for class_name, labels in _labels_by_class.items()
    for label in labels
}

In [34]:
# Relabel every record with its attack class, then drop the raw attack label.
# The membership guard keeps this cell re-runnable: once 'attack' has been
# dropped, a second run just shows the preview instead of raising KeyError.
if 'attack' in data.columns:
    attack_class = data['attack'].map(mapping)  # vectorised dict lookup
    # Series.map returns NaN for labels absent from `mapping`; raise KeyError
    # (as the direct dict lookup did) rather than letting NaN labels through.
    unmapped = data.loc[attack_class.isna(), 'attack'].unique()
    if len(unmapped) > 0:
        raise KeyError('attack labels missing from mapping: {}'.format(list(unmapped)))
    data['attack_class'] = attack_class
    data = data.drop(columns='attack')
data.head(3)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_class
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Normal
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,Normal


### <font color='green'>Splitting the data into Train and Test </font>

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
# Split the label off from the features: pop() returns the 'attack_class'
# column and removes it from `data` in the same step.
Y = data.pop('attack_class')

In [38]:
# Peek at the label series that was separated from the feature frame.
Y.head()

0    Normal
1    Normal
2    Normal
3    Normal
4    Normal
Name: attack_class, dtype: object

In [39]:
# 70% of the rows go to training; the fixed random_state makes the split repeatable.
# NOTE(review): no stratify= is passed, so very rare classes may land unevenly
# across the two splits — consider stratify=Y.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    Y,
    train_size=0.7,
    random_state=42,
)

In [40]:
# Preview three training rows after the split.
X_train.head(3)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
76513,6,tcp,smtp,SF,1105,341,0,0,0,0,...,145,249,0.97,0.01,0.01,0.01,0.0,0.0,0.0,0.0
60406,0,tcp,http,SF,274,11084,0,0,0,0,...,221,255,1.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0
27322,0,tcp,http,SF,200,9018,0,0,0,0,...,139,255,1.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0


### <font color='purple'>EXPLORATORY DATA ANALYSIS </font>

In [45]:
# Summary statistics of the training features, used to spot constant columns.
X_train.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,3.103386,518.569729,3218.456114,0.0,0.0,1.4e-05,0.033914,1.4e-05,0.7577,0.007943,0.000357,0.000143,0.009143,0.002671,8.6e-05,0.000929,0.0,0.0,0.002671,118.531886,120.390829,0.001014,0.001033,0.000886,0.000886,0.997889,0.00388,0.097009,151.023529,241.106743,0.978498,0.004831,0.272949,0.017793,0.00109,0.000486,0.000893,0.000894
std,215.581583,750.617064,8009.153672,0.0,0.0,0.00378,0.781775,0.00378,0.428478,1.111992,0.018895,0.016903,1.278893,0.195468,0.01069,0.037974,0.0,0.0,0.051617,207.044473,206.129692,0.022592,0.022948,0.029748,0.029748,0.033027,0.060625,0.22683,106.105197,47.668954,0.110707,0.030598,0.414478,0.030709,0.019227,0.015593,0.029526,0.029621
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,222.0,245.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,34.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,293.0,853.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,12.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,179.0,255.0,1.0,0.0,0.03,0.01,0.0,0.0,0.0,0.0
75%,0.0,1032.0,2688.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0,36.0,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255.0,255.0,1.0,0.0,0.5,0.03,0.0,0.0,0.0,0.0
max,40504.0,49765.0,271733.0,0.0,0.0,1.0,30.0,1.0,1.0,217.0,1.0,2.0,247.0,29.0,2.0,4.0,0.0,0.0,1.0,511.0,511.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,255.0,255.0,1.0,1.0,1.0,1.0,1.0,0.94,1.0,1.0


### The summary above shows that `land`, `wrong_fragment`, `num_outbound_cmds` and `is_host_login` are zero for every training row, so these columns carry no information. We therefore remove them from both the train and the test data.

In [47]:
# Columns whose summary statistics are all zero in the training data.
zero_only_cols = ['land', 'wrong_fragment', 'num_outbound_cmds', 'is_host_login']

# Rebind to the frames returned by drop() instead of using inplace=True:
# in-place edits on the frames handed back by train_test_split emit
# SettingWithCopyWarning. `columns=` already names the axis, so axis=1 is omitted.
X_train = X_train.drop(columns=zero_only_cols)
X_test = X_test.drop(columns=zero_only_cols)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [50]:
# Confirm that both splits ended up with the same number of columns.
X_train.shape,X_test.shape

((70000, 37), (30000, 37))

In [68]:
# Check the return type of value_counts() before building the distribution table.
type(y_train.value_counts())

pandas.core.series.Series

In [72]:
# Attack Class Distribution
# One single-column frame of per-class record counts for each split.
attack_class_freq_train = pd.DataFrame(y_train.value_counts())
attack_class_freq_test = pd.DataFrame(y_test.value_counts())

# Add each class's share of its split as a percentage rounded to 2 decimals.
# The right-hand side is evaluated while the frame still has only the count column.
for freq, pct_col in (
    (attack_class_freq_train, 'frequency_percent_train'),
    (attack_class_freq_test, 'frequency_percent_test'),
):
    share = 100 * freq / freq.sum()
    freq[pct_col] = round(share, 2)

# Side-by-side table: train count/percent followed by test count/percent.
attack_class_dist = pd.concat(
    [attack_class_freq_train, attack_class_freq_test],
    axis=1,
    sort=False,
)
attack_class_dist

Unnamed: 0,attack_class,frequency_percent_train,attack_class.1,frequency_percent_test
Normal,54505,77.86,23383,77.94
DoS,15493,22.13,6615,22.05
U2R,2,0.0,2,0.01
