## 0.Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

###################################
from sklearn import preprocessing
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier


#####################################
sns.set_style('whitegrid')

font = {'weight' : 'bold',
        'size'   : 14}

%matplotlib inline
#####################################
# SET PATH
PATH = 'Dataset/NSL-KDD/'

## 1.Datasets

In [2]:
df = pd.read_csv(PATH+'combined_df.csv')
df_test = pd.read_csv(PATH+'combined_test_df.csv')
nor_df = pd.read_csv(PATH+'normal_df.csv')

**Train dataframe**

In [3]:
df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


**TEST Dataframe**

In [4]:
df_test.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2,12983,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0,15,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [5]:
nor_df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,REJ,RSTO,RSTOS0,RSTR,S0,S1,S2,S3,SF,SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,287,2251,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


**Features names**

In [6]:
[print() if i%10 == 0 else print(j, end = ', ') for i,j in enumerate(df.columns)]
print('\n','*-'*50)
[print() if i%10 == 0 else print(j, end = ', ') for i,j in enumerate(df_test.columns)]
''


src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, 
su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, 
serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, 
dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate, label, protocol_icmp, 
protocol_udp, service_IRC, service_X11, service_Z39_50, service_aol, service_auth, service_bgp, service_courier, service_csnet_ns, 
service_daytime, service_discard, service_domain, service_domain_u, service_echo, service_eco_i, service_ecr_i, service_efs, service_exec, 
service_ftp, service_ftp_data, service_gopher, service_harvest, service_hostnames, service_http, service_http_2784, service_http_443, service_

''

### 1.x Different dataset for different categories. 
<br>
This will aid to create classification models which are independent of each others. 

In [7]:
# 0=normal, 1=DoS, 2=Probe, 3=R2L and 4=U2R.
normal_df = df[df['label'] == 0]

dos_df = df[df['label'] == 1]
dos_df = pd.concat([dos_df,normal_df])

probe_df = df[df['label'] == 2]
probe_df = pd.concat([probe_df,normal_df])

r2l_df = df[df['label'] == 3]
r2l_df = pd.concat([r2l_df,normal_df])

u2r_df = df[df['label'] == 4]
u2r_df = pd.concat([u2r_df,normal_df])

# TEST DATASET

Normal_df_test = df_test[df_test['label'] == 0]

Dos_df_test = df_test[df_test['label'] == 1]
Dos_df_test = pd.concat([Normal_df_test,Dos_df_test])

Probe_df_test = df_test[df_test['label'] == 2]
Probe_df_test = pd.concat([Normal_df_test,Probe_df_test])

R2L_df_test = df_test[df_test['label'] == 3]
R2L_df_test = pd.concat([Normal_df_test,R2L_df_test])

U2R_df_test = df_test[df_test['label'] == 4]
U2R_df_test = pd.concat([Normal_df_test,U2R_df_test])

## 1.3 Save the dataset (updated)

In [8]:
'''
# Train dataset!

dos_df.to_csv(PATH+'dos_df_train.csv',index = False)
probe_df.to_csv(PATH+'probe_df_train.csv',index = False)
r2l_df.to_csv(PATH+'r2l_df_train.csv',index = False)
u2r_df.to_csv(PATH+'u2r_df_train.csv',index = False)

# Test Dataset!

Dos_df_test.to_csv(PATH+'dos_df_test.csv',index = False)
Probe_df_test.to_csv(PATH+'probe_df_test.csv',index = False)
R2L_df_test.to_csv(PATH+'r2l_df_test.csv',index = False)
U2R_df_test.to_csv(PATH+'u2r_df_test.csv',index = False)
'''
;


''

## 2. Preprocessing

In [9]:
print('Train:')
print('Dimensions of DoS:' ,dos_df.shape)
print('Dimensions of Probe:' ,probe_df.shape)
print('Dimensions of R2L:' ,r2l_df.shape)
print('Dimensions of U2R:' ,u2r_df.shape)
print('Test:')
print('Dimensions of DoS:' ,Dos_df_test.shape)
print('Dimensions of Probe:' ,Probe_df_test.shape)
print('Dimensions of R2L:' ,R2L_df_test.shape)
print('Dimensions of U2R:' ,U2R_df_test.shape)

Train:
Dimensions of DoS: (113270, 123)
Dimensions of Probe: (78999, 123)
Dimensions of R2L: (68338, 123)
Dimensions of U2R: (67395, 123)
Test:
Dimensions of DoS: (17171, 123)
Dimensions of Probe: (12132, 123)
Dimensions of R2L: (12596, 123)
Dimensions of U2R: (9778, 123)


In [10]:
X_DoS = dos_df.drop('label',1)
Y_DoS = dos_df.label
X_Probe = probe_df.drop('label',1)
Y_Probe = probe_df.label
X_R2L = r2l_df.drop('label',1)
Y_R2L = r2l_df.label
X_U2R = u2r_df.drop('label',1)
Y_U2R = u2r_df.label

# test set
X_DoS_test = Dos_df_test.drop('label',1)
Y_DoS_test = Dos_df_test.label
X_Probe_test = Probe_df_test.drop('label',1)
Y_Probe_test = Probe_df_test.label
X_R2L_test = R2L_df_test.drop('label',1)
Y_R2L_test = R2L_df_test.label
X_U2R_test = U2R_df_test.drop('label',1)
Y_U2R_test = U2R_df_test.label


In [11]:
Normal_df_test.shape

(9711, 123)

In [12]:
X_DoS.shape

(113270, 122)

### 2.1 Scaler

In [13]:
scaler1 = preprocessing.StandardScaler().fit(X_DoS)
X_DoS=scaler1.transform(X_DoS) 
scaler2 = preprocessing.StandardScaler().fit(X_Probe)
X_Probe=scaler2.transform(X_Probe) 
scaler3 = preprocessing.StandardScaler().fit(X_R2L)
X_R2L=scaler3.transform(X_R2L) 
scaler4 = preprocessing.StandardScaler().fit(X_U2R)
X_U2R=scaler4.transform(X_U2R) 

  return self.partial_fit(X, y)
  
  return self.partial_fit(X, y)
  after removing the cwd from sys.path.
  return self.partial_fit(X, y)
  
  return self.partial_fit(X, y)
  


In [14]:
print(X_DoS.std(axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1.
 1. 1.]


# 3.Univariate feature selection

In [15]:
col_names  = df.columns

In [16]:
probe_df.columns[56]

'service_eco_i'

In [17]:
df.columns[56]

'service_eco_i'

In [18]:
'label' in col_names

True

In [19]:
np.seterr(divide='ignore', invalid='ignore');
selector = SelectPercentile(f_classif, percentile=10)

In [20]:
def get_univariate_features(X, y, col = col_names):
    x_ids = selector.fit_transform(X,y)
    true = selector.get_support()
    newcolindex_=[i for i, x in enumerate(true) if x]
    newcolname_=list(col[i] for i in newcolindex_)
    return newcolname_,x_ids

>**DOS**

In [21]:
Dos_uf_col_train, Dos_uf_train = get_univariate_features(X_DoS, Y_DoS)
Dos_uf_col_train



['logged_in',
 'count',
 'serror_rate',
 'srv_serror_rate',
 'same_srv_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'service_hostnames',
 'flag_RSTR',
 'flag_S3']

> **Probe**

In [22]:
Probe_uf_col_train, Probe_uf_train = get_univariate_features(X_Probe, Y_Probe)
Probe_uf_col_train



['logged_in',
 'rerror_rate',
 'srv_rerror_rate',
 'dst_host_srv_count',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate',
 'label',
 'service_echo',
 'service_printer',
 'flag_S3']

> **R2L**

In [23]:
R2L_uf_col_train, R2L_uf_train = get_univariate_features(X_R2L, Y_R2L)
R2L_uf_col_train

  68  70  71  72  73  74  76  77  78  79  80  81  82  83  86  87  89  92
  93  96  98  99 100 107 108 109 110 114] are constant.


['src_bytes',
 'dst_bytes',
 'hot',
 'num_failed_logins',
 'is_guest_login',
 'dst_host_srv_count',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'service_finger',
 'service_ftp',
 'service_hostnames',
 'service_http_8001',
 'flag_REJ']

> **U2R**

In [24]:
U2R_uf_col_train, U2R_uf_train = get_univariate_features(X_U2R, Y_U2R)
U2R_uf_col_train

  68  70  71  72  73  74  75  76  77  78  79  80  81  82  83  86  87  89
  92  93  96  98  99 100 107 108 109 110 114] are constant.


['urgent',
 'hot',
 'root_shell',
 'num_file_creations',
 'num_shells',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'service_ftp',
 'service_hostnames',
 'service_systat']

## Results from UFS

In [25]:
print('Features :: DoS ')
print('-----'*10)
print('\n'.join(Dos_uf_col_train),'\n')
print('=*=*='*10,'\n')
      
print('Features :: Probe ')
print('-----'*10)
print('\n'.join(Probe_uf_col_train),'\n')
print('=*=*='*10,'\n')
      
print('Features :: R2L')
print('-----'*10)
print('\n'.join(R2L_uf_col_train),'\n')
print('=*=*='*10,'\n')
      
print('Features :: U2R')
print('-----'*10)
print('\n'.join(U2R_uf_col_train),'\n')
print('=*=*='*10,'\n')

Features :: DoS 
--------------------------------------------------
logged_in
count
serror_rate
srv_serror_rate
same_srv_rate
dst_host_count
dst_host_srv_count
dst_host_same_srv_rate
dst_host_serror_rate
dst_host_srv_serror_rate
service_hostnames
flag_RSTR
flag_S3 

=*=*==*=*==*=*==*=*==*=*==*=*==*=*==*=*==*=*==*=*= 

Features :: Probe 
--------------------------------------------------
logged_in
rerror_rate
srv_rerror_rate
dst_host_srv_count
dst_host_diff_srv_rate
dst_host_same_src_port_rate
dst_host_srv_diff_host_rate
dst_host_rerror_rate
dst_host_srv_rerror_rate
label
service_echo
service_printer
flag_S3 

=*=*==*=*==*=*==*=*==*=*==*=*==*=*==*=*==*=*==*=*= 

Features :: R2L
--------------------------------------------------
src_bytes
dst_bytes
hot
num_failed_logins
is_guest_login
dst_host_srv_count
dst_host_same_src_port_rate
dst_host_srv_diff_host_rate
service_finger
service_ftp
service_hostnames
service_http_8001
flag_REJ 

=*=*==*=*==*=*==*=*==*=*==*=*==*=*==*=*==*=*==*=*= 

Feat

## 4. Recursive Feature elimination

 Internet helped here!!
 https://machinelearningmastery.com/feature-selection-machine-learning-python/

In [26]:
clf = DecisionTreeClassifier(random_state=0)

#rank all features
rfe = RFE(clf, n_features_to_select=1)


## 4.1 Ranking of features Obtained from UFS

In [27]:
rfe.fit(Dos_uf_train, Y_DoS)
print ("DoS Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), Dos_uf_col_train)))

DoS Features sorted by their rank:
[(1, 'same_srv_rate'), (2, 'count'), (3, 'flag_S3'), (4, 'dst_host_serror_rate'), (5, 'dst_host_same_srv_rate'), (6, 'dst_host_srv_count'), (7, 'dst_host_count'), (8, 'logged_in'), (9, 'serror_rate'), (10, 'dst_host_srv_serror_rate'), (11, 'srv_serror_rate'), (12, 'service_hostnames'), (13, 'flag_RSTR')]


In [28]:
rfe.fit(Probe_uf_train, Y_Probe)
print ("Probe Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), Probe_uf_col_train)))

Probe Features sorted by their rank:
[(1, 'dst_host_same_src_port_rate'), (2, 'dst_host_srv_count'), (3, 'dst_host_rerror_rate'), (4, 'service_printer'), (5, 'logged_in'), (6, 'dst_host_diff_srv_rate'), (7, 'dst_host_srv_diff_host_rate'), (8, 'flag_S3'), (9, 'service_echo'), (10, 'rerror_rate'), (11, 'label'), (12, 'dst_host_srv_rerror_rate'), (13, 'srv_rerror_rate')]


In [29]:
rfe.fit(R2L_uf_train, Y_R2L)
print ("R2L Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), R2L_uf_col_train)))

R2L Features sorted by their rank:
[(1, 'src_bytes'), (2, 'dst_bytes'), (3, 'hot'), (4, 'dst_host_srv_diff_host_rate'), (5, 'service_ftp'), (6, 'dst_host_same_src_port_rate'), (7, 'dst_host_srv_count'), (8, 'num_failed_logins'), (9, 'service_http_8001'), (10, 'is_guest_login'), (11, 'service_finger'), (12, 'flag_REJ'), (13, 'service_hostnames')]


In [30]:
rfe.fit(U2R_uf_train, Y_U2R)
print ("U2R Features sorted by their rank:")
print (sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), U2R_uf_col_train)))

U2R Features sorted by their rank:
[(1, 'hot'), (2, 'dst_host_srv_count'), (3, 'dst_host_count'), (4, 'root_shell'), (5, 'num_shells'), (6, 'service_ftp'), (7, 'dst_host_srv_diff_host_rate'), (8, 'num_file_creations'), (9, 'dst_host_same_src_port_rate'), (10, 'service_systat'), (11, 'srv_diff_host_rate'), (12, 'service_hostnames'), (13, 'urgent')]


## 4.2 From complete dataset

In [31]:
X_DoS.std(axis = 0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 1.])

In [37]:
# Use some other classifier.
clf = DecisionTreeClassifier(random_state=0)
rfe = RFE(estimator=clf, n_features_to_select=13, step=1)

def get_rfe(rfe,X,y, col = col_names):
    rfe.fit(X, y)
    rfe_idx = rfe.transform(X)
    true = rfe.support_
    rfecolindex_ = [i for i, x in enumerate(true) if x]
    rfecolname_ = list(col[i] for i in rfecolindex_)
    return rfe_idx, rfecolname_


    

### RFE for DOS taking all the 122 features.

In [40]:
%%time 
RFE_dos, RFE_dos_col = get_rfe(rfe, X_DoS, Y_DoS)


Wall time: 2min 7s


In [41]:
RFE_dos_col

['src_bytes',
 'dst_bytes',
 'wrong_fragment',
 'num_compromised',
 'same_srv_rate',
 'diff_srv_rate',
 'dst_host_count',
 'dst_host_same_srv_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'service_eco_i',
 'flag_RSTOS0',
 'flag_RSTR']

**In the sameway anyone run for all the other dataset.**
<br>
Just call the function **get_rfe** for the other types of attack and summarize the results.
use the updated dataset. Columns returned from UFS, RFE or some other method which we **MAY** develop if we develop could be used with the datasets saved. 