In [1]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
np.random.seed(42)
from sklearn.metrics import accuracy_score, precision_score, recall_score


In [2]:
# Dictionary that contains mapping of various attacks to the four main categories
attack_dict = {
    'normal': 'normal',
   
    'back': 'DoS',
    'land': 'DoS',
    'neptune': 'DoS',
    'pod': 'DoS',
    'smurf': 'DoS',
    'teardrop': 'DoS',
    'mailbomb': 'DoS',
    'apache2': 'DoS',
    'processtable': 'DoS',
    'udpstorm': 'DoS',
    
    'ipsweep': 'Probe',
    'nmap': 'Probe',
    'portsweep': 'Probe',
    'satan': 'Probe',
    'mscan': 'Probe',
    'saint': 'Probe',

    'ftp_write': 'R2L',
    'guess_passwd': 'R2L',
    'imap': 'R2L',
    'multihop': 'R2L',
    'phf': 'R2L',
    'spy': 'R2L',
    'warezclient': 'R2L',
    'warezmaster': 'R2L',
    'sendmail': 'R2L',
    'named': 'R2L',
    'snmpgetattack': 'R2L',
    'snmpguess': 'R2L',
    'xlock': 'R2L',
    'xsnoop': 'R2L',
    'worm': 'R2L',
    
    'buffer_overflow': 'U2R',
    'loadmodule': 'U2R',
    'perl': 'U2R',
    'rootkit': 'U2R',
    'httptunnel': 'U2R',
    'ps': 'U2R',    
    'sqlattack': 'U2R',
    'xterm': 'U2R'
}

In [3]:
train_nsl_kdd_dataset_path = "NSL_KDD_Dataset/KDDTrain+.txt"
col_names = np.array(["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","labels"])

categoricalColumns = col_names[[1,2,3]]
binaryColumns = col_names[[6, 11, 13, 14, 20, 21]]
numericColumns = col_names[list(set(range(41)) - set([1,2,3]) - set([6, 11, 13, 14, 20, 21]))]

train = pd.read_csv(train_nsl_kdd_dataset_path, header=None)
train.drop([42], 1, inplace=True)
train.columns = col_names

testData = pd.read_csv("NSL_KDD_Dataset/KDDTest+.txt", header=None)

testData.drop([42], 1, inplace=True)
testData.columns = col_names

mergedDataSet = pd.concat([train, testData]).reset_index(drop=True)
mergedDataSet.shape

(148517, 42)

In [4]:
# Performing all the encoding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
enc = LabelEncoder()
#
def encodeCategorical(ser):
    return enc.fit_transform(ser)

mergedDataSet['service'] = encodeCategorical(mergedDataSet['service'])
mergedDataSet['flag'] = encodeCategorical(mergedDataSet['flag'])
mergedDataSet = pd.get_dummies(mergedDataSet, columns=['protocol_type'])
mergedDataSet['labelsMapped'] = mergedDataSet['labels'].map(lambda x: attack_dict[x])

In [5]:
testDataSet = mergedDataSet.loc[train.shape[0]:,:]
trainDataSet = mergedDataSet.loc[:train.shape[0], :]

In [6]:
#binaryColumns = list(binaryColumns) + list(categoricalColumns[1:]) + [u'protocol_type_icmp', u'protocol_type_tcp', u'protocol_type_udp']
import sys
from collections import OrderedDict
e = sys.float_info.epsilon
def calAttributeRatio(df, numericColumns,binaryColumns):
    denom = {}
    ar = {}
    for col in numericColumns:
        denom[col] = df[col].mean();

    for col in numericColumns:
        ar[col] = df.fillna(value=0.0).groupby('labelsMapped')[[col]].mean().max().values[0]/(denom[col])

    def test_sum(series):
        return (series.sum()/(len(series)-series.sum()+e))
    for col in binaryColumns:
        groups = df.groupby('labelsMapped')[[col]]
        ar[col] = groups.aggregate([test_sum]).max().values[0]
    return ar

ar_op = calAttributeRatio(trainDataSet,numericColumns,binaryColumns)
print(OrderedDict(sorted(ar_op.items(), key=lambda v: -v[1])))
def selectTopFeaturesByAR(ar_op, min_ar):
    return [c for c in ar_op.keys() if ar_op[c]>=min_ar]
selectedFeatures = selectTopFeaturesByAR(ar_op,1.00)
train_processed_selectedFeatures = pd.concat([trainDataSet[selectedFeatures], trainDataSet[['labelsMapped', u'protocol_type_icmp', u'protocol_type_tcp', u'protocol_type_udp', u'service', u'flag']]], axis=1)
train_processed_selectedFeatures.head()

  if sys.path[0] == '':


OrderedDict([('num_shells', 326.11612426035498), ('urgent', 173.04120879120879), ('num_file_creations', 62.234118951224211), ('num_failed_logins', 46.03892188213797), ('hot', 40.774840493730785), ('logged_in', 10.569767441860465), ('dst_bytes', 9.1549270284904676), ('src_bytes', 8.4641313944594359), ('duration', 7.2258865173544695), ('dst_host_srv_diff_host_rate', 5.7569263820766396), ('dst_host_diff_srv_rate', 4.8373524734680329), ('num_access_files', 4.6949165175909364), ('dst_host_same_src_port_rate', 4.3931152520736072), ('num_compromised', 4.3385737152153654), ('diff_srv_rate', 4.0690870488934809), ('dst_host_srv_rerror_rate', 3.6677075035125544), ('srv_rerror_rate', 3.6675306730359276), ('rerror_rate', 3.6453737959679429), ('dst_host_rerror_rate', 3.2793738892262856), ('srv_diff_host_rate', 3.0815901722227639), ('wrong_fragment', 2.7428583870405849), ('dst_host_srv_serror_rate', 2.6731226120858391), ('srv_serror_rate', 2.6432097487171315), ('serror_rate', 2.6310182415379524), ('d

Unnamed: 0,num_access_files,src_bytes,srv_count,num_compromised,rerror_rate,urgent,dst_host_same_srv_rate,duration,srv_rerror_rate,srv_serror_rate,...,dst_host_srv_rerror_rate,srv_diff_host_rate,num_failed_logins,dst_host_serror_rate,labelsMapped,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service,flag
0,0,491,2,0,0.0,0,0.17,0,0.0,0.0,...,0.0,0.0,0,0.0,normal,0,1,0,20,9
1,0,146,1,0,0.0,0,0.0,0,0.0,0.0,...,0.0,0.0,0,0.0,normal,0,0,1,44,9
2,0,0,6,0,0.0,0,0.1,0,0.0,1.0,...,0.0,0.0,0,1.0,DoS,0,1,0,49,5
3,0,232,5,0,0.0,0,1.0,0,0.0,0.2,...,0.01,0.0,0,0.03,normal,0,1,0,24,9
4,0,199,32,0,0.0,0,1.0,0,0.0,0.0,...,0.0,0.09,0,0.0,normal,0,1,0,24,9


In [7]:
#removing the columns from test data set
test_processed_selectedFeatures = testDataSet[train_processed_selectedFeatures.columns]

In [68]:
#Loading the models
rand_forest_mdl = RandomForestClassifier(n_estimators = 100, max_depth = 20, class_weight="balanced")
X = train_processed_selectedFeatures.drop(['labelsMapped'], 1).reset_index(drop=True)
y = train_processed_selectedFeatures['labelsMapped'].reset_index(drop=True)
rand_forest_mdl.fit(X, y)
pred_y = rand_forest_mdl.predict(test_processed_selectedFeatures.drop(['labelsMapped'], 1))
print 'Accuracy Score', accuracy_score(test_processed_selectedFeatures['labelsMapped'], pred_y)
print 'Precision Score', precision_score(test_processed_selectedFeatures['labelsMapped'], pred_y, average = "weighted")
print 'Recall Score', recall_score(test_processed_selectedFeatures['labelsMapped'], pred_y, average = "weighted")

Accuracy Score 0.738644428673
Precision Score 0.776658972406
Recall Score 0.738644428673


In [69]:
mdl_xgb = XGBClassifier(n_estimators = 100, max_depth = 15, learning_rate = 0.01)
mdl_xgb.fit(X, y)
pred_y_xgb = mdl_xgb.predict(test_processed_selectedFeatures.drop(['labelsMapped'], 1))
print "Accuracy Score", accuracy_score(test_processed_selectedFeatures['labelsMapped'], pred_y_xgb)
print 'Precision Score', precision_score(test_processed_selectedFeatures['labelsMapped'], pred_y_xgb, average = "weighted")
print 'Recall Score', recall_score(test_processed_selectedFeatures['labelsMapped'], pred_y_xgb, average = "weighted")

Precision Score 0.819729104779
Recall Score 0.773376508162


In [8]:
#Binary Attack check
y_binary = train_processed_selectedFeatures['labelsMapped'].map(lambda x: 0 if x == "normal" else 1)
rand_forest_mdl = RandomForestClassifier(n_estimators = 100, max_depth = 20, class_weight="balanced")
X = train_processed_selectedFeatures.drop(['labelsMapped'], 1).reset_index(drop=True)
rand_forest_mdl.fit(X, y_binary)
pred_y = rand_forest_mdl.predict(test_processed_selectedFeatures.drop(['labelsMapped'], 1))
print "Accuracy Score", accuracy_score(test_processed_selectedFeatures['labelsMapped'].map(lambda x: 0 if x == "normal" else 1), pred_y)
print 'Precision Score', precision_score(test_processed_selectedFeatures['labelsMapped'].map(lambda x: 0 if x == "normal" else 1), pred_y)
print 'Recall Score', recall_score(test_processed_selectedFeatures['labelsMapped'].map(lambda x: 0 if x == "normal" else 1), pred_y)

Accuracy Score 0.776259758694
Precision Score 0.966463049467
Recall Score 0.628769578431


In [9]:
mdl_xgb_binary = XGBClassifier(n_estimators = 100, max_depth = 20, learning_rate = 1e-3)
mdl_xgb_binary.fit(X, y_binary)
pred_y_xgb_binary = mdl_xgb_binary.predict(test_processed_selectedFeatures.drop(['labelsMapped'], 1))
print "Accuracy", accuracy_score(test_processed_selectedFeatures['labelsMapped'].map(lambda x: 0 if x == "normal" else 1), pred_y_xgb_binary)
print 'Precision Score', precision_score(test_processed_selectedFeatures['labelsMapped'].map(lambda x: 0 if x == "normal" else 1), pred_y_xgb_binary)
print 'Recall Score', recall_score(test_processed_selectedFeatures['labelsMapped'].map(lambda x: 0 if x == "normal" else 1), pred_y_xgb_binary)

Accuracy 0.770981192335
Precision Score 0.966772151899
Recall Score 0.618951141588


In [21]:
pred_y_xgb_proba = mdl_xgb_binary.predict_proba(test_processed_selectedFeatures.drop(['labelsMapped'], 1))
for cut in np.linspace(0.43, 0.47, 20):
    pred_bin = map(lambda x: 1 if x[1] >= cut else 0 ,pred_y_xgb_proba)
    acc = accuracy_score(test_processed_selectedFeatures['labelsMapped'].map(lambda x: 0 if x == "normal" else 1), pred_bin)
    prec = precision_score(test_processed_selectedFeatures['labelsMapped'].map(lambda x: 0 if x == "normal" else 1), pred_bin)
    recall = recall_score(test_processed_selectedFeatures['labelsMapped'].map(lambda x: 0 if x == "normal" else 1), pred_bin)
    print "Cut Off", cut
    print "Accuracy", acc, 'Precision Score', prec, 'Recall Score', recall
    print '*'*53

Cut Off 0.43
Accuracy 0.569242370476 Precision Score 0.569242370476 Recall Score 1.0
*****************************************************
Cut Off 0.432105263158
Accuracy 0.569242370476 Precision Score 0.569242370476 Recall Score 1.0
*****************************************************
Cut Off 0.434210526316
Accuracy 0.569242370476 Precision Score 0.569242370476 Recall Score 1.0
*****************************************************
Cut Off 0.436315789474
Accuracy 0.569242370476 Precision Score 0.569242370476 Recall Score 1.0
*****************************************************
Cut Off 0.438421052632
Accuracy 0.569242370476 Precision Score 0.569242370476 Recall Score 1.0
*****************************************************
Cut Off 0.440526315789
Accuracy 0.569242370476 Precision Score 0.569242370476 Recall Score 1.0
*****************************************************
Cut Off 0.442631578947
Accuracy 0.569242370476 Precision Score 0.569242370476 Recall Score 1.0
*********************

Binary Classification without AR
Random Forest
 0.76694464158978004
 
With AR
 0.77625975869410935

In [17]:
'''
Cut Off 0.452631578947
Accuracy 0.852155784244 Precision Score 0.883931458131 Recall Score 0.852177978649
'''

array([ 0.4524312,  0.5475688], dtype=float32)

In [18]:
pred_y_xgb_binary[0]

1