In [1]:
import pandas as pd

In [30]:
# Column names for the header-less data files: 41 feature columns, then the
# label column ('attack_type') and one trailing column ('other').
# NOTE(review): 'other' is presumably the NSL-KDD difficulty score - confirm.
columns = [
    'duration', 'protocol', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'attack_type', 'other',
]

In [31]:
# Load the two header-less data files, naming the fields with `columns`
train_df = pd.read_csv('data/nsl_kdd/KDDTrain+.txt', header=None, names=columns)
test_df = pd.read_csv('data/nsl_kdd/KDDTest+.txt', header=None, names=columns)

# Combine datasets into one dataframe.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both halves keep their own 0-based labels, so the low labels occur twice and
# any label-based lookup or index alignment on df becomes ambiguous.
df = pd.concat([train_df, test_df], ignore_index=True)
df.shape

(148517, 43)

Check if there are any missing values

In [32]:
# True if at least one cell anywhere in the frame is missing
df.isna().to_numpy().any()

False

### Preprocess
The original dataset contains many classes (specific attack types like 'smurf') in the attack_type attribute. We will look at predicting both the specific attack type and the superset class {DOS, U2R, R2L, Probe, and Normal}.

In [33]:
# Start attack_set as a copy of the specific attack names; the following cell
# overwrites each name with its superset class.
df['attack_set'] = df['attack_type']

In [34]:
# Superset class -> specific attack names that belong to it.
attack_groups = {
    # DOS
    'dos': [
        'neptune', 'back', 'land', 'pod', 'smurf', 'teardrop', 'mailbomb',
        'processtable', 'udpstorm', 'apache2', 'worm',
    ],
    # User-to-Root (U2R)
    'u2r': [
        'buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'sqlattack',
        'xterm', 'ps',
    ],
    # Remote-to-Local (R2L)
    'r2l': [
        'ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
        'warezclient', 'warezmaster', 'xlock', 'xsnoop', 'snmpgetattack',
        'httptunnel', 'snmpguess', 'sendmail', 'named',
    ],
    # Probe attacks
    'probe': ['satan', 'ipsweep', 'nmap', 'portsweep', 'saint', 'mscan'],
}

# Invert to a flat lookup: specific attack name -> superset class.
attack_to_set = {
    attack: group
    for group, attacks in attack_groups.items()
    for attack in attacks
}

# Replace every listed attack name in one pass. Values that are not keys of
# the lookup (e.g. 'normal', or an already-mapped class name if this cell is
# re-run) are left unchanged.
df['attack_set'] = df['attack_set'].replace(attack_to_set)

In [35]:
# Distinct specific attack names present in the data
df['attack_type'].unique()

array(['normal', 'neptune', 'warezclient', 'ipsweep', 'portsweep',
       'teardrop', 'nmap', 'satan', 'smurf', 'pod', 'back', 'guess_passwd',
       'ftp_write', 'multihop', 'rootkit', 'buffer_overflow', 'imap',
       'warezmaster', 'phf', 'land', 'loadmodule', 'spy', 'perl', 'saint',
       'mscan', 'apache2', 'snmpgetattack', 'processtable', 'httptunnel',
       'ps', 'snmpguess', 'mailbomb', 'named', 'sendmail', 'xterm', 'worm',
       'xlock', 'xsnoop', 'sqlattack', 'udpstorm'], dtype=object)

In [36]:
# Distinct superset classes left after the mapping
df['attack_set'].unique()

array(['normal', 'dos', 'r2l', 'probe', 'u2r'], dtype=object)

In [37]:
# Preview the first ten rows, including the attack_set column
df.head(10)

Unnamed: 0,duration,protocol,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,other,attack_set
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,dos
3,0,tcp,http,SF,232,8153,0,0,0,0,...,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,normal
5,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21,dos
6,0,tcp,private,S0,0,0,0,0,0,0,...,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,21,dos
7,0,tcp,private,S0,0,0,0,0,0,0,...,0.07,0.0,0.0,1.0,1.0,0.0,0.0,neptune,21,dos
8,0,tcp,remote_job,S0,0,0,0,0,0,0,...,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,21,dos
9,0,tcp,private,S0,0,0,0,0,0,0,...,0.06,0.0,0.0,1.0,1.0,0.0,0.0,neptune,21,dos


In [38]:
# Targets: the specific attack name and its superset class, side by side
label_columns = ['attack_type', 'attack_set']
y = df.loc[:, label_columns]
y.head()

Unnamed: 0,attack_type,attack_set
0,normal,normal
1,normal,normal
2,neptune,dos
3,normal,normal
4,normal,normal


In [39]:
# Confirm that selecting a list of columns produced a DataFrame
type(y) 

pandas.core.frame.DataFrame

In [40]:
# remove unnecessary columns
# The label columns were already copied into `y`, so only features should stay
# in df. drop(..., errors='ignore') replaces the three `del` statements so the
# cell can be re-run without a KeyError once the columns are gone.
df = df.drop(columns=['other', 'attack_type', 'attack_set'], errors='ignore')

In [41]:
# Use an independent copy of df as the feature matrix
X = df.copy()

In [42]:
# Preview the feature matrix (label columns are no longer present)
X.head()

Unnamed: 0,duration,protocol,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0
1,0,udp,other,SF,146,0,0,0,0,0,...,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0
2,0,tcp,private,S0,0,0,0,0,0,0,...,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0
3,0,tcp,http,SF,232,8153,0,0,0,0,...,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Rows and feature columns before encoding
X.shape

(148517, 41)

#### One-hot encoding

In [44]:
# Generate One-Hot encoding
# drop_first=False keeps one indicator column for every category value
df_X = pd.get_dummies(X, drop_first=False)
# Column count grows because each categorical column expands into indicators
df_X.shape

(148517, 122)

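pd.get_dummies applies a one-hot encoding to the categorical variables (protocol, service, and flag here), replacing each of them with one 0/1 indicator column per category value. This way a machine learning algorithm treats all category values the same, instead of inferring an order or magnitude from arbitrary numeric codes.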

In [45]:
# Preview the encoded features, including the new indicator columns
df_X.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


# Baseline with Scikit-Learn Models

In [None]:
import yellowbrick # for visualizing model performance

## Trees and Ensemble Learning

In [50]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation, predicting the superset class.
# The fixed random_state makes the split reproducible.
# NOTE(review): the split is not stratified; if the classes are imbalanced,
# consider passing stratify=y['attack_set'] - confirm before changing.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    y['attack_set'],
    test_size=0.33,
    random_state=42,
)

In [None]:
# Rows and columns in the training split
X_train.shape

In [None]:
# Rows and columns in the held-out split
X_test.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three base classifiers, all seeded with the same random_state
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(random_state=42)

# Combine them with hard voting (majority vote over predicted labels)
named_estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)]
voting_clf = VotingClassifier(estimators=named_estimators, voting='hard')
voting_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Fit every model on the training split and print its held-out accuracy
classifiers = (log_clf, rnd_clf, svm_clf, voting_clf)
for clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

### Scale Data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same transform to the held-out split. Fitting the scaler on X_test (as
# before) would scale the test data with its own statistics, so it would not
# match the scale a model trained on the training data expects.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Baseline 

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report

In [None]:
# Pipeline: standardise the features, then fit a linear SVM with hinge loss.
# NOTE: this rebinds svm_clf, which earlier in the notebook named a plain SVC.
svm_clf = make_pipeline(
    StandardScaler(),
    LinearSVC(C=1, loss="hinge", random_state=42),
)

In [None]:
# Fit the scaling + LinearSVC pipeline on the training split
svm_clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split with the fitted pipeline
y_pred = svm_clf.predict(X_test)

In [None]:
# Print the classification report comparing the predictions with the true labels
print(classification_report(y_test, y_pred))