## Project 


In [40]:
import pandas as pd
import numpy as np
import sys
import sklearn
import io
import random

# Import NSL-KDD Dataset

In [41]:
# Raw-CSV URLs (GitHub) of the NSL-KDD training and test files loaded below.
train_url = 'https://raw.githubusercontent.com/merteroglu/NSL-KDD-Network-Instrusion-Detection/master/NSL_KDD_Train.csv'
test_url = 'https://raw.githubusercontent.com/merteroglu/NSL-KDD-Network-Instrusion-Detection/master/NSL_KDD_Test.csv'

In [42]:
# The CSV files carry no header row, so the column names are supplied here:
# the feature columns in file order, followed by the class label.
col_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
    "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label",
]

# Both files are read the same way: no header line, names taken from col_names.
read_options = dict(header=None, names=col_names)
df = pd.read_csv(train_url, **read_options)
df_test = pd.read_csv(test_url, **read_options)

# Report (rows, columns) of each frame.
train_shape = df.shape
test_shape = df_test.shape
print('Dimensions of the Training set:', train_shape)
print('Dimensions of the Test set:', test_shape)

Dimensions of the Training set: (125973, 42)
Dimensions of the Test set: (22544, 42)


In [43]:
# Preview the first five rows of the training frame.
df.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [44]:
# Count how often each label occurs in the training and the test frame,
# then print both tables separated by a blank line.
train_label_counts = df['label'].value_counts()
test_label_counts = df_test['label'].value_counts()
print(
    'Label distribution Training set:',
    train_label_counts,
    '',
    'Label distribution Test set:',
    test_label_counts,
    sep='\n',
)

Label distribution Training set:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: label, dtype: int64

Label distribution Test set:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178


# Data preprocessing

In [45]:
# Three columns are categorical and not yet binary: protocol_type (column 2), service (column 3), flag (column 4).
# One-hot encoding is used to convert all categorical features into binary features. One-hot encoding requires that
# the input to the transformer be an integer matrix expressing the values taken by the categorical (discrete) features.
# The output is a sparse matrix in which each column corresponds to one possible value. The input features are assumed
# to take values in the range [0, n_values). Therefore, to convert each category to a number, the features must first be
# encoded with LabelEncoder.

# For every string-typed (object) column of the training frame, report how many
# distinct values it holds.
print('Training set:')
for col_name in df.columns:
    if df[col_name].dtypes != 'object':
        continue  # numeric column - nothing to report
    unique_cat = len(df[col_name].unique())
    print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

# Show the five most frequent values of the 'service' column.
print()
print('Distribution of categories in service:')
service_counts = df['service'].value_counts().sort_values(ascending=False)
print(service_counts.head())

Training set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories
Feature 'label' has 23 categories

Distribution of categories in service:
http        40338
private     21853
domain_u     9043
smtp         7313
ftp_data     6860
Name: service, dtype: int64


In [46]:
# Same per-column category count as for the training frame, now on the test frame.
print('Test set:')
for col_name in df_test.columns:
    column_is_categorical = df_test[col_name].dtypes == 'object'
    if not column_is_categorical:
        continue
    distinct_values = df_test[col_name].unique()
    unique_cat = len(distinct_values)
    print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

Test set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 64 categories
Feature 'flag' has 11 categories
Feature 'label' has 38 categories


# LabelEncoder

Collect the categorical features (`protocol_type`, `service`, `flag`) of the training and test sets into their own DataFrames.

In [47]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
# The three string-valued feature columns.
categorical_columns = ['protocol_type', 'service', 'flag']

# Slice those columns out of the training frame and the test frame.
df_categorical_values, testdf_categorical_values = (
    frame[categorical_columns] for frame in (df, df_test)
)

# Preview the training slice.
df_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF


In [48]:
# Build prefixed column names ("<prefix><category>") for every category of the
# three categorical features, in sorted category order.

# Prefixes used for the generated names.
string1 = 'Protocol_type_'
string2 = 'service_'
string3 = 'flag_'

# protocol type (categories taken from the training frame)
unique_protocol = sorted(df['protocol_type'].unique())
unique_protocol2 = [string1 + category for category in unique_protocol]
print(unique_protocol2)

# service (categories taken from the training frame)
unique_service = sorted(df['service'].unique())
unique_service2 = [string2 + category for category in unique_service]
print(unique_service2)

# flag (categories taken from the training frame)
unique_flag = sorted(df['flag'].unique())
unique_flag2 = [string3 + category for category in unique_flag]
print(unique_flag2)

# Full list of generated names for the training set.
dumcols = unique_protocol2 + unique_service2 + unique_flag2

# Test set: only the service names are recomputed from the test frame; the
# protocol and flag names are reused from the training frame.
unique_service_test = sorted(df_test['service'].unique())
unique_service2_test = [string2 + category for category in unique_service_test]
testdumcols = unique_protocol2 + unique_service2_test + unique_flag2

['Protocol_type_icmp', 'Protocol_type_tcp', 'Protocol_type_udp']
['service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_harvest', 'service_hostnames', 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_red_i', 'service_remote_job', 'ser

In [49]:
# Transform categorical features into integer codes using LabelEncoder().
#
# One encoder per column is fitted on the training and test values combined and
# then applied to both frames, so a given category always receives the same code.
# Fitting a separate encoder on each frame numbers the categories independently;
# 'service' has 70 distinct values in training but 64 in test, so the test codes
# would not line up with the training codes.
label_encoders = {
    column_name: LabelEncoder().fit(
        pd.concat([df_categorical_values[column_name], testdf_categorical_values[column_name]])
    )
    for column_name in df_categorical_values.columns
}


def _encode_column(column):
    """Encode one categorical column with the shared encoder fitted for it."""
    return label_encoders[column.name].transform(column)


df_categorical_values_enc = df_categorical_values.apply(_encode_column)

# Show the raw categories next to their integer codes.
print(df_categorical_values.head())
print('--------------------')
print(df_categorical_values_enc.head())

# test set (same encoders as the training set)
testdf_categorical_values_enc = testdf_categorical_values.apply(_encode_column)

  protocol_type   service flag
0           tcp  ftp_data   SF
1           udp     other   SF
2           tcp   private   S0
3           tcp      http   SF
4           tcp      http   SF
--------------------
   protocol_type  service  flag
0              1       20     9
1              2       44     9
2              1       49     5
3              1       24     9
4              1       24     9


In [50]:
# Candidate numeric features plus the class label.
selected_columns = [
    'duration',
    'src_bytes',
    'dst_bytes',
    'count',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'label',
]

# Independent copies, so later edits do not touch the original frames.
df_selected_values, testdf_selected_values = (
    frame[selected_columns].copy() for frame in (df, df_test)
)

# Preview the test-set selection.
testdf_selected_values.head()

Unnamed: 0,duration,src_bytes,dst_bytes,count,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,label
0,0,0,0,229,255,10,0.04,0.06,0.0,0.0,neptune
1,0,0,0,136,255,1,0.0,0.06,0.0,0.0,neptune
2,2,12983,0,1,134,86,0.61,0.04,0.61,0.02,normal
3,0,20,0,1,3,57,1.0,0.0,1.0,0.28,saint
4,1,0,15,1,29,86,0.31,0.17,0.03,0.02,mscan


In [51]:
# (rows, columns) of the selected test-set columns.
print(testdf_selected_values.shape)

(22544, 11)


In [52]:
# Binary target: 'normal' -> 0, every listed attack label -> 1.
# The attack labels are kept in the same order and grouping as before.
attack_labels = [
    'neptune', 'back', 'land', 'pod', 'smurf', 'teardrop',
    'mailbomb', 'apache2', 'processtable', 'udpstorm', 'worm',
    'ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint',
    'ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
    'warezclient', 'warezmaster', 'sendmail', 'named',
    'snmpgetattack', 'snmpguess', 'xlock', 'xsnoop', 'httptunnel',
    'buffer_overflow', 'loadmodule', 'perl', 'rootkit',
    'ps', 'sqlattack', 'xterm',
]
classes_mapped = {'normal': 0, **dict.fromkeys(attack_labels, 1)}

# Replace the label strings in both frames; labels missing from the mapping
# are left as they are.
label_replacement = {'label': classes_mapped}
df_selected_values = df_selected_values.replace(label_replacement)
testdf_selected_values = testdf_selected_values.replace(label_replacement)

# Preview the test frame with the recoded label.
testdf_selected_values.head()

Unnamed: 0,duration,src_bytes,dst_bytes,count,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,label
0,0,0,0,229,255,10,0.04,0.06,0.0,0.0,1
1,0,0,0,136,255,1,0.0,0.06,0.0,0.0,1
2,2,12983,0,1,134,86,0.61,0.04,0.61,0.02,0
3,0,20,0,1,3,57,1.0,0.0,1.0,0.28,1
4,1,0,15,1,29,86,0.31,0.17,0.03,0.02,1


In [53]:
# Absolute pairwise correlations of the selected training columns (label
# included), rounded to two decimals.
corr_matrix = (
    df_selected_values
    .corr()
    .abs()
    .round(2)
)

# Render the matrix as a table.
display(corr_matrix)

Unnamed: 0,duration,src_bytes,dst_bytes,count,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,label
duration,1.0,0.07,0.03,0.08,0.05,0.11,0.12,0.25,0.23,0.03,0.05
src_bytes,0.07,1.0,0.0,0.01,0.01,0.01,0.01,0.0,0.0,0.0,0.01
dst_bytes,0.03,0.0,1.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0
count,0.08,0.01,0.0,1.0,0.47,0.4,0.47,0.17,0.14,0.21,0.58
dst_host_count,0.05,0.01,0.0,0.47,1.0,0.3,0.52,0.14,0.31,0.46,0.38
dst_host_srv_count,0.11,0.01,0.0,0.4,0.3,1.0,0.9,0.39,0.07,0.0,0.72
dst_host_same_srv_rate,0.12,0.01,0.0,0.47,0.52,0.9,1.0,0.42,0.14,0.2,0.69
dst_host_diff_srv_rate,0.25,0.0,0.01,0.17,0.14,0.39,0.42,1.0,0.23,0.02,0.24
dst_host_same_src_port_rate,0.23,0.0,0.01,0.14,0.31,0.07,0.14,0.23,1.0,0.41,0.09
dst_host_srv_diff_host_rate,0.03,0.0,0.0,0.21,0.46,0.0,0.2,0.02,0.41,1.0,0.06


In [54]:
# Preview the selected training columns (label already recoded to 0/1).
df_selected_values.head()

Unnamed: 0,duration,src_bytes,dst_bytes,count,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,label
0,0,491,0,2,150,25,0.17,0.03,0.17,0.0,0
1,0,146,0,13,255,1,0.0,0.6,0.88,0.0,0
2,0,0,0,123,255,26,0.1,0.05,0.0,0.0,1
3,0,232,8153,5,30,255,1.0,0.0,0.03,0.04,0
4,0,199,420,30,255,255,1.0,0.0,0.0,0.0,0


In [55]:
# (rows, columns) of the selected training columns.
print(df_selected_values.shape)

(125973, 11)


In [56]:
# Final, minimal feature set chosen from the correlation matrix so that no two
# kept features are highly correlated. Compared with selected_columns this drops
# dst_host_same_srv_rate, dst_host_diff_srv_rate and dst_host_srv_diff_host_rate.
new_selected_columns = [
    'duration',
    'src_bytes',
    'dst_bytes',
    'count',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_src_port_rate',
    'label',
]

# Final dataframes (independent copies) for the training and the test data.
new_df_selected_values, new_testdf_selected_values = (
    frame[new_selected_columns].copy()
    for frame in (df_selected_values, testdf_selected_values)
)

# Preview the final training frame.
new_df_selected_values.head()

Unnamed: 0,duration,src_bytes,dst_bytes,count,dst_host_count,dst_host_srv_count,dst_host_same_src_port_rate,label
0,0,491,0,2,150,25,0.17,0
1,0,146,0,13,255,1,0.88,0
2,0,0,0,123,255,26,0.0,1
3,0,232,8153,5,30,255,0.03,0
4,0,199,420,30,255,255,0.0,0


In [57]:
# Split each final frame into its feature columns and its label column.
train_labels = new_df_selected_values['label']
train_features = new_df_selected_values.drop(columns=['label'])
test_labels = new_testdf_selected_values['label']
test_features = new_testdf_selected_values.drop(columns=['label'])

# Sanity output: sizes of both sets, then the first training row and its label.
n_train_rows = len(train_features)
n_test_rows = len(test_features)
print('Training data pts: ', n_train_rows)
print('Test data pts: ', n_test_rows)
first_row = train_features.iloc[0]
print('First item: \n', first_row)
print('Label: ', train_labels[0])

#train_label = df_selected_values['label'].copy()
#print(train_label.shape)

#train_arr = [125973,6] 
#print(train_arr)

#test_X = df_selected_values['duration', 'src_bytes', 'dst_bytes', 'count', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_src_port_rate'].copy()
#pd.concat([pd.new_df_selected_values['label'], pd.new_df_selected_values['duration'], pd.new_df_selected_values['src_bytes'], pd.new_df_selected_values['dst_bytes'], pd.new_df_selected_values['count'], pd.new_df_selected_values['dst_host_count'], pd.new_df_selected_values['dst_host_srv_count'], pd.new_df_selected_values['dst_host_same_src_port_rate'], pd.DataFrame(train_X)], axis=1) \
#        .to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

Training data pts:  125973
Test data pts:  22544
First item: 
 duration                         0.00
src_bytes                      491.00
dst_bytes                        0.00
count                            2.00
dst_host_count                 150.00
dst_host_srv_count              25.00
dst_host_same_src_port_rate      0.17
Name: 0, dtype: float64
Label:  0


In [58]:
# `os` is used from this cell onwards, but the notebook's only other `import os`
# sits in a later cell; importing it here keeps Restart & Run All working.
import os

data_dir = '../data'  # The folder we will use for storing data

# Create the folder if it is missing; exist_ok makes this safe to re-run.
os.makedirs(data_dir, exist_ok=True)

In [59]:
# Write the training data as train.csv inside data_dir: label in the first
# column, the features after it, with no header row and no index column.
train_csv_path = os.path.join(data_dir, 'train.csv')
train_table = pd.concat([train_labels.to_frame(), train_features], axis=1)
train_table.to_csv(train_csv_path, header=False, index=False)

In [60]:
# Set up the SageMaker objects used by the following cells: a session, the
# session's default S3 bucket, an S3 key prefix and the execution role.
import sagemaker

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
# NOTE(review): the prefix still says 'sentiment_rnn' although this notebook
# prepares NSL-KDD data - presumably left over from another project. It is kept
# unchanged here because it determines where the data lands in S3.
prefix = 'sagemaker/sentiment_rnn'

role = sagemaker.get_execution_role()

In [61]:
#labeldf=newdf['label']
#labeldf_test=newdf_test['label']


# change the label column
#newlabeldf=labeldf.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
#                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
#                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
#                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})

# put the new label column back
#newdf['label'] = newlabeldf
#newdf_test['label'] = newlabeldf_test

# Upload the contents of data_dir to the session bucket under `prefix`;
# the call's return value is kept in input_data.
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

In [62]:
# Show the syntax-highlighted source of train/model.py (shell command).
!pygmentize train/model.py

[34mimport[39;49;00m [04m[36mtorch[39;49;00m[04m[36m.[39;49;00m[04m[36mnn[39;49;00m [34mas[39;49;00m [04m[36mnn[39;49;00m

[34mclass[39;49;00m [04m[32mLSTMClassifier[39;49;00m(nn.Module):
    [33m"""[39;49;00m
[33m    This is the simple RNN model we will be using to perform Sentiment Analysis.[39;49;00m
[33m    """[39;49;00m

    [34mdef[39;49;00m [32m__init__[39;49;00m([36mself[39;49;00m, embedding_dim, hidden_dim, vocab_size):
        [33m"""[39;49;00m
[33m        Initialize the model by settingg up the various layers.[39;49;00m
[33m        """[39;49;00m
        [36msuper[39;49;00m(LSTMClassifier, [36mself[39;49;00m).[32m__init__[39;49;00m()

        [36mself[39;49;00m.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=[34m0[39;49;00m)
        [36mself[39;49;00m.lstm = nn.LSTM(embedding_dim, hidden_dim)
        [36mself[39;49;00m.dense = nn.Linear(in_features=hidden_dim, out_features=[34m1[39;49;00m)


In [3]:
# Show the kernel's working directory; relative paths such as '../data' and
# 'train/model.py' are resolved against it.
%pwd


'/home/ec2-user/SageMaker/NSL-KDD-Network-Intrusion-Detection'

In [63]:
import os
import glob
import pickle

import torch
import torch.utils.data

# Load a small sample - the first 250 rows - of the training CSV written earlier.
sample_csv_path = os.path.join(data_dir, 'train.csv')
train_sample = pd.read_csv(sample_csv_path, header=None, names=None, nrows=250)

# Column 0 holds the label; all remaining columns are features.
sample_label_values = train_sample[[0]].values
sample_feature_values = train_sample.drop([0], axis=1).values

# Convert to tensors: labels as a flat float vector, features as int64.
# NOTE(review): .long() truncates fractional feature values (for example the
# dst_host_same_src_port_rate column) - confirm this is intended.
train_sample_y = torch.from_numpy(sample_label_values).float().squeeze()
train_sample_X = torch.from_numpy(sample_feature_values).long()

# Wrap the tensors in a dataset and serve them in batches of 50.
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=50)

ModuleNotFoundError: No module named 'torch'

In [64]:
def train(model, train_loader, epochs, optimizer, loss_fn, device):
    """Train `model` for a fixed number of epochs, printing the mean loss of each.

    Parameters
    ----------
    model : torch.nn.Module
        Model to optimise; it is called as ``model(batch_X)``.
    train_loader : iterable with ``len()``
        Yields ``(batch_X, batch_y)`` pairs of tensors, e.g. a DataLoader.
    epochs : int
        Number of full passes over `train_loader`.
    optimizer : torch.optim.Optimizer
        Optimizer that updates the model parameters.
    loss_fn : callable
        Called as ``loss_fn(output, batch_y)``; must return a scalar tensor.
    device : torch.device
        Device every batch is moved to before the forward pass.

    Returns
    -------
    None
        Progress is reported via ``print`` only. The message is always labelled
        "BCELoss", whichever `loss_fn` is passed.
    """
    for epoch in range(1, epochs + 1):
        model.train()  # make sure the model is in training mode for this epoch
        total_loss = 0
        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            # Call the module itself rather than .forward(), so that any
            # registered forward hooks are run as well.
            out = model(batch_X)
            loss = loss_fn(out, batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print("Epoch: {}, BCELoss: {}".format(epoch, total_loss / len(train_loader)))

In [65]:
import torch.optim as optim
from train.model import LSTMClassifier

# Use the GPU when one is available, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

# Model constructor arguments, in the order (embedding_dim, hidden_dim, vocab_size).
# NOTE(review): the model builds nn.Embedding(vocab_size, ...). If the raw
# feature values are used as embedding indices, any value >= 5000 (e.g.
# dst_bytes = 8153 in the fourth training row) would be out of range -
# confirm against the model's forward().
embedding_dim, hidden_dim, vocab_size = 32, 100, 5000
model = LSTMClassifier(embedding_dim, hidden_dim, vocab_size).to(device)

# Adam with its default settings and binary cross-entropy as the loss.
optimizer = optim.Adam(model.parameters())
loss_fn = torch.nn.BCELoss()

# Train for 5 epochs on the sample dataloader.
n_epochs = 5
train(model, train_sample_dl, n_epochs, optimizer, loss_fn, device)

ModuleNotFoundError: No module named 'torch'