## Importing necessary libraries

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Loading dataset_invade.csv file

In [12]:
# Read the dataset CSV into the main DataFrame; the relative path is
# resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="dataset_invade.csv")

## Inspecting dataset: Viewing first 10 rows

In [13]:
# Lift the column-display cap so wide frames render every column.
# This is a session-wide pandas option, so it also applies to later cells.
pd.options.display.max_columns = None

# Preview the first ten rows of the loaded data.
first_ten_rows = df.head(n=10)
first_ten_rows

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,logged_in,num_compromised,count,srv_count,serror_rate,rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,attack
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,2,2,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,No
1,0,udp,other,SF,146,0,0,0,0,0,0,0,13,1,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,No
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,123,6,1.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,Yes
3,0,tcp,http,SF,232,8153,0,0,0,0,1,0,5,5,0.2,0.0,1.0,0.0,0.0,30,255,1.0,0.0,No
4,0,tcp,http,SF,199,420,0,0,0,0,1,0,30,32,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,No
5,0,tcp,private,REJ,0,0,0,0,0,0,0,0,121,19,0.0,1.0,0.16,0.06,0.0,255,19,0.07,0.07,Yes
6,0,tcp,private,S0,0,0,0,0,0,0,0,0,166,9,1.0,0.0,0.05,0.06,0.0,255,9,0.04,0.05,Yes
7,0,tcp,private,S0,0,0,0,0,0,0,0,0,117,16,1.0,0.0,0.14,0.06,0.0,255,15,0.06,0.07,Yes
8,0,tcp,remote_job,S0,0,0,0,0,0,0,0,0,270,23,1.0,0.0,0.09,0.05,0.0,255,23,0.09,0.05,Yes
9,0,tcp,private,S0,0,0,0,0,0,0,0,0,133,8,1.0,0.0,0.06,0.06,0.0,255,13,0.05,0.06,Yes


## Inspecting dataset: Determining data types and non-null counts

In [14]:
# DataFrame.info() prints the column dtypes and non-null counts itself and
# returns None, so its result is neither stored nor echoed as a cell value.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148517 entries, 0 to 148516
Data columns (total 24 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   duration                148517 non-null  int64  
 1   protocol_type           148517 non-null  object 
 2   service                 148517 non-null  object 
 3   flag                    148517 non-null  object 
 4   src_bytes               148517 non-null  int64  
 5   dst_bytes               148517 non-null  int64  
 6   land                    148517 non-null  int64  
 7   wrong_fragment          148517 non-null  int64  
 8   urgent                  148517 non-null  int64  
 9   hot                     148517 non-null  int64  
 10  logged_in               148517 non-null  int64  
 11  num_compromised         148517 non-null  int64  
 12  count                   148517 non-null  int64  
 13  srv_count               148517 non-null  int64  
 14  serror_rate         

## Inspecting dataset: Summary statistics

In [15]:
# Summary statistics from describe(), transposed so that each described
# column becomes a row and each statistic becomes a column.
stats = df.describe().T
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,148517.0,276.779305,2460.683,0.0,0.0,0.0,0.0,57715.0
src_bytes,148517.0,40227.949299,5409612.0,0.0,0.0,44.0,278.0,1379964000.0
dst_bytes,148517.0,17088.853593,3703525.0,0.0,0.0,0.0,571.0,1309937000.0
land,148517.0,0.000215,0.01467714,0.0,0.0,0.0,0.0,1.0
wrong_fragment,148517.0,0.020523,0.2400691,0.0,0.0,0.0,0.0,3.0
urgent,148517.0,0.000202,0.01941708,0.0,0.0,0.0,0.0,3.0
hot,148517.0,0.189379,2.01316,0.0,0.0,0.0,0.0,101.0
logged_in,148517.0,0.402789,0.4904606,0.0,0.0,0.0,1.0,1.0
num_compromised,148517.0,0.255062,22.23137,0.0,0.0,0.0,0.0,7479.0
count,148517.0,83.336561,116.7607,0.0,2.0,13.0,141.0,511.0


## Checking for missing values

In [16]:
# Count the missing entries in each column.
df.isna().sum()

duration                  0
protocol_type             0
service                   0
flag                      0
src_bytes                 0
dst_bytes                 0
land                      0
wrong_fragment            0
urgent                    0
hot                       0
logged_in                 0
num_compromised           0
count                     0
srv_count                 0
serror_rate               0
rerror_rate               0
same_srv_rate             0
diff_srv_rate             0
srv_diff_host_rate        0
dst_host_count            0
dst_host_srv_count        0
dst_host_same_srv_rate    0
dst_host_diff_srv_rate    0
attack                    0
dtype: int64

## One-hot encoding the protocol_type, service and flag columns

In [17]:
# One-hot encode the three categorical columns.
# Use df_encoded (not df) for training and testing.
df_encoded = pd.get_dummies(
    data=df,
    columns=['protocol_type', 'service', 'flag'],
)
df_encoded

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,logged_in,num_compromised,count,srv_count,serror_rate,rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,attack,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,service_csnet_ns,service_ctf,service_daytime,service_discard,service_domain,service_domain_u,service_echo,service_eco_i,service_ecr_i,service_efs,service_exec,service_finger,service_ftp,service_ftp_data,service_gopher,service_harvest,service_hostnames,service_http,service_http_2784,service_http_443,service_http_8001,service_imap4,service_iso_tsap,service_klogin,service_kshell,service_ldap,service_link,service_login,service_mtp,service_name,service_netbios_dgm,service_netbios_ns,service_netbios_ssn,service_netstat,service_nnsp,service_nntp,service_ntp_u,service_other,service_pm_dump,service_pop_2,service_pop_3,service_printer,service_private,service_red_i,service_remote_job,service_rje,service_shell,service_smtp,service_sql_net,service_ssh,service_sunrpc,service_supdup,service_systat,service_telnet,service_tftp_u,service_tim_i,service_time,service_urh_i,service_urp_i,service_uucp,service_uucp_path,service_vmnet,service_whois,flag_OTH,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,2,2,0.0,0.0,1.00,0.00,0.00,150,25,0.17,0.03,No,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
1,0,146,0,0,0,0,0,0,0,13,1,0.0,0.0,0.08,0.15,0.00,255,1,0.00,0.60,No,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2,0,0,0,0,0,0,0,0,0,123,6,1.0,0.0,0.05,0.07,0.00,255,26,0.10,0.05,Yes,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
3,0,232,8153,0,0,0,0,1,0,5,5,0.2,0.0,1.00,0.00,0.00,30,255,1.00,0.00,No,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
4,0,199,420,0,0,0,0,1,0,30,32,0.0,0.0,1.00,0.00,0.09,255,255,1.00,0.00,No,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148512,0,794,333,0,0,0,0,1,0,1,1,0.0,0.0,1.00,0.00,0.00,100,141,0.72,0.06,No,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
148513,0,317,938,0,0,0,0,1,0,2,11,0.0,0.0,1.00,0.00,0.18,197,255,1.00,0.00,No,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
148514,0,54540,8314,0,0,0,2,1,1,5,10,0.0,0.0,1.00,0.00,0.20,255,255,1.00,0.00,Yes,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
148515,0,42,42,0,0,0,0,0,0,4,6,0.0,0.0,1.00,0.00,0.33,255,252,0.99,0.01,No,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
