In [93]:
import pandas as pd
import numpy as np
import urllib
import requests

In [2]:
# URL of the column-name listing that accompanies the KDD Cup 99 data file
headers_link = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names'

In [3]:
# Download the column-name listing; fail loudly on an HTTP error instead of
# silently turning an error page into column names.
response = requests.get(headers_link, timeout=30)
response.raise_for_status()
contents = response.text

# The first line is skipped (presumably a non-feature header line - verify
# against the file); each remaining non-blank line is "<name>: <type>", so the
# text before the colon is the column name. Blank lines are filtered out so the
# result does not depend on whether the file ends with a newline.
headers = [
    line.split(':')[0]
    for line in contents.splitlines()[1:]
    if line.strip()
]
# The target column is not part of the listing, so add it explicitly.
headers.append('label')

In [4]:
# Read the local data file. It is parsed without a header row, and the column
# names are supplied from the `headers` list.
df = pd.read_csv(
    'resources/datasets/kddcup.data_10_percent',
    names=headers,
    header=None,
)

In [5]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(494021, 42)

In [6]:
# Peek at the first rows to confirm the column names line up with the values
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [7]:
# Inferred dtype of each column; object columns are the categorical features
df.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [8]:
# Number of rows per network protocol
df['protocol_type'].value_counts()

icmp    283602
tcp     190065
udp      20354
Name: protocol_type, dtype: int64

In [9]:
# Shape of the full frame again, for reference before looking at the labels
df.shape

(494021, 42)

In [10]:
# Share of rows belonging to each label in the full data set
df['label'].value_counts(normalize=True)

smurf.              0.568377
neptune.            0.216997
normal.             0.196911
back.               0.004459
satan.              0.003216
ipsweep.            0.002524
portsweep.          0.002105
warezclient.        0.002065
teardrop.           0.001982
pod.                0.000534
nmap.               0.000468
guess_passwd.       0.000107
buffer_overflow.    0.000061
land.               0.000043
warezmaster.        0.000040
imap.               0.000024
rootkit.            0.000020
loadmodule.         0.000018
ftp_write.          0.000016
multihop.           0.000014
phf.                0.000008
perl.               0.000006
spy.                0.000004
Name: label, dtype: float64

In [11]:
# Keep only normal traffic and the ipsweep attack, then renumber the rows from
# zero so the subset has a clean positional index.
df_ = (
    df.loc[df['label'].isin(['normal.', 'ipsweep.'])]
    .reset_index(drop=True)
)

In [12]:
# Class balance within the two-label subset
df_['label'].value_counts(normalize=True)

normal.     0.987343
ipsweep.    0.012657
Name: label, dtype: float64

In [13]:
# Dimensions of the filtered subset as (rows, columns)
df_.shape

(98525, 42)

In [14]:
# Feature matrix: every column except the target. The column names are taken
# from df_ (the frame being indexed) rather than from the unfiltered df, so the
# selection cannot drift if the two frames ever differ.
# Index.difference returns the names sorted, so x's columns are alphabetical.
x = df_[df_.columns.difference(['label'])]
# Boolean target: True for 'normal.' rows, False for the remaining (attack) rows
y = df_['label'] == 'normal.'

In [15]:
# Columns of x stored as Python objects (strings) need one-hot encoding.
# Dtypes are read from x itself, the frame that will actually be encoded.
categorical_cols = [col for col in x.columns if x.dtypes[col] == object]

In [16]:
# Distinct values per categorical column, i.e. how many dummy columns each adds
x[categorical_cols].nunique()

flag              9
protocol_type     3
service          32
dtype: int64

In [17]:
# Sanity check: the target should contain exactly two classes
y.nunique()

2

In [18]:
# One-hot encode the categorical columns; all other columns pass through as-is
x_ = pd.get_dummies(x, columns=categorical_cols)

In [19]:
# Dimensions of the encoded feature matrix as (rows, columns)
x_.shape

(98525, 82)

In [20]:
# Peek at the encoded features, including the generated dummy columns
x_.head()

Unnamed: 0,count,diff_srv_rate,dst_bytes,dst_host_count,dst_host_diff_srv_rate,dst_host_rerror_rate,dst_host_same_src_port_rate,dst_host_same_srv_rate,dst_host_serror_rate,dst_host_srv_count,...,service_shell,service_smtp,service_ssh,service_telnet,service_tftp_u,service_tim_i,service_time,service_urh_i,service_urp_i,service_whois
0,8,0.0,5450,9,0.0,0.0,0.11,1.0,0.0,9,...,0,0,0,0,0,0,0,0,0,0
1,8,0.0,486,19,0.0,0.0,0.05,1.0,0.0,19,...,0,0,0,0,0,0,0,0,0,0
2,8,0.0,1337,29,0.0,0.0,0.03,1.0,0.0,29,...,0,0,0,0,0,0,0,0,0,0
3,6,0.0,1337,39,0.0,0.0,0.03,1.0,0.0,39,...,0,0,0,0,0,0,0,0,0,0
4,6,0.0,2032,49,0.0,0.0,0.02,1.0,0.0,49,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Prevalence of each target class (True = 'normal.', False = attack)
y.value_counts(normalize=True)

True     0.987343
False    0.012657
Name: label, dtype: float64

In [63]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score, average_precision_score

In [65]:
%%time
iforest = IsolationForest(contamination=0.012657, behaviour='new')
iforest.fit(x_)

CPU times: user 5.77 s, sys: 996 ms, total: 6.76 s
Wall time: 6.77 s


In [66]:
# Predicted label for each row the model was fitted on. Values are 1 or -1;
# the later cells treat 1 as "predicted normal".
x_predicted = iforest.predict(x_)

In [67]:
# Share of rows assigned to each predicted label
pd.Series(x_predicted).value_counts(normalize=True)

 1    0.987333
-1    0.012667
dtype: float64

In [89]:
# Confusion table as raw counts, keyed by (predicted normal?, actually normal?).
# Divide the result by len(y) to get proportions instead.
(
    pd.DataFrame({'predicted': x_predicted == 1, 'actual': y})
    .groupby(['predicted', 'actual'])
    .size()
)

predicted  actual
False      False        92
           True       1156
True       False      1155
           True      96122
dtype: int64

In [91]:
# F1 with True ('normal.') as the positive class. Normal rows are the large
# majority, so this number mostly reflects majority-class performance.
f1_score(y, x_predicted == 1)

0.9881216108555421

In [95]:
# Class prevalences of the target. value_counts orders by frequency, so the
# position of each class in `prev` depends on which class is more common.
prev = y.value_counts(normalize=True)

In [98]:
# Plain list of the prevalence values, in the order value_counts produced them
list(prev)

[0.987343313879726, 0.012656686120274043]

In [104]:
# Baseline: F1 of random labels drawn with the same prevalence as the true ones.
# The probability of each outcome is spelled out explicitly from y instead of
# relying on the ordering of a value_counts() result, and the generator is
# seeded so the summary statistics are reproducible.
rng = np.random.RandomState(0)
p_normal = y.mean()  # share of True ('normal.') rows
pd.Series([
    f1_score(y, rng.choice([True, False], p=[p_normal, 1 - p_normal], size=len(y)))
    for _ in range(100)
]).describe()


count    100.000000
mean       0.987325
std        0.000178
min        0.986653
25%        0.987218
50%        0.987332
75%        0.987435
max        0.987653
dtype: float64

In [105]:
# random labeling, AUC score
pd.Series([
    roc_auc_score(y, np.random.choice([1, 0], p=prev, size=len(y)) == 1)
    for _ in range(100)
]).describe()

count    100.000000
mean       0.499981
std        0.001575
min        0.496598
25%        0.498883
50%        0.499844
75%        0.501125
max        0.503872
dtype: float64

In [109]:
# Confusion table (raw counts) for a single random labeling with the same
# prevalence as the true labels, for comparison with the model's table.
# Probabilities are derived explicitly from y and the draw is seeded.
rng = np.random.RandomState(0)
p_normal = y.mean()  # share of True ('normal.') rows
random_pred = rng.choice([True, False], p=[p_normal, 1 - p_normal], size=len(y))
(
    pd.DataFrame({'predicted': random_pred, 'actual': y})
    .groupby(['predicted', 'actual'])
    .size()
)

predicted  actual
False      False        12
           True       1219
True       False      1235
           True      96059
dtype: int64

In [88]:
# Per-class precision, recall, F1 and support. Unlike the single F1 number,
# this reports the minority (False) class separately from the majority class.
precision_recall_fscore_support(y, x_predicted == 1)

(array([0.07371795, 0.98812669]),
 array([0.07377706, 0.98811653]),
 array([0.07374749, 0.98812161]),
 array([ 1247, 97278]))

In [71]:
# Continuous per-row scores from the fitted model, for the same rows it was
# trained on; used below for threshold-free ranking metrics.
scores = iforest.score_samples(x_)

In [72]:
# Show the raw score array
scores

array([-0.35831177, -0.34979052, -0.34499527, ..., -0.43336227,
       -0.3819366 , -0.36153826])

In [73]:
# Summary statistics of the raw scores
pd.Series(scores).describe()

count    98525.000000
mean        -0.377465
std          0.060215
min         -0.618794
25%         -0.416767
50%         -0.351977
75%         -0.325838
max         -0.317423
dtype: float64

In [74]:
from sklearn.preprocessing import MinMaxScaler

In [75]:
# Scratch check: flattening the scores gives one value per row
scores.reshape(-1).shape

(98525,)

In [77]:
# Rescale the scores to [0, 1]. MinMaxScaler expects a 2-D input, so the 1-D
# score vector is turned into a single column first. `np.newaxis` is used
# directly: the `pd.np` alias was deprecated in pandas 1.0 and removed in 2.0.
scores_scaled = MinMaxScaler().fit_transform(scores[:, np.newaxis])

In [78]:
# Ranking quality of the scaled scores against y (True = 'normal.').
# NOTE(review): this assumes a higher score means "more normal" - confirm
# against the score_samples documentation for the installed scikit-learn.
roc_auc_score(y, scores_scaled)

0.9700965081054004

In [79]:
# Average precision with True ('normal.') as the positive class.
# NOTE(review): normal rows are the large majority of y, so this value is close
# to 1 almost by construction; scoring the anomaly class instead (e.g. ~y
# against the negated scores) would likely be more informative - confirm intent.
average_precision_score(y, scores_scaled)

0.9996123305560368