In [1]:
import pandas as pd
import numpy as np
import urllib
import requests

In [2]:
headers_link = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names'

In [3]:
# Download the names file and derive the CSV column names from it.
response = requests.get(headers_link, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
contents = response.text
# The first line is skipped; on every remaining non-blank line the text before ':'
# is the column name. Blank lines are dropped explicitly so the result does not
# depend on whether the file ends with a newline.
headers = [line.split(':')[0] for line in contents.splitlines()[1:] if line.strip()]
# The data file carries one extra trailing column holding the class label.
headers.append('label')

In [4]:
# Load the data file; it has no header row, so the column names parsed
# above are supplied explicitly.
df = pd.read_csv(
    'resources/datasets/kddcup.data_10_percent',
    names=headers,
    header=None,
)

In [5]:
from pandas.core.frame import DataFrame

In [6]:
# (rows, columns) of the loaded frame.
df.shape

(494021, 42)

In [7]:
# Preview the first rows to confirm the column names line up with the data.
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [8]:
# Per-column dtypes; the object-typed columns are the categorical features encoded later.
df.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [9]:
# Number of records per protocol_type value.
df.protocol_type.value_counts()

icmp    283602
tcp     190065
udp      20354
Name: protocol_type, dtype: int64

In [10]:
# Re-check of df's shape. NOTE(review): repeats an earlier cell; candidate for removal.
df.shape

(494021, 42)

In [11]:
# Relative frequency of each label across the full data set.
df.label.value_counts(normalize=True)

smurf.              0.568377
neptune.            0.216997
normal.             0.196911
back.               0.004459
satan.              0.003216
ipsweep.            0.002524
portsweep.          0.002105
warezclient.        0.002065
teardrop.           0.001982
pod.                0.000534
nmap.               0.000468
guess_passwd.       0.000107
buffer_overflow.    0.000061
land.               0.000043
warezmaster.        0.000040
imap.               0.000024
rootkit.            0.000020
loadmodule.         0.000018
ftp_write.          0.000016
multihop.           0.000014
phf.                0.000008
perl.               0.000006
spy.                0.000004
Name: label, dtype: float64

In [12]:
# Keep only the 'normal.' and 'ipsweep.' records and renumber the rows from 0.
df_ = (
    df.loc[df.label.isin(['normal.', 'ipsweep.'])]
    .reset_index(drop=True)
)

In [13]:
# Class balance after filtering to the two retained labels.
df_.label.value_counts(normalize=True)

normal.     0.987343
ipsweep.    0.012657
Name: label, dtype: float64

In [14]:
# (rows, columns) of the filtered frame.
df_.shape

(98525, 42)

In [15]:
# Feature matrix: every column of the filtered frame except the label. The column
# list is taken from df_ itself (not the unfiltered df), so the selection cannot
# drift if the two frames ever diverge. Index.difference returns the names sorted.
x = df_[df_.columns.difference(['label'])]
# Target: True for 'normal.' records, False for the remaining (ipsweep) records.
y = df_.label == 'normal.'

In [16]:
# Names of the object-typed (categorical) feature columns, read from x itself
# rather than from the unfiltered df, in x's column order.
categorical_cols = x.select_dtypes(include=['object']).columns.tolist()

In [17]:
# Distinct values per categorical column, i.e. how many dummy columns each will produce.
x[categorical_cols].nunique()

flag              9
protocol_type     3
service          32
dtype: int64

In [18]:
# Number of distinct target values.
y.nunique()

2

In [19]:
# One-hot encode the categorical columns; the remaining columns are kept as they are.
x_ = pd.get_dummies(x, columns=categorical_cols)

In [20]:
# (rows, columns) of the encoded feature matrix.
x_.shape

(98525, 82)

In [21]:
# Preview the encoded feature matrix.
x_.head()

Unnamed: 0,count,diff_srv_rate,dst_bytes,dst_host_count,dst_host_diff_srv_rate,dst_host_rerror_rate,dst_host_same_src_port_rate,dst_host_same_srv_rate,dst_host_serror_rate,dst_host_srv_count,...,service_shell,service_smtp,service_ssh,service_telnet,service_tftp_u,service_tim_i,service_time,service_urh_i,service_urp_i,service_whois
0,8,0.0,5450,9,0.0,0.0,0.11,1.0,0.0,9,...,0,0,0,0,0,0,0,0,0,0
1,8,0.0,486,19,0.0,0.0,0.05,1.0,0.0,19,...,0,0,0,0,0,0,0,0,0,0
2,8,0.0,1337,29,0.0,0.0,0.03,1.0,0.0,29,...,0,0,0,0,0,0,0,0,0,0
3,6,0.0,1337,39,0.0,0.0,0.03,1.0,0.0,39,...,0,0,0,0,0,0,0,0,0,0
4,6,0.0,2032,49,0.0,0.0,0.02,1.0,0.0,49,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Share of normal (True) vs. non-normal (False) rows in the target.
y.value_counts(normalize=True)

True     0.987343
False    0.012657
Name: label, dtype: float64

In [23]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score, average_precision_score

In [24]:
%%time
iforest = IsolationForest(contamination=0.012657, behaviour='new')
iforest.fit(x_)

CPU times: user 7.27 s, sys: 1.63 s, total: 8.91 s
Wall time: 8.37 s


In [44]:
# Predict on the same rows the forest was fitted on. The result holds 1 / -1 labels;
# later cells treat 1 as "predicted normal".
x_predicted = iforest.predict(x_)

In [47]:
# Share of each predicted label.
pd.Series(x_predicted).value_counts(normalize=True)

 1    0.987333
-1    0.012667
dtype: float64

In [55]:
# Cross-tabulate predicted vs. actual "normal" flags, then print the raw counts
# next to each cell's share of all rows.
q = (
    pd.DataFrame(dict(predicted=(x_predicted == 1), actual=y))
    .groupby(['predicted', 'actual'])
    .size()
)
print(pd.DataFrame(dict(counts=q, pct=q.div(q.sum()))))

                  counts       pct
predicted actual                  
False     False       87  0.000883
          True      1161  0.011784
True      False     1160  0.011774
          True     96117  0.975560


In [28]:
# F1 with "normal." as the positive class (y is True for normal rows). Because normal
# rows are the large majority, this value says little about how well the ipsweep
# anomalies are detected; see the per-class precision/recall further down.
f1_score(y, x_predicted == 1)

0.9880702115083139

In [59]:
# Class prevalence of y. value_counts orders by frequency, so the majority class
# (True / normal) comes first here.
prev = y.value_counts(normalize=True)

In [60]:
# Display the prevalence Series.
prev

True     0.987343
False    0.012657
Name: label, dtype: float64

In [30]:
# Prevalence values as a plain list, in the Series' own order.
list(prev)

[0.987343313879726, 0.012656686120274043]

In [61]:
# Scratch check of dict.items(); not used by the analysis.
# NOTE(review): candidate for removal.
{1: 2}.items()

dict_items([(1, 2)])

In [56]:
# Baseline: median F1 of random labels drawn with the same prevalence as y.
# The probability of "normal" is computed from y directly, so the draw does not
# depend on the positional order value_counts() happens to return, and a seeded
# RandomState makes the reported number reproducible.
p_normal = float(y.mean())
rng = np.random.RandomState(0)
pd.Series([
    f1_score(y, rng.choice([True, False], p=[p_normal, 1 - p_normal], size=len(y)))
    for _ in range(100)
]).median()


0.9873585576498685

In [32]:
# Baseline: ROC AUC of random labels drawn with the same prevalence as y, summarised
# over 100 draws. The class probability is computed from y directly (not taken
# positionally from value_counts), and the generator is seeded for reproducibility.
p_normal = float(y.mean())
rng = np.random.RandomState(0)
pd.Series([
    roc_auc_score(y, rng.choice([True, False], p=[p_normal, 1 - p_normal], size=len(y)))
    for _ in range(100)
]).describe()

count    100.000000
mean       0.499917
std        0.001505
min        0.496942
25%        0.498901
50%        0.499793
75%        0.500714
max        0.503928
dtype: float64

In [33]:
# One random labeling with the same prevalence as y, cross-tabulated against the truth
# for comparison with the model's confusion table. The class probability comes from y
# directly (not positionally from value_counts) and the draw is seeded.
p_normal = float(y.mean())
random_pred = np.random.RandomState(0).choice(
    [True, False], p=[p_normal, 1 - p_normal], size=len(y)
)
pd.DataFrame({'predicted': random_pred, 'actual': y}) \
.groupby(['predicted', 'actual']).size()

predicted  actual
False      False        16
           True       1203
True       False      1231
           True      96075
dtype: int64

In [34]:
# Per-class precision, recall, F-score and support. Each array is ordered by class
# label: False (non-normal / ipsweep) first, True (normal) second, which matches the
# support counts shown in the output.
precision_recall_fscore_support(y, x_predicted == 1)

(array([0.06971154, 0.98807529]),
 array([0.06976744, 0.98806513]),
 array([0.06973948, 0.98807021]),
 array([ 1247, 97278]))

In [35]:
# Continuous anomaly scores for every row (per the scikit-learn docs, lower means
# more abnormal), used below for threshold-free ranking metrics.
scores = iforest.score_samples(x_)

In [36]:
# Display the raw score array.
scores

array([-0.35491352, -0.34993077, -0.34602508, ..., -0.43765638,
       -0.37153502, -0.35918907])

In [37]:
# Summary statistics of the anomaly scores.
pd.Series(scores).describe()

count    98525.000000
mean        -0.378228
std          0.056383
min         -0.613645
25%         -0.411149
50%         -0.355956
75%         -0.330370
max         -0.319854
dtype: float64

In [38]:
from sklearn.preprocessing import MinMaxScaler

In [39]:
# Scratch check that the flattened scores hold one value per row.
# NOTE(review): candidate for removal.
scores.reshape(-1).shape

(98525,)

In [40]:
# Rescale the scores to [0, 1]. MinMaxScaler expects a 2-D input, so the 1-D score
# vector is turned into a single column first. `np.newaxis` is used directly:
# the `pd.np` alias was deprecated in pandas 1.0 and removed in pandas 2.0.
scores_scaled = MinMaxScaler().fit_transform(scores[:, np.newaxis])

In [41]:
# ROC AUC of the scaled scores against y (True = normal): higher scores are
# expected for normal rows.
roc_auc_score(y, scores_scaled)

0.94969896542178

In [42]:
# Average precision with "normal." (True) as the positive class. With roughly 98.7%
# positives, a random ranking would already score about 0.987, so this value is
# close to 1 almost regardless of model quality.
# NOTE(review): to judge anomaly detection, consider scoring the ipsweep class
# instead, e.g. average_precision_score(~y, -scores) — confirm intent.
average_precision_score(y, scores_scaled)

0.9993424261231967

In [63]:
# Scratch check of building a frame from nested dicts and transposing it; not used
# by the analysis. NOTE(review): candidate for removal.
pd.DataFrame({'x': {'a': 1}, 'y': {'a': 2}}).transpose()

Unnamed: 0,a
x,1
y,2
