In [1]:
import pandas as pd
from tensorflow.keras.utils import get_file
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

def getData(file_path):
    """Fetch the KDD Cup 99 "10 percent" archive via ``get_file`` and load it.

    Parameters
    ----------
    file_path : str
        Passed to ``get_file`` as its first argument (the file name to
        store the download under).

    Returns
    -------
    pandas.DataFrame
        The CSV contents, with all-column NaN filtering applied and the
        42 column names listed below assigned (the last one, 'outcome',
        holds the record label).

    Raises
    ------
    Exception
        Any error raised by ``get_file`` is re-raised unchanged after
        'Error Downloading' has been printed.
    """
    origin_url = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'

    try:
        path = get_file(file_path, origin=origin_url)
    except Exception:
        # `except Exception` (not a bare `except`) so that Ctrl-C /
        # SystemExit are not reported as a download failure.
        print('Error Downloading')
        raise

    # The file has no header row; names are assigned explicitly below.
    df = pd.read_csv(path, header=None)
    # Drop columns containing missing values before naming them.
    df = df.dropna(axis=1)

    df.columns = [
        'duration',
        'protocol_type',
        'service',
        'flag',
        'src_bytes',
        'dst_bytes',
        'land',
        'wrong_fragment',
        'urgent',
        'hot',
        'num_failed_logins',
        'logged_in',
        'num_compromised',
        'root_shell',
        'su_attempted',
        'num_root',
        'num_file_creations',
        'num_shells',
        'num_access_files',
        'num_outbound_cmds',
        'is_host_login',
        'is_guest_login',
        'count',
        'srv_count',
        'serror_rate',
        'srv_serror_rate',
        'rerror_rate',
        'srv_rerror_rate',
        'same_srv_rate',
        'diff_srv_rate',
        'srv_diff_host_rate',
        'dst_host_count',
        'dst_host_srv_count',
        'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate',
        'dst_host_srv_serror_rate',
        'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate',
        'outcome'
    ]
    return df


def Preprocessing(df, cat_col_idx):
    """Standardise the numeric columns and one-hot encode six categorical ones.

    The frame passed in is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'protocol_type', 'service', 'flag',
        'logged_in', 'is_host_login' and 'is_guest_login'.
    cat_col_idx : sequence of int
        Positional indices of the columns that must NOT be standardised.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after scaling, encoding and dropping any
        column that contains NaN.
    """

    def _one_hot_in_place(frame, column):
        """Append one '<column>-<value>' indicator per value, then drop ``column``."""
        indicators = pd.get_dummies(frame[column])

        for value in indicators.columns:
            frame["{}-{}".format(column, value)] = indicators[value]

        frame.drop(column, axis = 1, inplace = True)

    # Every column whose position is not listed in cat_col_idx gets scaled.
    all_names = df.columns.tolist()
    to_scale = np.delete(all_names, cat_col_idx)

    scaler = StandardScaler()

    # Each column is fitted and transformed on its own.
    for column in to_scale:
        df[column] = scaler.fit_transform(df[[column]])

    categorical = (
        'protocol_type',
        'service',
        'flag',
        'logged_in',
        'is_host_login',
        'is_guest_login',
    )
    for column in categorical:
        _one_hot_in_place(df, column)

    df.dropna(inplace = True, axis = 1)

    return df
        
def SplitData(df, testsize = None, seed = None):
    """Split a labelled frame into normal-train, normal-test and attack arrays.

    Rows whose 'outcome' equals 'normal.' are divided with
    ``train_test_split``; every other row is returned as the attack set.
    The 'outcome' column is excluded from all returned arrays.

    The input frame is not modified: the feature matrix is built from a
    copy without 'outcome', so the function can be called repeatedly on
    the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'outcome' column.
    testsize : float or int
        Forwarded to ``train_test_split`` as ``test_size``. Required.
    seed : int, optional
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_normal_train, x_normal_test, x_attack)``.

    Raises
    ------
    AssertionError
        If ``testsize`` is None.
    """
    if testsize is None:
        raise AssertionError("Testsize must be defined.")

    is_normal = df['outcome'] == 'normal.'

    # Non-mutating drop: the caller's frame keeps its 'outcome' column.
    features = df.drop(columns = 'outcome')

    x_normal = features[is_normal].values
    x_attack = features[~is_normal].values

    x_normal_train, x_normal_test = train_test_split(x_normal, test_size = testsize, random_state = seed)

    return x_normal_train, x_normal_test, x_attack

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [8]:
from sklearn.ensemble import IsolationForest

In [9]:
# Positional indices of the columns Preprocessing must not standardise:
# 1 protocol_type, 2 service, 3 flag, 6 land, 11 logged_in,
# 20 is_host_login, 21 is_guest_login, 41 outcome.
df = getData('kddcup.data_10_percent.gz')
df = Preprocessing(df, [1,2,3,6,11,20,21,41])

# Later cells use X_normal_train / X_normal_test / X_attack, but no visible
# cell defines them, so the notebook failed on a fresh "Restart & Run All".
# testsize = 0.25 matches the recorded split sizes (72958 train / 24320 test
# normal rows).
# NOTE(review): the seed used for the original split is not recorded; 42
# mirrors the seed passed to Modeling below - confirm.
X_normal_train, X_normal_test, X_attack = SplitData(df, testsize = 0.25, seed = 42)

In [24]:
from sklearn import metrics

In [71]:
class SimpleIsolationForest:
    """Small wrapper that fits an IsolationForest and reports 0/1 labels.

    ``Prediction`` maps the model's output to 1 where the model returned
    -1 (reported as anomaly) and to 0 everywhere else (reported as normal).
    """

    # data_type values accepted by Prediction; also used as the report header.
    _DATA_TYPES = ('Insample', 'OutOfSample', 'Attack')

    def __init__(self, df):
        """Store ``df`` on the instance (no method of this class reads it)."""
        self.df = df

    def Modeling(self, train_data, seed):
        """Fit an IsolationForest on ``train_data`` and keep it as ``self.model``.

        Parameters
        ----------
        train_data : array-like
            Passed to ``IsolationForest.fit``.
        seed : int or None
            Passed to ``IsolationForest`` as ``random_state``.
        """
        self.train_data = train_data
        self.seed = seed

        self.model = IsolationForest(random_state = self.seed).fit(self.train_data)

    def Prediction(self, test_data, data_type):
        """Predict on ``test_data``, print the class counts and return the labels.

        Parameters
        ----------
        test_data : array-like
            Passed to ``self.model.predict``.
        data_type : str
            One of 'Insample', 'OutOfSample' or 'Attack'; printed as the
            header of the report.

        Returns
        -------
        list
            One label per row: 1 where the model predicted -1, else 0.
            The same list is stored as ``self.pred``. It is returned for
            every valid ``data_type`` (previously only for 'Attack').

        Raises
        ------
        AssertionError
            If ``data_type`` is None.
        ValueError
            If ``data_type`` is not one of the accepted values
            (previously such a call silently did nothing).
        """
        self.test_data = test_data

        if data_type is None:
            raise AssertionError('Data Type must be defined.')

        if data_type not in self._DATA_TYPES:
            raise ValueError(
                'Unknown data_type {!r}; expected one of {}.'.format(data_type, self._DATA_TYPES)
            )

        raw = np.asarray(self.model.predict(self.test_data))
        # Vectorised relabelling: -1 -> 1, anything else -> 0.
        labels = np.where(raw == -1, 1, 0)

        anomaly_count = int(labels.sum())
        normal_count = int(labels.size) - anomaly_count

        # The header names the data set actually evaluated (it used to say
        # 'Insample' for every data_type).
        print('{} Classification Result \n'.format(data_type))
        # Count-line text is kept byte-for-byte, spelling included, so the
        # output stays comparable with earlier runs.
        print('Normal Value: {}'.format(normal_count))
        print('Anomlay Value: {}'.format(anomaly_count))

        self.pred = list(labels)

        return self.pred

In [73]:
# The constructor only stores its argument; fitting happens in Modeling.
tmp = SimpleIsolationForest(df = X_normal_train)

In [74]:
# Fit the isolation forest on the training split with a fixed random_state.
tmp.Modeling(train_data = X_normal_train, seed = 42)

In [75]:
# Held-out split; the trailing ';' keeps the cell output to the printed summary.
tmp.Prediction(test_data = X_normal_test, data_type = 'OutOfSample');

Insample Classification Result 

Normal Value: 21946
Anomlay Value: 2374


In [76]:
# Same data the model was fitted on; the trailing ';' keeps the cell output
# to the printed summary.
tmp.Prediction(test_data = X_normal_train, data_type = 'Insample');

Insample Classification Result 

Normal Value: 65667
Anomlay Value: 7291


In [None]:
# Keep the returned 0/1 labels under a name instead of echoing the whole
# list as the cell's Out[] value; the printed summary is still shown.
attack_pred = tmp.Prediction(X_attack, data_type = 'Attack')