In [None]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
import time
import os
# List the entries of the current working directory; as the last expression
# of the cell, the resulting list is what the notebook displays.
os.listdir()

In [None]:

def parse_attack_types(filename):
    """Read a whitespace-separated "attack category" table from a text file.

    Each non-blank line is expected to hold an attack name followed by its
    category. Attacks are numbered in order of first appearance, starting
    at 0; when an attack name repeats, only its first occurrence is kept.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.

    Returns
    -------
    dict
        ``{attack_name: {'encoding': int, 'category': str}}``.
    """
    attack_map = {}
    with open(filename) as f:
        for line in f:
            fields = line.split()
            # Blank lines and lines without a category column cannot form an
            # (attack, category) pair, so skip them instead of failing with
            # an IndexError on the missing second field.
            if len(fields) < 2:
                continue
            attack, category = fields[0], fields[1]
            if attack not in attack_map:
                attack_map[attack] = {
                    # The number of attacks seen so far is the next free code.
                    'encoding': len(attack_map),
                    'category': category
                }
    return attack_map


def encode_data(train_data, cols):
    """Replace the values of the given columns with integer codes, in place.

    For every column in ``cols`` the distinct values are numbered in order
    of first appearance (0, 1, 2, ...) and the column of ``train_data`` is
    overwritten with those numbers.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame whose columns are rewritten; it is modified in place.
    cols : iterable
        Labels of the columns to encode.

    Returns
    -------
    dict
        ``{column: {code: original_value}}``, the lookup needed to decode
        each encoded column again.
    """
    decoders = {}
    for column in cols:
        distinct_values = train_data[column].unique()
        value_to_code = {value: code for code, value in enumerate(distinct_values)}
        train_data[column] = train_data[column].map(value_to_code)
        decoders[column] = dict(enumerate(distinct_values))
    return decoders


def parse_data(filename):
    """Load a CSV file that has no header row.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents; because ``header=None`` is used, the first line
        is treated as data and the columns are labelled 0, 1, 2, ...
    """
    frame = pd.read_csv(filename, header=None)
    return frame

In [None]:
# Data preparation: load the attack-type table and the raw records, then
# integer-encode columns 1, 2 and 3 of the records.
print('Running project')
# attack_map: {attack_name: {'encoding': int, 'category': str}}.
attack_map = parse_attack_types('../input/kdd-cup-1999-data/training_attack_types')
print('Attack mapping:')
print(attack_map)
# The file is read without a header row, so columns are labelled by position.
train_data = parse_data('../input/kdd-cup-1999-data/kddcup.data_10_percent.gz')
print('Raw data:')
print(train_data[:2])
# encode_data overwrites the listed columns of train_data in place and
# returns, per column, the code -> original value lookup.
# NOTE(review): columns 1-3 are presumably the string-valued features —
# confirm against the dataset schema.
encodings = encode_data(train_data, (1, 2, 3))
print('Encoded data:')
print(train_data[:2])
print('Encodings:')
print(encodings)

In [None]:
def revised_attack_mapping(attack_map):
    """Map raw record labels to their attack category.

    The key for every attack is its name with a trailing ``"."`` appended.
    NOTE(review): presumably the raw label column spells labels with a
    trailing period (the notebook also adds a ``'normal.'`` key) — confirm
    against the data file.

    Parameters
    ----------
    attack_map : dict
        ``{attack_name: {'category': str, ...}}`` as produced by
        ``parse_attack_types``.

    Returns
    -------
    dict
        ``{attack_name + ".": category}``.
    """
    return {name + ".": value['category'] for name, value in attack_map.items()}

def attack_category_encoding():
    """Return a new dict giving the integer code of each attack category.

    Returns
    -------
    dict
        ``{'normal': 0, 'dos': 1, 'probe': 2, 'r2l': 3, 'u2r': 4}``.
    """
    return {
        'normal': 0,
        'dos': 1,
        'probe': 2,
        'r2l': 3,
        'u2r': 4,
    }

# Build the raw-label -> attack-category lookup from attack_map.
category_attack_map = revised_attack_mapping(attack_map)
# Add the 'normal.' label explicitly, mapped to the 'normal' category.
# NOTE(review): presumably the attack-types file lists only attacks, so
# normal traffic needs its own entry — confirm against the file.
category_attack_map['normal.'] = "normal" 

In [None]:
# Display the raw-label -> category lookup for inspection.
category_attack_map

In [None]:
# Build the category-name -> integer-code lookup and display it.
attack_category_map = attack_category_encoding()
attack_category_map

In [None]:
# Replace each raw label in column 41 with its attack category, then show the
# class balance as a bar chart and as a table of counts.
# NOTE(review): Series.map yields NaN for any value that is not a key of
# category_attack_map, and this cell overwrites column 41 — so running it a
# second time on the already-converted column is not safe. Re-run from the
# data-loading cell instead.
train_data[41] = train_data[41].map(category_attack_map)
train_data[41].value_counts().plot(kind='bar')
train_data[41].value_counts()

In [None]:
def test_classifier(clf):
    start = time.time()
    clf = clf.fit(X_train, y_train)
    training_ends = time.time()
    prediction = clf.predict(X_test)
    prediction_ends = time.time()
    result = (metrics.classification_report(y_test, prediction, output_dict = True))
    training_time = training_ends - start
    testing_time = prediction_ends - training_ends
    print (metrics.classification_report(y_test, prediction))
    acc = metrics.accuracy_score(y_test, prediction)
    print ("Accuracy Score: %s" % acc)
    print ("Classifier Training time = %s" % training_time)
    print ("Classifier Prediction time = %s" % testing_time)
    train_time.append(training_time)
    test_time.append(testing_time)
    accuracy.append(acc)
    return clf, result


In [None]:
# Features are every column except 41; column 41 is the target.
X = train_data.drop(columns=[41])
# Double brackets keep y as a one-column DataFrame rather than a Series.
y = train_data[[41]]
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=15)

In [None]:
# Per-class weights for the tree: every category counts once except 'u2r',
# which is weighted 102 times as heavily.
# NOTE(review): presumably chosen to offset how rare 'u2r' is — confirm how
# 102 was derived.
weight = {'dos': 1, 'normal': 1, 'probe': 1, 'r2l': 1, 'u2r': 102}
# random_state pins the tree's internal feature shuffling, so repeated runs
# build the same model (the train/test split is already seeded).
clf_DecisionTree_final = DecisionTreeClassifier(
    criterion='entropy', class_weight=weight, random_state=15
)
clf_DecisionTree_final, treereport_final = test_classifier(clf_DecisionTree_final)

In [None]:
import pickle

# Persist the fitted decision tree. The context manager closes the file
# handle (flushing the pickle to disk) even if dumping raises.
with open("DT_model.pkl", "wb") as model_file:
    pickle.dump(clf_DecisionTree_final, model_file)