In [1]:
import os
import torch
import torch.nn as nn
from torch import optim
import torchvision.datasets as dset
import torchvision.transforms as transforms

import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
import numpy as np
import matplotlib
import pandas as pd

In [137]:
data_dir = "../data"
data_name = '/NSL-KDD'
path = data_dir + data_name
df_train = pd.read_csv(path+'/KDDTrain+_20Percent.txt', header=None)

In [138]:
columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'target', 'level'
]
len(columns)

43

In [139]:
path

'../data/NSL-KDD'

In [19]:
cols = dict(zip(range(len(columns)), columns))
df_train.rename(columns=cols, inplace=True)
df_train.head()

In [29]:
# 欠損値の確認
df_train.isnull().values.sum()

0

In [59]:
# target: 攻撃の種類と正常
df_train['target'].unique()
target_dict = dict(zip(df_train['target'].unique(), range(len(df_train['target'].unique()))))
target_dict

{'normal': 0,
 'neptune': 1,
 'warezclient': 2,
 'ipsweep': 3,
 'portsweep': 4,
 'teardrop': 5,
 'nmap': 6,
 'satan': 7,
 'smurf': 8,
 'pod': 9,
 'back': 10,
 'guess_passwd': 11,
 'ftp_write': 12,
 'multihop': 13,
 'rootkit': 14,
 'buffer_overflow': 15,
 'imap': 16,
 'warezmaster': 17,
 'phf': 18,
 'land': 19,
 'loadmodule': 20,
 'spy': 21}

In [96]:
train_data = pd.get_dummies(df_train.iloc[:, :-2]).values
train_target = pd.get_dummies(df_train.iloc[:, -2].map(target_dict)).values

In [102]:
def preprocess_NSL(df, columns):
    cols = dict(zip(range(len(columns)), columns))
    df.rename(columns=cols, inplace=True)
    df.head()

    target_dict = dict(zip(df['target'].unique(), range(len(df['target'].unique()))))

    data = pd.get_dummies(df.iloc[:, :-2]).values
    target = df.iloc[:, -2].map(target_dict).values

    return data, target

In [114]:
def Load_KDD(path):
    df_train = pd.read_csv(path+'/KDDTrain+_20Percent.txt', header=None)
    df_test = pd.read_csv(path+'/KDDTest+.txt', header=None)
    columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
                'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
                'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
                'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
                'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
                'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'target', 'level'
    ]

    print(columns)

    train_data, train_target = preprocess_NSL(df_train, columns)
    test_data, test_target = preprocess_NSL(df_test, columns)

    return train_data, train_target, test_data, test_target

In [115]:
td, tt, tsd, tst = Load_KDD(path)
tst

['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'target', 'level']


array([0, 0, 1, ..., 1, 1, 1], dtype=int64)

In [116]:
max(tst)

21

In [119]:
dataset = torch.utils.data.TensorDataset(torch.tensor(td), torch.tensor(tt))

In [121]:
dataset[0]

(tensor([0.0000e+00, 4.9100e+02, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 2.0000e+00, 2.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 1.5000e+02, 2.5000e+01,
         1.7000e-01, 3.0000e-02, 1.7000e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         5.0000e-02, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e