In [34]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
data_path = 'Data/NSL-KDD/KDDTrain.csv'

In [10]:
# Load the CSV at data_path and print its dimensions as (rows, columns).
# NOTE(review): the recorded output below comes from execution In [10], which
# predates the cell that defines data_path (In [34]) — confirm that the path
# pointed at the same file when this output was produced.
df = pd.read_csv(filepath_or_buffer=data_path)
print((len(df.index), len(df.columns)))

(22544, 42)


In [37]:
# Reload the CSV at data_path, replacing whatever `df` currently holds.
df = pd.read_csv(data_path)
# Create dummy variables (run only once: the function below writes the
# modified dataset to disk).
def create_data_pca(df, target, n_components=20,
                    output_path='Data/NSL-KDD/modified/kddcup10_dummied.csv',
                    random_state=None):
    """One-hot encode the categorical columns, reduce the features with PCA and save.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the columns 'protocol_type', 'service', 'flag'
        and the column named by ``target``. The frame is not modified.
    target : str
        Name of the label column. It is excluded from the PCA and re-attached
        to the result under the same name.
    n_components : int, default 20
        Number of principal components to keep.
    output_path : str or None
        CSV file the result is written to (without the index). Pass ``None``
        to skip writing. The default is the same file ``create_data`` writes
        to, so the two functions overwrite each other's output unless a
        different path is given.
    random_state : int or None, default None
        Forwarded to ``PCA`` so that runs can be made repeatable.

    Returns
    -------
    pandas.DataFrame
        One column per principal component (labelled 0..n_components-1)
        followed by the ``target`` column.
    """
    categorical = ['protocol_type', 'service', 'flag']
    # Non-encoded columns keep their order; the dummy columns are appended.
    encoded = pd.get_dummies(df, columns=categorical, prefix=categorical)

    y = encoded[target].values
    X = encoded.drop(labels=target, axis=1)

    pca = PCA(n_components=n_components, random_state=random_state)
    reduced = pd.DataFrame(pca.fit_transform(X))
    # Use the caller-supplied label name instead of a hard-coded 'defects'.
    reduced[target] = y

    if output_path is not None:
        reduced.to_csv(output_path, index=False)
    return reduced
# Rebinds the global `df` to the PCA-reduced frame returned by the function;
# cells that read `df` afterwards see this frame, not the raw CSV.
df = create_data_pca(df,'defects')

In [35]:
# Reload the CSV at data_path, replacing whatever `df` currently holds.
df = pd.read_csv(data_path)
# Create dummy variables (run only once: the function below writes the
# modified dataset to disk).
def create_data(df, target, output_path='Data/NSL-KDD/modified/kddcup10_dummied.csv'):
    """One-hot encode the categorical columns and save the resulting frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the columns 'protocol_type', 'service' and
        'flag'. The frame is not modified.
    target : str
        Name of the label column. Not used by this function; kept so the
        signature matches the other ``create_data_*`` helpers.
    output_path : str or None
        CSV file the result is written to (without the index). Pass ``None``
        to skip writing.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns in their original order, followed by
        the dummy columns for 'protocol_type', 'service' and 'flag'.
    """
    categorical = ['protocol_type', 'service', 'flag']
    # Non-encoded columns keep their order; the dummy columns are appended.
    encoded = pd.get_dummies(df, columns=categorical, prefix=categorical)

    if output_path is not None:
        encoded.to_csv(output_path, index=False)
    return encoded
# Rebinds the global `df` to the one-hot encoded frame returned by the function;
# cells that read `df` afterwards see this frame, not the raw CSV.
df = create_data(df,'defects')

In [None]:
# Reload the CSV at data_path, replacing whatever `df` currently holds.
df = pd.read_csv(data_path)
# Create dummy variables (run only once: the function below writes the
# modified dataset to disk).
def create_data_normalized(df, target,
                           output_path='Data/NSL-KDD/modified/kddcup10_dummied_normalized.csv'):
    """One-hot encode the categorical columns, standardize the features and save.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the columns 'protocol_type', 'service', 'flag'
        and the column named by ``target``. The frame is not modified.
    target : str
        Name of the label column. It is left unscaled and re-attached as the
        last column of the result under the same name.
    output_path : str or None
        CSV file the result is written to (without the index). Pass ``None``
        to skip writing.

    Returns
    -------
    pandas.DataFrame
        The standardized feature columns, each under its own name, followed
        by the ``target`` column.
    """
    categorical = ['protocol_type', 'service', 'flag']
    # Non-encoded columns keep their order; the dummy columns are appended.
    encoded = pd.get_dummies(df, columns=categorical, prefix=categorical)

    y = encoded[target]
    X = encoded.drop(labels=target, axis=1)

    # StandardScaler standardizes each feature (it is not a min-max scaler).
    scaler = preprocessing.StandardScaler()
    scaled_values = scaler.fit_transform(X)

    # Label the scaled columns with the feature names themselves. The previous
    # `columns[:-1]` assumed the target was the last column; after the dummy
    # columns are appended it is not, so feature names were shifted and one
    # scaled feature was overwritten by the labels.
    scaled = pd.DataFrame(scaled_values, columns=X.columns)
    # Positional assignment: `scaled` has a fresh default index.
    scaled[target] = y.values

    if output_path is not None:
        scaled.to_csv(output_path, index=False)
    return scaled
# Rebinds the global `df` to the standardized frame returned by the function;
# cells that read `df` afterwards see this frame, not the raw CSV.
df = create_data_normalized(df,'defects')

In [None]:
# Distinct values of the `defects` column of the current `df`
# (the class-wise export cell below recomputes the same value).
targets = df.defects.unique()

In [38]:
# Split the current `df` by class label and write one CSV file per class.
targets = df.defects.unique()
for target in targets:
    # Only the part of the label before the first '.' is used in the file name.
    class_name = target.split('.')[0]
    sub_data_path = 'Data/NSL-KDD/modified/kddcup10_{}.csv'.format(class_name)
    is_class = df['defects'] == target
    sub_df = df[is_class].reset_index(drop=True)
    sub_df.to_csv(sub_data_path, index=False)

In [33]:
# Build a two-class training set by stacking the per-class 'normal' and
# 'neptune' files and write it to train.csv.
class1_path = 'Data/NSL-KDD/modified/kddcup10_normal.csv'
class2_path = 'Data/NSL-KDD/modified/kddcup10_neptune.csv'
train_df_path = 'Data/NSL-KDD/modified/train.csv'
df_class1, df_class2 = (pd.read_csv(path) for path in (class1_path, class2_path))
train_df = pd.concat([df_class1, df_class2], ignore_index=True)
train_df.to_csv(train_df_path, index=False)

In [8]:
# Build a single-class training set from the per-class 'normal' file and
# write it to train.csv.
class1_path = 'Data/NSL-KDD/modified/kddcup10_normal.csv'
train_df_path = 'Data/NSL-KDD/modified/train.csv'
# train_df and df_class1 refer to the same frame (no copy is made).
train_df = df_class1 = pd.read_csv(class1_path)
train_df.to_csv(train_df_path, index=False)

In [39]:
# Create training set with all classes.
# The whole of `df` is written as the training set (no train/test split is
# made); the label column is moved to the last position.
target = 'defects'
y = df[target]
X = df.drop(labels=target, axis=1)
# Work on a copy so that X stays features-only. Previously train_df was an
# alias of X, so adding the label to train_df also added it to X.
train_df = X.copy()
train_df[target] = y
train_df_path = 'Data/NSL-KDD/modified/train.csv'
train_df.to_csv(train_df_path, index=False)

In [None]:
# Create test dataset restricted to the selected classes, rows grouped by class
# in the order the labels appear in `targets`.
# NOTE(review): this cell was titled "without normal", but the filter keeps
# 'normal' — confirm which of the two is intended.
selected_classes = ['normal', 'neptune']  # 'neptune','smurf'
# Collect the per-class frames and concatenate once instead of growing
# test_df with pd.concat inside the loop.
class_frames = [
    df[df['defects'] == target].reset_index(drop=True)
    for target in targets
    if target.split('.')[0] in selected_classes
]
# pd.concat raises on an empty list, so fall back to an empty frame.
test_df = pd.concat(class_frames, ignore_index=True) if class_frames else pd.DataFrame([])
test_df_path = 'Data/NSL-KDD/modified/test.csv'
test_df.to_csv(test_df_path, index=False)

In [13]:
# Create test dataset with every class (including normal), rows grouped by
# class in the order the labels appear in `targets`.
# Collect the per-class frames and concatenate once instead of growing
# test_df with pd.concat inside the loop.
class_frames = [
    df[df['defects'] == target].reset_index(drop=True)
    for target in targets
]
# pd.concat raises on an empty list, so fall back to an empty frame.
test_df = pd.concat(class_frames, ignore_index=True) if class_frames else pd.DataFrame([])
test_df_path = 'Data/NSL-KDD/modified/test.csv'
test_df.to_csv(test_df_path, index=False)

In [5]:
# Per-class count of non-null values in each column of the training set.
train_df.groupby('defects').count()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
defects,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
normal,13449,13449,13449,13449,13449,13449,13449,13449,13449,13449,13449,13449,13449,13449,13449,13449,13449,13449,13449,13449


In [None]:
# (rows, columns) of the assembled test set.
test_df.shape

In [14]:
# Per-class count of non-null values in each column of the test set.
test_df.groupby('defects').count()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
defects,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
apache2,737,737,737,737,737,737,737,737,737,737,737,737,737,737,737,737,737,737,737,737
back,359,359,359,359,359,359,359,359,359,359,359,359,359,359,359,359,359,359,359,359
buffer_overflow,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
ftp_write,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
guess_passwd,1231,1231,1231,1231,1231,1231,1231,1231,1231,1231,1231,1231,1231,1231,1231,1231,1231,1231,1231,1231
httptunnel,133,133,133,133,133,133,133,133,133,133,133,133,133,133,133,133,133,133,133,133
imap,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
ipsweep,141,141,141,141,141,141,141,141,141,141,141,141,141,141,141,141,141,141,141,141
land,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
loadmodule,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
