In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle

In [2]:
data_path="./data/kdd.corrected"
col_names = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins","logged_in",
             "num_compromised","root_shell","su_attempted","num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds",
             "is_host_login","is_guest_login","count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
             "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate",
             "dst_host_srv_rerror_rate","label"]

In [3]:
df=pd.read_csv(data_path,header=None,names=col_names)
pct_anomalies=0.01

In [4]:
''' Randomly reduce the abnormal data to 1% of normal data
In this function, we try to make the abnormal data pct_anomalies=0.01 i.e. 1% of normal data
So we randomly pick, all the anomalies and only keep anomalies such that they are 1% of normal data
The ratio can be changed with pct_anomalies variable
'''

def reduce_anomalies(df, pct_anomalies=.01):
    labels = df['label'].copy()
    is_anomaly = labels != 'normal.'
    num_normal = np.sum(~is_anomaly)
    num_anomalies = int(pct_anomalies * num_normal)
    all_anomalies = labels[labels != 'normal.']
    anomalies_to_keep = np.random.choice(all_anomalies.index, size=num_anomalies, replace=False)
    anomalous_data = df.iloc[anomalies_to_keep].copy()
    normal_data = df[~is_anomaly].copy()
    new_df = pd.concat([normal_data, anomalous_data], axis=0)
    return new_df

new_df = reduce_anomalies(df, pct_anomalies=pct_anomalies)

In [5]:
# capture the categorical variables and one-hot encode them
cat_vars = ['protocol_type', 'service', 'flag', 'land', 'logged_in','is_host_login', 'is_guest_login']

# find unique labels for each category
# protocol_type has 5 unique values: udp, tcp, and, icmp
# Therefore, each category will have a unique label: protocol_type_icmp, protocol_type_tcp, and, protocol_type_udp
cat_data = pd.get_dummies(new_df[cat_vars])
cat_data

Unnamed: 0,land,logged_in,is_host_login,is_guest_login,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_auth,...,service_urp_i,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF
0,0,0,0,0,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,0,0,0,0,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,0,0,0,0,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
6,0,0,0,0,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7,0,0,0,0,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172984,0,0,0,0,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
264837,0,0,0,0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
186253,0,0,0,0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
245324,0,0,0,0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [6]:
#Remove the categorical data and labels from the dataframe
not_cat_array = list(set(new_df.columns.values.tolist()) - set(cat_vars))
not_cat_array.remove('label')
not_cat_data = new_df[not_cat_array].copy()
not_cat_data

Unnamed: 0,duration,src_bytes,dst_host_same_srv_rate,root_shell,dst_host_srv_diff_host_rate,hot,num_access_files,num_shells,wrong_fragment,num_file_creations,...,srv_diff_host_rate,diff_srv_rate,dst_host_rerror_rate,count,dst_host_srv_rerror_rate,srv_serror_rate,num_failed_logins,rerror_rate,dst_bytes,dst_host_same_src_port_rate
0,0,105,1.00,0,0.0,0,0,0,0,0,...,0.0,0.00,0.0,1,0.0,0.0,0,0.0,146,0.0
1,0,105,1.00,0,0.0,0,0,0,0,0,...,0.0,0.00,0.0,1,0.0,0.0,0,0.0,146,0.0
2,0,105,1.00,0,0.0,0,0,0,0,0,...,0.0,0.00,0.0,1,0.0,0.0,0,0.0,146,0.0
6,0,29,0.30,0,0.0,0,0,0,0,0,...,0.0,1.00,0.0,2,0.0,0.0,0,0.0,0,0.3
7,0,105,0.99,0,0.0,0,0,0,0,0,...,0.0,0.00,0.0,1,0.0,0.0,0,0.0,146,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172984,0,0,0.00,0,0.0,0,0,0,0,0,...,0.0,0.07,1.0,101,1.0,0.0,0,1.0,0,0.0
264837,0,520,1.00,0,0.0,0,0,0,0,0,...,0.0,0.00,0.0,511,0.0,0.0,0,0.0,0,1.0
186253,0,1032,1.00,0,0.0,0,0,0,0,0,...,0.0,0.00,0.0,126,0.0,0.0,0,0.0,0,1.0
245324,0,520,1.00,0,0.0,0,0,0,0,0,...,0.0,0.00,0.0,511,0.0,0.0,0,0.0,0,1.0


In [7]:
all_data = pd.concat([not_cat_data, cat_data], axis=1)
all_data

Unnamed: 0,duration,src_bytes,dst_host_same_srv_rate,root_shell,dst_host_srv_diff_host_rate,hot,num_access_files,num_shells,wrong_fragment,num_file_creations,...,service_urp_i,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF
0,0,105,1.00,0,0.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
1,0,105,1.00,0,0.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
2,0,105,1.00,0,0.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
6,0,29,0.30,0,0.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
7,0,105,0.99,0,0.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172984,0,0,0.00,0,0.0,0,0,0,0,0,...,False,True,False,False,False,False,False,False,False,False
264837,0,520,1.00,0,0.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
186253,0,1032,1.00,0,0.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
245324,0,520,1.00,0,0.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True


In [13]:
# Encode labels
# There are 16 different labels that are strings, each unique label is associated with a unique integer
le=LabelEncoder()
le.fit(df['label'])
encoded_labels = le.transform(new_df['label'])

In [14]:
# Split the data into test and training data (25% test and 75% training set)
x_train,x_test,y_train,y_test= train_test_split(all_data,encoded_labels,test_size=0.25,random_state=42)

In [15]:
# save the datasets for later use
# I am using pickle so that I can access the data later
# The alternatives are Pandas/Numpy but they are only for Dataframe/NumericValues. Picke allows to store objects
preprocessed_data = {
    'x_train':x_train,
    'y_train':y_train,
    'x_test':x_test,
    'y_test':y_test,
    'le':le
}

# pickle the preprocessed_data
path = 'preprocessed_data_full.pkl'
out = open(path, 'wb')
pickle.dump(preprocessed_data, out)
out.close()