In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import mode
from sklearn import linear_model
import matplotlib
import matplotlib.pyplot as plt
from sklearn import discriminant_analysis
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier as KNN
%matplotlib inline

import pandas as pd
import numpy as np
import os
import time

from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle

In [None]:
# Where the processed train/test CSV files are written
RESULTS_DIR = '../data/processed/'
TRAIN_RFILE = RESULTS_DIR + 'new_trainData.csv'
TEST_RFILE = RESULTS_DIR + 'new_testData.csv'

# Where the raw input files live, and which file numbers feed each split
DATA_DIR = '../data/raw/2008/'
TRAIN_FILES = ['0101']
# TEST_FILES = ['0710', '0725', '0810', '0825', '0910', '0923',
#               '1010', '1025', '1110', '1125', '1210', '1225']
TEST_FILES = ['0102']

# Column layout of the raw files (indices are positions in the raw file)
LABEL_COL = 17
# categorical column index -> list of the values listed for that column
CATEGORICAL_COLS = {
    13: [
        'S0', 'S1', 'SF', 'REJ', 'S2', 'S3', 'RSTO',
        'RSTR', 'RSTOS0', 'RSTRH', 'SH', 'SHR', 'OTH',
    ],
    LABEL_COL: [-1, 1],
}
NUMERIC_COLS = [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 23]
# every column that is read from the raw files, in ascending order
USE_COLS = sorted(NUMERIC_COLS + list(CATEGORICAL_COLS))


def main():
    """
    Entry point: preprocesses the training files and the testing files and
    saves each set to its own separate results file.
    :return: void
    """
    jobs = ((TRAIN_FILES, TRAIN_RFILE), (TEST_FILES, TEST_RFILE))
    for file_nums, target_file in jobs:
        source_paths = [create_source_fpath(num) for num in file_nums]
        preprocess(source_paths, target_file)


def encode_categorical(mat):
    """
    Splits a data matrix into a feature matrix and a label column.

    NOTE: despite its name, this function currently performs no encoding.
    The one-hot encoding step (sklearn's OneHotEncoder over the categorical
    feature columns) is disabled, so categorical features are returned
    exactly as they appear in mat.

    :param mat: numpy matrix of shape [n_examples, n_features+1] whose columns
        follow the order of USE_COLS
    :return: (X, y) where X is mat with the label column removed and y is a
        numpy column vector of shape [n_examples, 1] holding the labels
        corresponding to rows of X
    :raises ValueError: if LABEL_COL is not one of USE_COLS
    """
    # columns of mat are laid out like USE_COLS, so the label sits at the
    # position LABEL_COL occupies in that list
    label_index = USE_COLS.index(LABEL_COL)

    # extract X and y
    X = np.delete(mat, label_index, 1)
    y = mat[:, label_index]

    return X, y.reshape((-1, 1))


def preprocess(filepaths, results_file, use_binary_classification=True):
    """
    Takes a list of data files, preprocesses them (removal of absolute duplicates,
    optional merging of labels, shuffling), and saves the results to a single
    headerless CSV file. Overwrites previous results file if it exists.

    Each output row holds the feature columns in USE_COLS order (without the
    label) followed by the label as the last column. Numeric values are written
    with four decimals; categorical feature values are written as plain text.

    :param filepaths: iterable of full filenames to process
    :param results_file: full filename for storing results
    :param use_binary_classification: boolean indicating whether known and unknown attacks
        should be combined (label -2 is rewritten to -1)
    :return: void
    """
    kwargs = dict(header=False, float_format='%.4f', index=False)

    # start from scratch so the appends below never mix with stale results
    if os.path.isfile(results_file):
        os.remove(results_file)

    # Positions, within the saved matrix, of feature columns that hold category
    # strings. encode_categorical moves the label to the end, so feature
    # positions follow USE_COLS with the label column taken out.
    feature_cols = [col for col in USE_COLS if col != LABEL_COL]
    categorical_positions = {pos for pos, col in enumerate(feature_cols)
                             if col in CATEGORICAL_COLS}

    for filepath in filepaths:
        df = pd.read_csv(filepath_or_buffer=filepath,
                         usecols=USE_COLS,
                         sep=r'\s+',  # same as the deprecated delim_whitespace=True
                         header=None)
        # copy so the label rewrite below acts on an independent frame
        cleaned_df = df.drop_duplicates().copy()

        if use_binary_classification:
            cleaned_df[LABEL_COL] = cleaned_df[LABEL_COL].replace({-2: -1})

        mat = np.hstack(encode_categorical(cleaned_df.values))

        # shuffle data
        mat = shuffle(mat, random_state=0)

        # Convert column by column: casting the whole matrix to float would
        # fail on the string-valued categorical column(s), which stay as text.
        out = pd.DataFrame(data=mat)
        numeric_dtypes = {pos: float for pos in out.columns
                          if pos not in categorical_positions}
        out = out.astype(numeric_dtypes)

        # save to file; append mode creates the file on the first iteration
        out.to_csv(results_file, mode='a', **kwargs)


def create_source_fpath(filenum):
    """
    Builds the full path of a raw data file.
    :param filenum: string placed between the '2008' prefix and the '.txt'
        extension of the file name (e.g. '0101')
    :return: path string located under DATA_DIR
    """
    filename = ''.join(('2008', filenum, '.txt'))
    return DATA_DIR + filename


if __name__ == '__main__':
    # run the whole pipeline and report how long it took (wall-clock)
    print('processing data...')
    began_at = time.time()
    main()
    elapsed = time.time() - began_at
    summary = "finished writing data to '{}' and '{}' in {} seconds"
    print(summary.format(TRAIN_RFILE, TEST_RFILE, elapsed))
