In [1]:
# Imports

import json
import os
import pickle
import numpy as np
import pandas as pd
import sklearn as sk
from matplotlib import pyplot as plt
from sklearn import tree, feature_selection, model_selection, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Constant definitions

# --- File locations (relative paths, built from a shared prefix) ---
DATA_PATH_PREFIX = os.path.join('..', 'test_data')
DATA_IN_PATH = os.path.join(DATA_PATH_PREFIX, 'dataset_1.csv')   # input dataset
SCHEMA_IN_PATH = os.path.join(DATA_PATH_PREFIX, 'dtypes.json')   # column dtypes for read_csv
DATA_OUT_PATH = os.path.join(DATA_PATH_PREFIX, 'model.json')     # exported model

# --- Columns used as model features, in the order they are fed to the model ---
FEATURE_LIST = [
    'frame_len',
    'eth_type',
    'ip_proto',
    'ip_flags',
    'ipv6_nxt',
    'ipv6_opt',
    'tcp_srcport',
    'tcp_dstport',
    'tcp_flags',
    'udp_srcport',
    'udp_dstport',
]

# --- Class names written to the exported model ---
# NOTE(review): presumably the position in this list is the numeric class id
# used in the dataset's "label" column - confirm against the dataset.
CLASS_LIST = ['static', 'sensors', 'audio', 'video', 'other']

# --- Random forest hyper-parameters ---
RF_NUM_TREES = 3
RF_TREE_DEPTH = 3
PAR_JOBS = RF_NUM_TREES  # one parallel job per tree

In [3]:
# Function definitions

def _parse_int_field(value):
    '''Convert one raw field to int, honouring an optional hex prefix

    Args:
      value: str | int - raw field, e.g. '0x0800', '0X10', '2048' or 2048

    Return: int
      Parsed value; a leading '0x'/'0X' selects base 16, otherwise base 10
    '''
    # str() lets already-numeric cells through instead of failing on slicing
    text = str(value).strip()
    if text[:2].lower() == '0x':
        return int(text, base=16)
    return int(text)


def load_data(dataset_path, schema_path, feature_list=None):
    '''Load and process data from csv

    Args:
      dataset_path: str - path to dataset csv
      schema_path: str - path to schema json file (column name -> dtype,
        passed to pandas.read_csv as ``dtype``)
      feature_list: list[str] | None - columns to return as features;
        defaults to the module-level FEATURE_LIST

    Return: (pd.DataFrame, pd.Series)
      Tuple containing feature table, "label" field series

    Raises:
      ValueError: a field in one of the hex-capable columns is not a valid
        decimal or 0x-prefixed hexadecimal integer
      KeyError: a required column is missing from the csv
    '''
    # I/O - Read data and schema
    with open(schema_path, 'r') as fis:
        dtypes = json.load(fis)
    dataset = pd.read_csv(dataset_path, dtype=dtypes)

    # Process - these columns may hold either decimal or 0x-prefixed hex text
    for column in ('eth_type', 'ip_flags', 'tcp_flags'):
        dataset[column] = dataset[column].map(_parse_int_field).astype('uint64')

    # Split - Features / Label
    columns = list(FEATURE_LIST if feature_list is None else feature_list)
    return dataset[columns], dataset['label']
    

def split_data(dataframe, labels, train_percentage, random_state=None):
    '''Randomly partition dataset into training data and testing data

    Args:
      dataframe: array-like - feature rows of the dataset
      labels: array-like - class of each row, same length as dataframe
      train_percentage: float - share of rows used for training. Values in
        (0, 1) are taken as a fraction, values in [1, 100) as a percentage
        (e.g. both 0.75 and 75 mean "75% of the rows")
      random_state: int | None - seed for the shuffle; pass an int for a
        reproducible split

    Return: (train_data, test_data, train_labels, test_labels)
      Tuple with the shuffled partitions, in the order returned by
      sklearn.model_selection.train_test_split

    Raises:
      ValueError: train_percentage is outside (0, 100)
    '''
    if not 0 < train_percentage < 100:
        raise ValueError(
            'train_percentage must be in (0, 100), got %r' % (train_percentage,)
        )
    # train_test_split expects a float train_size to be a fraction in (0, 1)
    if train_percentage >= 1:
        train_fraction = train_percentage / 100
    else:
        train_fraction = train_percentage
    return tuple(model_selection.train_test_split(
        dataframe, labels,
        train_size=train_fraction, random_state=random_state,
        shuffle=True))


def train_data(dataframe, cat):
    '''Fit a random forest classifier on the given samples

    The forest is configured from the module-level constants RF_NUM_TREES,
    RF_TREE_DEPTH and PAR_JOBS, with a fixed random seed of 0.

    Args:
      dataframe: array-like - feature rows to train on
      cat: array-like - class of each row

    Return: RandomForestClassifier
      The fitted model
    '''
    forest_params = {
        'n_estimators': RF_NUM_TREES,
        'max_depth': RF_TREE_DEPTH,
        'n_jobs': PAR_JOBS,
        'random_state': 0,
    }
    forest = RandomForestClassifier(**forest_params)
    forest.fit(dataframe, cat)
    return forest


def eval_model(model, test_dataset, test_class):
    '''Prints evaluation of model by testing on dataset

    Prints a text rendering of every tree in the forest, followed by the
    classification accuracy (share of test rows predicted correctly). Class
    labels are nominal, so a distance between them (e.g. mean absolute
    error) is not a meaningful score and is not reported.

    Args:
      model: RandomForestClassifier - The model to test
      test_dataset: array-like - the test feature rows
      test_class: array-like - expected class of each test row

    Raises:
      ValueError: the number of predictions differs from the number of
        expected classes
    '''
    predictions = np.asarray(model.predict(test_dataset))
    expected = np.asarray(test_class)
    if predictions.shape != expected.shape:
        raise ValueError(
            'prediction shape %r does not match test_class shape %r'
            % (predictions.shape, expected.shape)
        )
    accuracy = round(float(np.mean(predictions == expected)), 4)

    for i, tree_in_forest in enumerate(model.estimators_):
        print("TREE ", i)
        print(tree.export_text(tree_in_forest))

    print('\nAccuracy', accuracy)


def save_data(model_path, model_json):
    '''Write a serialized model to a file, replacing any previous content

    Args:
      model_path: str - destination file path
      model_json: str - serialized model text, written as-is
    '''
    with open(model_path, 'w+') as out_file:
        out_file.write(model_json)


def model2json(model, feature_list, class_list=None):
    '''From decision tree to json string

    Each tree becomes a nested list: an internal node is
    [feature_index, threshold, left_subtree, right_subtree] and a leaf is a
    single int holding the predicted class.

    Args:
      model: RandomForestClassifier - the model (iterated to get its trees)
      feature_list: list[str] - the feature_list
      class_list: list[str] | None - the classification_list; defaults to
        the module-level CLASS_LIST

    Return: str
      json string of model
    '''
    # A tree leaf stores per-class weights ordered like model.classes_, so the
    # argmax is a position in classes_, not necessarily the label itself
    # (they differ when the training labels are not exactly 0..k-1).
    forest_classes = getattr(model, 'classes_', None)

    def _leaf_class(tree_obj, idx):
        position = int(np.argmax(tree_obj.value[idx][0]))
        if forest_classes is None:
            return position
        try:
            return int(forest_classes[position])
        except (TypeError, ValueError):
            # Non-numeric labels: fall back to the positional index
            return position

    def _tree2list(tree_obj, idx):
        lidx = tree_obj.children_left[idx]
        ridx = tree_obj.children_right[idx]
        # sklearn marks the children of a leaf with -1; index 0 is the root
        # and can never be a child, so any non-positive child means "leaf"
        if lidx <= 0 or ridx <= 0:
            return _leaf_class(tree_obj, idx)
        return [
            int(tree_obj.feature[idx]),
            # For integer-valued features "x <= t" equals "x <= floor(t)"
            int(np.floor(tree_obj.threshold[idx])),
            _tree2list(tree_obj, lidx),
            _tree2list(tree_obj, ridx),
        ]

    return json.dumps({
        'feature_list': list(feature_list),
        'class_list': list(CLASS_LIST if class_list is None else class_list),
        'tree': [_tree2list(x.tree_, 0) for x in model]
    }, indent=4)

In [4]:
# Load Data

dataset, label = load_data(
    dataset_path=DATA_IN_PATH,
    schema_path=SCHEMA_IN_PATH,
)

In [5]:
# Manual Data Validation: Additional Dependencies Needed: Eyes >= 1.0

# Column dtypes are printed first; the frame itself is the cell's displayed value
print('# Dataset Types:', dataset.dtypes, sep='\n')
print('\n# Dataset:')
dataset

# Dataset Types:
frame_len      uint64
eth_type       uint64
ip_proto       uint64
ip_flags       uint64
ipv6_nxt       uint64
ipv6_opt       uint64
tcp_srcport    uint64
tcp_dstport    uint64
tcp_flags      uint64
udp_srcport    uint64
udp_dstport    uint64
dtype: object

# Dataset:


Unnamed: 0,frame_len,eth_type,ip_proto,ip_flags,ipv6_nxt,ipv6_opt,tcp_srcport,tcp_dstport,tcp_flags,udp_srcport,udp_dstport
0,74,2048,6,2,18446744073709551615,18446744073709551615,4425,49153,2,18446744073709551615,18446744073709551615
1,66,2048,6,2,18446744073709551615,18446744073709551615,4425,49153,16,18446744073709551615,18446744073709551615
2,262,2048,6,2,18446744073709551615,18446744073709551615,4425,49153,24,18446744073709551615,18446744073709551615
3,66,2048,6,2,18446744073709551615,18446744073709551615,4425,49153,16,18446744073709551615,18446744073709551615
4,66,2048,6,2,18446744073709551615,18446744073709551615,4425,49153,17,18446744073709551615,18446744073709551615
...,...,...,...,...,...,...,...,...,...,...,...
947067,66,2048,6,2,18446744073709551615,18446744073709551615,443,46330,16,18446744073709551615,18446744073709551615
947068,54,2048,6,2,18446744073709551615,18446744073709551615,443,40184,16,18446744073709551615,18446744073709551615
947069,60,2054,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615
947070,42,2054,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615


In [6]:
# Train Data

# Keep the column names: the arrays handed to the model carry no labels
feature_list = list(dataset.columns)

# Seeded, shuffled 75% / 25% train/test split
(train_dataset, test_dataset,
 train_class, test_class) = model_selection.train_test_split(
    np.array(dataset),
    np.array(label),
    test_size=0.25,
    shuffle=True,
    random_state=50,
)

model = train_data(train_dataset, train_class)


In [9]:
# Eval Trained Model
eval_model(
    model=model,
    test_dataset=test_dataset,
    test_class=test_class,
)

TREE  0
|--- feature_0 <= 155.50
|   |--- feature_6 <= 3067.50
|   |   |--- class: 4.0
|   |--- feature_6 >  3067.50
|   |   |--- feature_0 <= 65.50
|   |   |   |--- class: 4.0
|   |   |--- feature_0 >  65.50
|   |   |   |--- class: 4.0
|--- feature_0 >  155.50
|   |--- feature_6 <= 34568.50
|   |   |--- feature_6 <= 2970.50
|   |   |   |--- class: 4.0
|   |   |--- feature_6 >  2970.50
|   |   |   |--- class: 0.0
|   |--- feature_6 >  34568.50
|   |   |--- feature_8 <= 24.50
|   |   |   |--- class: 3.0
|   |   |--- feature_8 >  24.50
|   |   |   |--- class: 3.0

TREE  1
|--- feature_7 <= 729.00
|   |--- feature_0 <= 155.50
|   |   |--- feature_8 <= 16.50
|   |   |   |--- class: 4.0
|   |   |--- feature_8 >  16.50
|   |   |   |--- class: 4.0
|   |--- feature_0 >  155.50
|   |   |--- feature_0 <= 246.50
|   |   |   |--- class: 3.0
|   |   |--- feature_0 >  246.50
|   |   |   |--- class: 3.0
|--- feature_7 >  729.00
|   |--- feature_10 <= 4511.50
|   |   |--- feature_9 <= 56381.50
|   |  

In [10]:
# Save Data

save_data(
    model_path=DATA_OUT_PATH,
    model_json=model2json(model=model, feature_list=feature_list),
)