In [1]:
#import statements
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

In [2]:
#load the raw records from output.csv and swap every missing value for the
#sentinel -999, because the model cannot work with records that contain NaN
df = (
    pd.read_csv('output.csv')
    .fillna(value=-999)
)

In [3]:
#names of the columns the classifier is trained on and predicts from
features = [
    'respond_ipbytes',
    'protocol',
    'original_bytes',
    'original_ipbytes',
    'duration',
    'missed_bytes',
    'destination_port',
]

In [4]:
#split the dataframe into training and testing data
#the first N_TRAINING_ROWS rows become the training set; the testing set is the
#last N_TESTING_ROWS of the *remaining* rows, so the two sets can never overlap,
#even when output.csv holds fewer than N_TRAINING_ROWS + N_TESTING_ROWS records
N_TRAINING_ROWS = 100
N_TESTING_ROWS = 40000
training_data = df.head(N_TRAINING_ROWS)
testing_data = df.iloc[N_TRAINING_ROWS:].tail(N_TESTING_ROWS)

In [5]:
#save dataframes as csv files
#training_data.csv is labelled by hand in the next step, so it is written only when
#it does not exist yet; otherwise re-running the notebook would overwrite the labels
#(delete training_data.csv to regenerate it from the current dataframe)
if not Path('training_data.csv').exists():
    training_data[features].to_csv('training_data.csv')
testing_data.to_csv('testing_data.csv')

In [6]:
#manual step: open training_data.csv and add an 'rfc' label column (1 = anomalous, 0 = normal) before running the next cell

In [6]:
#read the labelled training records back from disk into a dataframe
training_df = pd.read_csv(filepath_or_buffer='training_data.csv')

In [7]:
#the classifier needs numeric input, so turn the protocol values into integer codes
#the encoder is fitted first and applied second, which equals a single fit_transform
lc = preprocessing.LabelEncoder()
lc.fit(training_df['protocol'])
training_df['protocol'] = lc.transform(training_df['protocol'])

In [8]:
#set up the (still untrained) random forest classifier
#random_state fixes the seed, n_jobs is the number of jobs run in parallel
rfc_settings = {'random_state': 0, 'n_jobs': 2}
clf = RandomForestClassifier(**rfc_settings)

In [9]:
#fit the classifier: the feature columns are the inputs, the 'rfc' column is the target
train_inputs = training_df[features]
train_labels = training_df['rfc']
clf.fit(train_inputs, train_labels)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [10]:
#read the records to classify back from disk into a dataframe
testing_df = pd.read_csv(filepath_or_buffer='testing_data.csv')

In [11]:
#encode the protocol column with the encoder fitted on the training data (lc), so
#every protocol keeps the integer code the classifier was trained on; fitting a
#fresh encoder on the testing data could give the same protocol a different code
#protocols that never occurred in the training data are mapped to -1 rather than
#raising, which is what lc.transform would do for unseen labels
#NOTE: this overwrites the column in place, so reload testing_df before re-running
protocol_codes = {label: code for code, label in enumerate(lc.classes_)}
testing_df['protocol'] = testing_df['protocol'].map(protocol_codes).fillna(-1).astype(int)

In [12]:
#classify every record of testing_df and keep the outcome in the RFC_Prediction column
predicted_classes = clf.predict(testing_df[features])
testing_df['RFC_Prediction'] = predicted_classes

In [18]:
#translate the numeric class predictions into human-readable labels
#replace() builds the relabelled column in one pass instead of writing strings into
#the numeric column in place (a dtype-incompatible assignment that newer pandas
#versions deprecate); values other than 1 and 0 are left untouched, as before
prediction_labels = {1: 'Anomalous', 0: 'Normal'}
testing_df['RFC_Prediction'] = testing_df['RFC_Prediction'].replace(prediction_labels)

In [13]:
#pair each feature name with its importance score to see what the forest relies on
[(feature_name, score) for feature_name, score in zip(features, clf.feature_importances_)]

[('respond_ipbytes', 0.0021786492374730117),
 ('protocol', 0.0005319000027995721),
 ('original_bytes', 0.11861879057646246),
 ('original_ipbytes', 0.3484419020133306),
 ('duration', 0.09869115996566974),
 ('missed_bytes', 0.4315375982042647),
 ('destination_port', 0.0)]

In [19]:
#write the predictions to disk, once as csv and once as line-delimited json
#the two leading columns are removed first
#NOTE(review): presumably these are index columns left over from earlier csv round trips - confirm
leading_columns = list(testing_df.columns[:2])
testing_df = testing_df.drop(columns=leading_columns)
testing_df.to_csv('prediction.csv')
testing_df.to_json('json-prediction.json', orient='records', lines=True)