In [1]:
####
#
# Random Forest Model - All files
# Report Cohen's Kappa in addition to Accuracy: the classes are heavily imbalanced
# (selection bias in the dataset), which makes Accuracy alone misleading.
# 'capture20110816-2.binetflow' shows this clearly: Accuracy is 96.04% while Kappa is only 6.87%.
#
####

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time

# Module-level slot for the working DataFrame. It starts as an empty-string
# placeholder; predict_file() rebinds it to the loaded data and
# CategoryToDouble() rewrites it in place, both through `global binet`.
binet = ''

def CategoryToDouble(col_stringToDouble):
    """Swap each listed column of the global ``binet`` for an indexed version.

    For every column name in ``col_stringToDouble`` a ``StringIndexer`` is
    fitted on the current ``binet`` DataFrame, its output is written to a new
    column called ``<name>_indexed``, and the original column is dropped.
    The result is stored back into the module-level ``binet``; nothing is
    returned.
    """
    global binet
    for column_name in col_stringToDouble:
        indexed_name = column_name + "_indexed"
        string_indexer = StringIndexer(inputCol=column_name, outputCol=indexed_name)
        # Fit and apply against the same frame, then discard the source column.
        indexed_frame = string_indexer.fit(binet).transform(binet)
        binet = indexed_frame.drop(column_name)


def _cohen_kappa(tp, tn, fp, fn):
    """Return Cohen's kappa for a 2x2 confusion matrix given as four counts.

    Returns ``float('nan')`` when kappa is undefined: either the matrix is
    empty, or the expected agreement is exactly 1 (every label and every
    prediction fall into a single class), which would divide by zero.
    """
    total_sum = tp + fp + tn + fn
    if total_sum == 0:
        return float("nan")
    marg0 = (tp + fp) * (tp + fn) / total_sum
    marg1 = (tn + fp) * (tn + fn) / total_sum
    po = (tp + tn) / total_sum
    pe = (marg0 + marg1) / total_sum
    if pe == 1:
        return float("nan")
    return (po - pe) / (1 - pe)


def predict_file(file_name):
    """Train and evaluate a random forest on one capture file and print the results.

    The file is read as CSV from ``/input/p/<file_name>`` through the
    notebook's ``sqlContext``. Rows whose ``Label`` contains "Normal" become
    class 1.0 and rows whose ``Label`` contains "Botnet" become class 0.0;
    those two classes are split 61.8% / 38.2% (seed 20) into training and
    evaluation sets. "Background" rows (class 2.0) are kept only while the
    categorical columns are indexed, so the index mapping is fitted on the
    same rows as before, and are not used for training.

    In the printed confusion matrix the "positive" class is Normal (label 1),
    so ``tp`` is the number of Normal rows predicted as Normal.

    Side effects: rebinds the module-level ``binet`` DataFrame and prints the
    class balance, confusion-matrix counts, accuracy and Cohen's kappa.
    Returns ``None``.
    """
    print(file_name)
    global binet
    binet = sqlContext.read.format('csv').load('/input/p/'+file_name, header = True)
    binet = binet.drop("StartTime").drop("SrcAddr").drop("Sport").drop("DstAddr").drop("sTos").drop("dTos").drop("State")
    # The ratio columns are derived from the raw columns before they are cast;
    # TotBytes is only needed for those ratios and is dropped afterwards.
    binet = binet.withColumn('BytesPerSec', binet.TotBytes/binet.Dur).\
        withColumn("SrcByTotalBytes", binet.SrcBytes/binet.TotBytes).\
        withColumn("Dur", binet.Dur.cast('float')).\
        withColumn("Dport", binet.Dport.cast('int')).\
        withColumn("TotPkts", binet.TotPkts.cast('float')).\
        withColumn("SrcBytes", binet.SrcBytes.cast('float')).\
        drop("TotBytes")

    binet = binet.dropna(subset=["Dport"])
    binet = binet.fillna(0, subset=["BytesPerSec"])

    # Collapse the free-text labels into three numeric classes.
    binet_background = binet.where(binet.Label.like("%Background%")).withColumn("Label", lit(2.0))
    binet_normal = binet.where(binet.Label.like("%Normal%")).withColumn("Label", lit(1.0))
    binet_botnet = binet.where(binet.Label.like("%Botnet%")).withColumn("Label", lit(0.0))

    binet = binet_normal.union(binet_botnet)
    binet = binet.union(binet_background)

    # CategoryToDouble works on (and rebinds) the global `binet`.
    col_stringToDouble = ["Proto","Dir"]
    CategoryToDouble(col_stringToDouble)

    binet_normal = binet.where(binet.Label == lit(1.0))
    binet_botnet = binet.where(binet.Label == lit(0.0))
    binet_train = binet_normal.union(binet_botnet)

    normal = binet_normal.count()
    botnet = binet_botnet.count()
    # union() keeps duplicates, so the total is just the sum of both parts;
    # no third count job is needed.
    total = normal + botnet
    if total == 0:
        # Nothing to train on; the percentages below would divide by zero.
        print("\tNo Normal or Botnet rows found - skipping file")
        return

    print("\t------- Train Data -------")
    print("\tNormal -> " + str(normal) + " - " + "{0:.2f}%".format(normal/total*100))
    print("\tBotnet -> " + str(botnet) + " - " + "{0:.2f}%".format(botnet/total*100))
    print("\tTotal -> " + str(total))

    colnames = [name for name in binet_train.columns if name != "Label"]

    vecAssembler = VectorAssembler(inputCols=colnames, outputCol="features")
    rf = RandomForestClassifier(labelCol="Label", featuresCol="features", numTrees=100, maxDepth=4)
    # Assembling inside the pipeline applies the same feature step to both splits.
    pipeline = Pipeline(stages=[vecAssembler, rf])

    (trainingData, testData) = binet_train.randomSplit([0.618, 0.382], seed=20)
    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)

    evaluator = MulticlassClassificationEvaluator(labelCol="Label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    print("\t------- Model -------")

    # One aggregation yields the whole confusion matrix, using only columns
    # of `predictions` (no cross-DataFrame column references).
    cells = {}
    for row in predictions.groupBy("Label", "prediction").count().collect():
        cells[(int(row["Label"]), int(row["prediction"]))] = row["count"]

    # Keys are (actual label, predicted label); label 1 = Normal, 0 = Botnet.
    tp = cells.get((1, 1), 0)
    tn = cells.get((0, 0), 0)
    fp = cells.get((0, 1), 0)
    fn = cells.get((1, 0), 0)

    kappa = _cohen_kappa(tp, tn, fp, fn)

    print("\ttp -> "+str(tp))
    print("\ttn -> "+str(tn))
    print("\tfp -> "+str(fp))
    print("\tfn -> "+str(fn))

    print("\tAccuracy = " + "{0:.2f}%\n".format(accuracy*100))
    print("\tKappa = " + "{0:.2f}%\n".format(kappa*100))

# Capture files to process; each one is trained and evaluated independently.
file_names = [
    "capture20110811.binetflow",
    "capture20110815-2.binetflow",
    "capture20110815.binetflow",
    "capture20110816-2.binetflow",
    "capture20110816.binetflow",
]
for capture_name in file_names:
    predict_file(capture_name)

capture20110811.binetflow
	------- Train Data -------
	Normal -> 9088 - 30.26%
	Botnet -> 20941 - 69.74%
	Total -> 30029
	------- Model -------
	tp -> 3068
	tn -> 7929
	fp -> 190
	fn -> 410
	Accuracy = 94.83%

	Kappa = 87.45%

capture20110815-2.binetflow
	------- Train Data -------
	Normal -> 4676 - 83.84%
	Botnet -> 901 - 16.16%
	Total -> 5577
	------- Model -------
	tp -> 1771
	tn -> 301
	fp -> 64
	fn -> 7
	Accuracy = 96.69%

	Kappa = 87.50%

capture20110815.binetflow
	------- Train Data -------
	Normal -> 25237 - 95.18%
	Botnet -> 1277 - 4.82%
	Total -> 26514
	------- Model -------
	tp -> 9737
	tn -> 452
	fp -> 48
	fn -> 2
	Accuracy = 99.51%

	Kappa = 94.50%

capture20110816-2.binetflow
	------- Train Data -------
	Normal -> 1676 - 96.38%
	Botnet -> 63 - 3.62%
	Total -> 1739
	------- Model -------
	tp -> 630
	tn -> 1
	fp -> 26
	fn -> 0
	Accuracy = 96.04%

	Kappa = 6.87%

capture20110816.binetflow
	------- Train Data -------
	Normal -> 7480 - 61.77%
	Botnet -> 4630 - 38.23%
	Total ->