In [8]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("Statistics").getOrCreate()
sc = spark.sparkContext

In [9]:
import numpy as np
def parse_interaction(line):
    line_split = line.split(",")
    symbolic_indexes = [1,2,3,41]
    clean_line_split = [item for i, item in enumerate(line_split) if i not in symbolic_indexes]
    return np.array([float(x) for x in clean_line_split])

raw_data = sc.textFile("kddcup.data_10_percent.gz")
vector_data = raw_data.map(parse_interaction)
vector_data.take(1)

[array([0.00e+00, 1.81e+02, 5.45e+03, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 8.00e+00, 8.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 9.00e+00, 9.00e+00,
        1.00e+00, 0.00e+00, 1.10e-01, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00])]

In [10]:
def printStatistics(summary):
    print("Duration Statistics")
    print(" Mean: {}".format(round(summary.mean()[0], 3)))
    print(" St. deviation: {}".format(round(sqrt(summary.variance()[0]), 3)))
    print(" Max value: {}".format(round(summary.max()[0], 3)))
    print(" Min value: {}".format(round(summary.min()[0], 3)))
    print(" Total value count: {}".format(summary.count()))
    print(" Number of non-zero values: {}".format(summary.numNonzeros()[0]))

In [11]:
from pyspark.mllib.stat import Statistics
from math import sqrt

summary = Statistics.colStats(vector_data)
printStatistics(summary)

Duration Statistics
 Mean: 47.979
 St. deviation: 707.746
 Max value: 58329.0
 Min value: 0.0
 Total value count: 494021
 Number of non-zero values: 12350.0


In [12]:
def parse_interaction_with_key(line):
    line_split = line.split(",")
    symbolic_indexes = [1,2,3,41]
    clean_line_split = [item for i, item in enumerate(line_split) if i not in symbolic_indexes]
    return (line_split[-1], np.array([float(x) for x in clean_line_split]))

label_vector_data = raw_data.map(parse_interaction_with_key)
label_vector_data.take(1)

[('normal.', array([0.00e+00, 1.81e+02, 5.45e+03, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 8.00e+00, 8.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 9.00e+00, 9.00e+00,
         1.00e+00, 0.00e+00, 1.10e-01, 0.00e+00, 0.00e+00, 0.00e+00,
         0.00e+00, 0.00e+00]))]

In [13]:
normal_label_data = label_vector_data.filter(lambda r: r[0] == "normal.").values()
normal_summary = Statistics.colStats(normal_label_data)
printStatistics(normal_summary)

Duration Statistics
 Mean: 216.657
 St. deviation: 1359.213
 Max value: 58329.0
 Min value: 0.0
 Total value count: 97278
 Number of non-zero values: 11690.0


In [14]:
def summary_by_label(raw_data, label):
    label_vector_data = raw_data.map(parse_interaction_with_key).filter(lambda r: r[0] == label).map(lambda r: r[1])
    label_summary = Statistics.colStats(label_vector_data)
    return label_summary

label_summary = summary_by_label(raw_data, "normal.")
printStatistics(label_summary)

Duration Statistics
 Mean: 216.657
 St. deviation: 1359.213
 Max value: 58329.0
 Min value: 0.0
 Total value count: 97278
 Number of non-zero values: 11690.0


In [15]:
label_list = ["back.", "normal.", "imap.", "ftp_write."]

stats_by_label = [(label, summary_by_label(raw_data, label)) for label in label_list]
print(stats_by_label)

[('back.', <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary object at 0x00000000060E3EB8>), ('normal.', <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary object at 0x00000000060E3BA8>), ('imap.', <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary object at 0x00000000060EE978>), ('ftp_write.', <pyspark.mllib.stat._statistics.MultivariateStatisticalSummary object at 0x00000000060EE470>)]


In [16]:
duration_by_label = [
    (label, float(summary.mean()[0]), sqrt(summary.variance()[0]), float(summary.max()[0]), float(summary.min()[0]), summary.count())
    for label, summary in stats_by_label
]
print(duration_by_label)

[('back.', 0.1289151157512483, 1.1100621667887005, 14.0, 0.0, 2203), ('normal.', 216.65732231336938, 1359.213468917662, 58329.0, 0.0, 97278), ('imap.', 6.0, 14.174240399721281, 41.0, 0.0, 12), ('ftp_write.', 32.375, 47.44903280664121, 134.0, 0.0, 8)]


In [17]:
duration_by_label_rdd = sc.parallelize(duration_by_label)
duration_by_label_df = spark.createDataFrame(duration_by_label_rdd,
                                            ["label", "Mean", "Stdev","Min", "Max", "Count"])
duration_by_label_df.show()

+----------+------------------+------------------+-------+---+-----+
|     label|              Mean|             Stdev|    Min|Max|Count|
+----------+------------------+------------------+-------+---+-----+
|     back.|0.1289151157512483|1.1100621667887005|   14.0|0.0| 2203|
|   normal.|216.65732231336938| 1359.213468917662|58329.0|0.0|97278|
|     imap.|               6.0|14.174240399721281|   41.0|0.0|   12|
|ftp_write.|            32.375| 47.44903280664121|  134.0|0.0|    8|
+----------+------------------+------------------+-------+---+-----+

