In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.functions import desc, col, window

from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType
from pyspark.streaming import StreamingContext

import json
import time
import os

In [2]:
# Directory of JSON flow records. It is read twice below: once as a static
# DataFrame and once as a file stream.
inputPath = "/home/ds/notebooks/lanl/day03/json/"

# Count only regular, non-hidden files. A plain len(os.listdir(...)) also counts
# sub-directories (e.g. .ipynb_checkpoints) and hidden/marker files ('.xxx.crc',
# '_SUCCESS'). Spark's file sources are expected to skip those, so the stream
# would never reach the offset computed from the raw count and the polling loop
# that waits on numFileOffset would never finish.
numFiles = sum(
    1
    for entryName in os.listdir(inputPath)
    if not entryName.startswith((".", "_"))
    and os.path.isfile(os.path.join(inputPath, entryName))
)

# The stream is read one file per trigger (maxFilesPerTrigger=1) and the polling
# loop compares the source's zero-based 'logOffset' against this value, so the
# last expected offset is numFiles - 1.
numFileOffset = numFiles - 1
print(numFileOffset)

APP_NAME = "Web Server Hypothesis Test"
SPARK_URL = "local[*]"  # local mode, using all available cores

# Reuse an existing session if the kernel already has one; otherwise create it.
spark = SparkSession.builder.appName(APP_NAME).master(SPARK_URL).getOrCreate()

49


In [3]:
# Explicit schema for the flow records, required by the streaming reader.
# Columns are added in file order and every column is nullable.
# NOTE(review): schema inference on the same files (see the printSchema output)
# reports every column as string; confirm that the timestamp/long types
# declared here parse as intended for the columns that are not plain strings.
flowSchema = (
    StructType()
    .add('time', TimestampType(), True)
    .add('duration', LongType(), True)
    .add('srcdevice', StringType(), True)
    .add('dstdevice', StringType(), True)
    .add('protocol', LongType(), True)
    .add('srcport', StringType(), True)
    .add('dstport', StringType(), True)
    .add('srcpackets', LongType(), True)
    .add('dstpackets', LongType(), True)
    .add('srcbytes', LongType(), True)
    .add('dstbytes', LongType(), True)
)

In [4]:
# Batch (non-streaming) view of every JSON file under inputPath; the schema is
# inferred by Spark rather than taken from flowSchema.
staticInputDF = spark.read.format("json").load(inputPath)

In [5]:
# Show the schema Spark inferred for the static read (column names and types).
staticInputDF.printSchema()

root
 |-- dstbytes: string (nullable = true)
 |-- dstdevice: string (nullable = true)
 |-- dstpackets: string (nullable = true)
 |-- dstport: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- srcbytes: string (nullable = true)
 |-- srcdevice: string (nullable = true)
 |-- srcpackets: string (nullable = true)
 |-- srcport: string (nullable = true)
 |-- time: string (nullable = true)



In [6]:
# Top 10 destination devices by number of flows to ports 80/443, computed over
# the full static data set. This is the reference result for the streaming run.
(
    staticInputDF
    .filter(col('dstport').isin(80, 443))  # keep web traffic only
    .select('dstdevice')
    .groupBy('dstdevice')
    .count()
    .orderBy(col('count').desc())
    .show(10)
)

+-------------------+-----+
|          dstdevice|count|
+-------------------+-----+
|EnterpriseAppServer|14495|
|         Comp576843|14153|
|         Comp186884|12681|
|         Comp501516| 5859|
|         Comp393033| 3795|
|         Comp916004| 3332|
|         Comp498128| 2831|
|         Comp573929| 2555|
|         Comp611862| 2404|
|         Comp370444| 2385|
+-------------------+-----+
only showing top 10 rows



In [7]:
# Streaming view of the same directory. Streaming file sources need an explicit
# schema, and limiting each trigger to one file turns the directory listing
# into a sequence of micro-batches.
streamingInputDF = (
    spark.readStream
    .format("json")
    .schema(flowSchema)
    .option("maxFilesPerTrigger", 1)
    .load(inputPath)
)

In [8]:
# Same aggregation as the static query, expressed over the stream: flows to
# ports 80/443 counted per destination device, largest count first.
streamingCountsDF = (
    streamingInputDF
    .filter(col('dstport').isin(80, 443))
    .select('dstdevice')
    .groupBy('dstdevice')
    .count()
    .orderBy(col('count').desc())
)

# Sanity check: this should report a streaming DataFrame.
streamingCountsDF.isStreaming

True

In [9]:
spark.conf.set("spark.sql.shuffle.partitions", "2")  # keep the size of shuffles small

# Make the cell safe to re-run: starting a second query under a name that is
# still active is rejected by Spark, so stop any leftover "counts" query first.
# On a fresh kernel there are no active queries and this loop does nothing.
for activeQuery in spark.streams.active:
    if activeQuery.name == "counts":
        activeQuery.stop()

# Start the streaming aggregation and keep its result in an in-memory table
# that can be queried with spark.sql while the stream is running.
query = (
  streamingCountsDF
    .writeStream
    .format("memory")
    .queryName("counts")     # counts = name of the in-memory table
    .outputMode("complete")  # complete = all the counts should be in the table
    .start()
)

In [10]:
# Poll the running query and print the in-memory "counts" table once per second
# until the file source reports that the last expected file has been processed.
# We can eventually make this a bokeh plot too.
#
# Instead of sleeping a fixed time and indexing recentProgress[-1] (which raises
# IndexError when no progress has been recorded yet), wait on lastProgress,
# which is None until the first progress update is available. The loop also
# ends if the query stops, so a failed query cannot leave the cell spinning.
while query.isActive:
    progress = query.lastProgress
    if progress is None:
        # No progress recorded yet; nothing useful to show.
        time.sleep(1)
        continue

    endOffset = progress['sources'][0]['endOffset']
    if endOffset is not None and endOffset['logOffset'] >= numFileOffset:
        break  # every expected file has been consumed

    spark.sql("select * from counts").show()
    time.sleep(1)

# Surface a streaming failure instead of silently finishing the cell.
streamFailure = query.exception()
if streamFailure is not None:
    raise streamFailure

+-------------------+-----+
|          dstdevice|count|
+-------------------+-----+
|EnterpriseAppServer| 3002|
|         Comp576843| 2760|
|         Comp186884| 1974|
|         Comp501516| 1469|
|         Comp916004|  717|
|         Comp393033|  643|
|         Comp611862|  479|
|         Comp370444|  465|
|         Comp657655|  434|
|         Comp097048|  347|
|         Comp574103|  309|
|         Comp146745|  303|
|         Comp457448|  266|
|         Comp573929|  261|
|         Comp847595|  256|
|         Comp216677|  247|
|         Comp523500|  244|
|         Comp253298|  242|
|         Comp309567|  238|
|         Comp509586|  230|
+-------------------+-----+
only showing top 20 rows

+-------------------+-----+
|          dstdevice|count|
+-------------------+-----+
|EnterpriseAppServer| 4144|
|         Comp576843| 3877|
|         Comp186884| 2636|
|         Comp501516| 1814|
|         Comp498128| 1464|
|         Comp916004|  978|
|         Comp370444|  843|
|         Comp657655| 

+-------------------+-----+
|          dstdevice|count|
+-------------------+-----+
|EnterpriseAppServer|12997|
|         Comp576843|12935|
|         Comp186884|11756|
|         Comp501516| 5373|
|         Comp393033| 3114|
|         Comp916004| 3043|
|         Comp498128| 2819|
|         Comp573929| 2554|
|         Comp370444| 2241|
|         Comp611862| 2179|
|         Comp097048| 1863|
|         Comp847595| 1709|
|         Comp657655| 1590|
|         Comp574103| 1486|
|         Comp309567| 1462|
|         Comp216677| 1427|
|         Comp336938| 1397|
|         Comp509586| 1385|
|         Comp146745| 1343|
|         Comp162943| 1081|
+-------------------+-----+
only showing top 20 rows

+-------------------+-----+
|          dstdevice|count|
+-------------------+-----+
|EnterpriseAppServer|13982|
|         Comp576843|13818|
|         Comp186884|12443|
|         Comp501516| 5771|
|         Comp393033| 3628|
|         Comp916004| 3271|
|         Comp498128| 2831|
|         Comp573929| 

In [11]:
# Show the in-memory "counts" table once the polling loop has finished, for
# comparison with the static result computed earlier.
spark.sql("select * from counts").show()

+-------------------+-----+
|          dstdevice|count|
+-------------------+-----+
|EnterpriseAppServer|14495|
|         Comp576843|14153|
|         Comp186884|12681|
|         Comp501516| 5859|
|         Comp393033| 3795|
|         Comp916004| 3332|
|         Comp498128| 2831|
|         Comp573929| 2555|
|         Comp611862| 2404|
|         Comp370444| 2385|
|         Comp097048| 1991|
|         Comp847595| 1886|
|         Comp574103| 1629|
|         Comp657655| 1590|
|         Comp309567| 1576|
|         Comp216677| 1528|
|         Comp509586| 1516|
|         Comp336938| 1501|
|         Comp146745| 1451|
|         Comp457448| 1180|
+-------------------+-----+
only showing top 20 rows

