In [1]:
from __future__ import print_function

import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

In [2]:
# Address of the socket server that feeds text lines to the stream:
# `host` is its IP address and `port` is the TCP port it listens on.
host, port = "52.66.45.236", 8080

In [3]:
# Obtain the SparkSession for this notebook, reusing an existing one if present,
# under the application name "StructuredNetworkWordCount".
spark = (
    SparkSession.builder
    .appName("StructuredNetworkWordCount")
    .getOrCreate()
)

# Set the number of shuffle partitions to 1 (a small value for this example).
spark.conf.set("spark.sql.shuffle.partitions", 1)

In [4]:
# Streaming DataFrame of the text lines received over the socket at host:port.
lines = (
    spark.readStream
    .format('socket')
    .option('host', host)
    .option('port', port)
    .load()
)

# Show the schema of the streaming source.
lines.printSchema()

In [5]:
# Break each incoming line into words: split() turns the `value` column into an
# array using ' ' as the separator, and explode() emits one row per array item.
# The resulting single column is named 'word'.
words = lines.select(explode(split(lines.value, ' ')).alias('word'))

# Show the schema of the per-word stream.
words.printSchema()

In [6]:
# Running count of occurrences, grouped by the 'word' column.
wordCounts = (
    words
    .groupBy('word')
    .count()
)

# Show the schema of the aggregated stream.
wordCounts.printSchema()

In [7]:
# Start the running word count as a streaming query in 'complete' output mode,
# printing the counts to the console.
completeOutputQuery = (
    wordCounts
    .writeStream
    .outputMode('complete')
    .format('console')
    .start()
)

# awaitTermination() blocks this cell until the query terminates or the kernel is
# interrupted. The `finally` clause stops the query in either case, so an
# interrupted cell does not leave the stream running in the background where its
# console output would mix with any query started afterwards.
try:
    completeOutputQuery.awaitTermination()
finally:
    completeOutputQuery.stop()

In [8]:
# Start the running word count as a streaming query in 'update' output mode,
# printing the counts to the console.
updateOutputQuery = (
    wordCounts
    .writeStream
    .outputMode('update')
    .format('console')
    .start()
)

# awaitTermination() blocks this cell until the query terminates or the kernel is
# interrupted. The `finally` clause stops the query in either case, so an
# interrupted cell does not leave the stream running in the background.
try:
    updateOutputQuery.awaitTermination()
finally:
    updateOutputQuery.stop()