In [None]:
# Generate a Spark Session

from pyspark.sql import SparkSession

# Build a new session, or reuse the one already active in this kernel, against
# the local[*] master. Both delimiters of the app-name literal must be double
# quotes: mixing " and ' leaves the string unterminated (SyntaxError).
spark = (
    SparkSession
    .builder
    .appName("Reading from Sockets")
    .master("local[*]")
    .getOrCreate()
)

# Bare last expression so the notebook displays the session object.
spark

In [None]:
# Read input data

# Batch processing equivalent, kept for reference:
# df_raw = spark.read.format("text").load("example.txt")

# Streaming version: a socket source listening on localhost:9999.
df_raw = (
    spark.readStream
    .format("socket")
    .options(host="localhost", port="9999")
    .load()
)

In [None]:
# Print the schema of the DataFrame produced by the read cell.
df_raw.printSchema()

#### Note: ".show()" is not used with a streaming source. Spark does not support
#### .show() on streaming DataFrames; results are emitted once the query is
#### started through writeStream (see the final cell).

In [None]:
# Split the line into words
from pyspark.sql.functions import split

# Add a "words" column holding the pieces of "value" separated by a single space.
df_words = df_raw.withColumn(
    "words",
    split(df_raw["value"], " "),
)

In [None]:
# .show() is a batch action: on a streaming DataFrame Spark raises
# AnalysisException ("Queries with streaming sources must be executed with
# writeStream.start()"). Preview rows only when the batch reader is in use;
# for the streaming source, inspect the schema instead.
if df_words.isStreaming:
    df_words.printSchema()
else:
    df_words.show()

In [None]:
# Explode the list of words
from pyspark.sql.functions import explode

# Emit one row per element of "words" as column "word", then drop the
# original "value" and "words" columns.
df_explode = (
    df_words
    .withColumn("word", explode("words"))
    .drop("value", "words")
)

In [None]:
# .show() is a batch action: on a streaming DataFrame Spark raises
# AnalysisException ("Queries with streaming sources must be executed with
# writeStream.start()"). Preview rows only when the batch reader is in use;
# for the streaming source, inspect the schema instead.
if df_explode.isStreaming:
    df_explode.printSchema()
else:
    df_explode.show()

In [None]:
# Aggregate the words to generate count
from pyspark.sql.functions import count, lit

# Group by word and count the rows in each group; the count column is named "cnt".
df_agg = (
    df_explode
    .groupBy("word")
    .agg(count(lit(1)).alias("cnt"))
)

In [None]:
# .show() is a batch action: on a streaming DataFrame Spark raises
# AnalysisException ("Queries with streaming sources must be executed with
# writeStream.start()"). Preview rows only when the batch reader is in use;
# for the streaming source, inspect the schema instead.
if df_agg.isStreaming:
    df_agg.printSchema()
else:
    df_agg.show()

In [None]:
# Write the output to console streaming

# Start a query that sends df_agg to the console sink in "complete" output
# mode, then block this cell on awaitTermination().
(
    df_agg.writeStream
    .format("console")
    .outputMode("complete")
    .start()
    .awaitTermination()
)