In [23]:
from pyspark.sql import SparkSession

# Create the notebook's SparkSession, or reuse one that is already active.
# "local[*]" runs Spark in-process with one worker thread per available core.
spark = SparkSession.builder.master("local[*]").appName("spark  streaming").getOrCreate()

In [24]:
# Bare expression: the notebook renders the SparkSession's repr as the cell output.
spark

In [25]:
# Read the example input file as a batch DataFrame using the plain-text source.
df_raw = (
    spark.read
    .format("text")
    .load("/home/jupyter/spark-streaming/data/input/example.txt")
)


In [26]:
# Print the schema tree; the text source exposes a single nullable string column, `value`.
df_raw.printSchema()

root
 |-- value: string (nullable = true)



In [27]:
# Preview the rows; show() truncates long cell values, hence the trailing "..." in the output.
df_raw.show()

+--------------------+
|               value|
+--------------------+
|simone had a dog ...|
+--------------------+



In [30]:
# Tokenise each line: add a `words` array column by splitting `value` on a single space.
from pyspark.sql.functions import split

df_words = df_raw.withColumn(
    "words",
    split("value", " "),
)

In [31]:
# Preview the new `words` array column next to the original `value` column.
df_words.show()

+--------------------+--------------------+
|               value|               words|
+--------------------+--------------------+
|simone had a dog ...|[simone, had, a, ...|
+--------------------+--------------------+



In [32]:
# Explode the list of words: emit one row per array element in a new `word` column.
from pyspark.sql.functions import explode

df_explode = df_words.withColumn(
    "word",
    explode("words"),
)

In [33]:
# Preview the exploded frame: one row per word, with `value` and `words` repeated on every row.
df_explode.show()

+--------------------+--------------------+------+
|               value|               words|  word|
+--------------------+--------------------+------+
|simone had a dog ...|[simone, had, a, ...|simone|
|simone had a dog ...|[simone, had, a, ...|   had|
|simone had a dog ...|[simone, had, a, ...|     a|
|simone had a dog ...|[simone, had, a, ...|   dog|
|simone had a dog ...|[simone, had, a, ...|   and|
|simone had a dog ...|[simone, had, a, ...|     a|
|simone had a dog ...|[simone, had, a, ...|   cat|
|simone had a dog ...|[simone, had, a, ...|      |
|simone had a dog ...|[simone, had, a, ...|   the|
|simone had a dog ...|[simone, had, a, ...|   dog|
|simone had a dog ...|[simone, had, a, ...|   and|
|simone had a dog ...|[simone, had, a, ...|   cat|
|simone had a dog ...|[simone, had, a, ...|  used|
|simone had a dog ...|[simone, had, a, ...|    to|
|simone had a dog ...|[simone, had, a, ...|  love|
|simone had a dog ...|[simone, had, a, ...| simon|
+--------------------+--------------------+------+

In [34]:
# Explode the word arrays and discard the helper columns so only `word` remains.
df_explode = (
    df_words
    .withColumn("word", explode("words"))
    .drop("value", "words")
)

df_explode.show()

+------+
|  word|
+------+
|simone|
|   had|
|     a|
|   dog|
|   and|
|     a|
|   cat|
|      |
|   the|
|   dog|
|   and|
|   cat|
|  used|
|    to|
|  love|
| simon|
+------+



In [36]:
# Count how many times each distinct word occurs; the tally is exposed as `cnt`.
from pyspark.sql.functions import count, lit

df_agg = (
    df_explode
    .groupBy("word")
    .agg(count(lit(1)).alias("cnt"))
)
df_agg.show()

+------+---+
|  word|cnt|
+------+---+
|  used|  1|
| simon|  1|
|simone|  1|
|   dog|  2|
|  love|  1|
|   had|  1|
|   cat|  2|
|   the|  1|
|   and|  2|
|     a|  2|
|      |  1|
|    to|  1|
+------+---+



# Streaming

In [37]:
# Open a streaming DataFrame over the TCP socket source at localhost:9999.
# NOTE: a text server must be listening on that port while the query runs
# (for example `nc -lk 9999`).
df_raw = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", "9999")
    .load()
)

In [38]:
# Print the streaming DataFrame's schema: a single nullable string column, `value`.
df_raw.printSchema()

root
 |-- value: string (nullable = true)



In [39]:
# Tokenise each incoming line: add a `words` array column by splitting `value` on a single space.
from pyspark.sql.functions import split

df_words = df_raw.withColumn(
    "words",
    split("value", " "),
)


In [40]:
# Explode the list of words: emit one row per array element in a new `word` column.
from pyspark.sql.functions import explode

df_explode = df_words.withColumn(
    "word",
    explode("words"),
)

In [41]:
# Flatten the word arrays into one row per word and keep only the `word` column.
# Blank tokens are discarded: splitting on a single space emits "" for
# consecutive spaces and for empty input lines, and the aggregation would
# otherwise report "" as a word with its own count.
df_explode = (
    df_words
    .withColumn("word", explode("words"))
    .drop("value", "words")
    .filter("word != ''")
)



In [43]:
# Keep a running count of each distinct word; the tally is exposed as `cnt`.
from pyspark.sql.functions import count, lit

df_agg = (
    df_explode
    .groupBy("word")
    .agg(count(lit(1)).alias("cnt"))
)


In [46]:
# Write the output to the console as a stream.
# "complete" mode re-emits the whole aggregated table on every trigger.
query = (
    df_agg.writeStream
    .format("console")
    .outputMode("complete")
    .start()
)

# Block this cell until the streaming query stops or fails.
query.awaitTermination()