In [1]:
from pyspark.sql import (
    functions,
    Row,
    SparkSession
)

In [2]:
# Obtain the Spark session for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("df_wordcount")
    .getOrCreate()
)

In [3]:
# Single-row demo frame with a scalar column, an array column and a map column.
# A Spark map column has exactly one value type, and that type is inferred from
# the data. Mixing a str and an int in the same dict therefore relies on a silent
# coercion, so the map values are kept uniformly as strings here.
df = spark.createDataFrame([
        Row(a=1,
            intlist=[1, 2, 3],
            mapfield={"a": "b", "d": "1"}
           )
])
df.show()

+---+---------+----------------+
|  a|  intlist|        mapfield|
+---+---------+----------------+
|  1|[1, 2, 3]|{d -> 1, a -> b}|
+---+---------+----------------+



In [5]:
# functions.explode(col)
# Emits one output row for every element of the given array (or entry of a map).
# Here each element of `intlist` becomes its own row in a column named "anInt".
df1 = df.select(
    functions.explode(df["intlist"]).alias("anInt")
)
df1.show()

+-----+
|anInt|
+-----+
|    1|
|    2|
|    3|
+-----+



In [6]:
# functions.split(str, pattern, limit=-1)
# Splits str around matches of the given pattern, producing an array column.
#
# Each stage is bound to its own name instead of rebinding `df`, so the frame
# that the explode example reads `intlist` from stays intact and that cell can
# still be re-run after this one.
sentence_df = spark.createDataFrame([
    Row(word="hello world and pyspark")])
sentence_df.show()

# One row holding the array of space-separated tokens.
tokens_df = sentence_df.select(
    functions.split(sentence_df.word, ' ').alias("word"))
tokens_df.show()

# Flatten the array: one row per token.
exploded_df = tokens_df.select(
    functions.explode(tokens_df.word).alias("words"))
exploded_df.show()

+--------------------+
|                word|
+--------------------+
|hello world and p...|
+--------------------+

+--------------------+
|                word|
+--------------------+
|[hello, world, an...|
+--------------------+

+-------+
|  words|
+-------+
|  hello|
|  world|
|    and|
|pyspark|
+-------+



In [8]:
# Load the sample text file into a DataFrame and print it without truncating
# the cell contents.
# NOTE(review): despite the "csv" in the variable name, the path points at a
# plain .txt file and is read with the text reader.
# NOTE(review): the absolute path assumes the file lives under
# /home/jovyan/work — confirm before running elsewhere.
csv_file_path = "file:///home/jovyan/work/lorem_ipsum.txt"
df = spark.read.text(paths=csv_file_path)
df.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [9]:
# Split every line of the text on single spaces and flatten the resulting
# arrays, so that each token ends up in its own row of a column named "word".
# Punctuation stays attached to the tokens (e.g. "industry." in the output).
words = df.select(
    functions.explode(functions.split(df["value"], ' ')).alias("word")
)
words.show()

+-----------+
|       word|
+-----------+
|      Lorem|
|      Ipsum|
|         is|
|     simply|
|      dummy|
|       text|
|         of|
|        the|
|   printing|
|        and|
|typesetting|
|  industry.|
|      Lorem|
|      Ipsum|
|        has|
|       been|
|        the|
| industry's|
|   standard|
|      dummy|
+-----------+
only showing top 20 rows



In [10]:
# Count how often each distinct token occurs and list the most frequent first.
# The count is the only sort key.
words_counts = (
    words
    .groupBy("word")
    .count()
    .orderBy(functions.desc("count"))
)
words_counts.show()

+----------+-----+
|      word|count|
+----------+-----+
|       the|    6|
|     Lorem|    4|
|        of|    4|
|       and|    3|
|     Ipsum|    3|
|      with|    2|
|        It|    2|
|     dummy|    2|
|      type|    2|
|      text|    2|
|         a|    2|
|       has|    2|
|publishing|    1|
|unchanged.|    1|
|    sheets|    1|
|   desktop|    1|
|       not|    1|
|     1960s|    1|
|  Letraset|    1|
|    Ipsum.|    1|
+----------+-----+
only showing top 20 rows

