In [24]:
# Input corpus: a single row with one column holding the whole sentence.
sentences = [
    ["this is a sentence in line1 with each word count is exactly once"],
]

In [10]:
from pyspark.sql import SparkSession
# Obtain a Spark session through the builder's getOrCreate() entry point.
spark = SparkSession.builder.getOrCreate()

In [25]:
# Build a one-column DataFrame, naming the column up front via the schema list.
df = spark.createDataFrame(sentences, ["text_column"])

In [26]:
# Display the raw input; truncate=False prints the full sentence rather than a shortened cell.
df.show(truncate=False)

+----------------------------------------------------------------+
|text_column                                                     |
+----------------------------------------------------------------+
|this is a sentence in line1 with each word count is exactly once|
+----------------------------------------------------------------+



In [43]:
from pyspark.sql.functions import size, split, explode, lit

In [47]:
# Tokenize each sentence into an array of words. The split pattern is a
# regular expression, so splitting on runs of whitespace (instead of a single
# literal space) keeps repeated spaces or tabs from producing empty-string
# "words" that would otherwise be counted downstream.
df1 = df.select(split("text_column", r"\s+").alias("words_column"))

In [48]:
# Sanity check: number of tokens produced for each row.
df1.select(size("words_column")).show()

+------------------+
|size(words_column)|
+------------------+
|                13|
+------------------+



In [54]:
# Flatten the word arrays: one output row per array element, in a column named "word".
df2 = df1.select(
    explode(df1["words_column"]).alias("word")
)

In [55]:
# Tag every word occurrence with a constant 1 so occurrences can be summed per word.
df3 = df2.select("*", lit(1).alias("word_count"))

In [56]:
# Inspect the (word, 1) pairs before aggregation.
df3.show()

+--------+----------+
|    word|word_count|
+--------+----------+
|    this|         1|
|      is|         1|
|       a|         1|
|sentence|         1|
|      in|         1|
|   line1|         1|
|    with|         1|
|    each|         1|
|    word|         1|
|   count|         1|
|      is|         1|
| exactly|         1|
|    once|         1|
+--------+----------+



In [60]:
# Total the per-occurrence counts for each distinct word.
# The aggregate column keeps the generated name "sum(word_count)".
df4 = df3.groupBy("word").sum("word_count")

In [62]:
# Replace the generated aggregate column name with a readable one.
df4 = df4.withColumnRenamed("sum(word_count)", "final_word_count")

In [63]:
# Show the final count per distinct word; no ordering is applied here.
df4.show()

+--------+----------------+
|    word|final_word_count|
+--------+----------------+
|   line1|               1|
| exactly|               1|
|      in|               1|
|    with|               1|
|   count|               1|
|      is|               2|
|    each|               1|
|sentence|               1|
|    word|               1|
|    once|               1|
|       a|               1|
|    this|               1|
+--------+----------------+



In [65]:
from pyspark.sql.functions import create_map

In [67]:
# Pair each word with its count as a map column: one single-entry map per row,
# not one combined map for the whole DataFrame.
df4.select(
    create_map(df4["word"], df4["final_word_count"])
).show()

+---------------------------+
|map(word, final_word_count)|
+---------------------------+
|               [line1 -> 1]|
|             [exactly -> 1]|
|                  [in -> 1]|
|                [with -> 1]|
|               [count -> 1]|
|                  [is -> 2]|
|                [each -> 1]|
|            [sentence -> 1]|
|                [word -> 1]|
|                [once -> 1]|
|                   [a -> 1]|
|                [this -> 1]|
+---------------------------+

