In [10]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, MapType, ArrayType
from operator import add

In [11]:
# Start (or reuse, if one already exists) a Spark session that runs on the
# 'local' master under the application name 'word_counter'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('word_counter')
    .getOrCreate()
)

### Dataframe way

 1. Create a DataFrame with a single column named 'word' that holds the input sentence
 2. Use F.split to turn the sentence into a list of words within that column
 3. Use F.explode to turn each item of the list into its own row
 4. Group by 'word' and aggregate with the 'count' function

 Punctuation is also stripped from the words with F.regexp_replace.

    

In [12]:
# Input: a one-element list holding the sentence whose words will be counted.
s = ["Spark is totally totally awesome!"]

In [13]:
# Build the word-count DataFrame from the input sentence `s`.
# Punctuation is stripped from every token *before* grouping, so variants such
# as "awesome" and "awesome!" are counted as one word instead of producing
# separate groups that later collapse into duplicate-looking rows.
words_df = (
    spark.createDataFrame([s], ['word'])
    # One row per space-separated token.
    .withColumn('word', F.explode(F.split(F.col('word'), ' ')))
    # Remove every run of the listed punctuation characters (a plain
    # character-class replace, so all occurrences go, not just the last one).
    .withColumn('word', F.regexp_replace(F.col('word'), r"[\!@#\$%&*\(\)_\-\+\=]+", ""))
    # Tokens that were pure punctuation, or that came from repeated spaces,
    # are empty at this point and must not be counted as words.
    .filter(F.col('word') != '')
    .groupBy('word').agg(F.count('word'))
)

# show() only prints and returns None, so it is called on its own line to keep
# words_df bound to the DataFrame rather than to None.
words_df.show()

+-------+-----------+
|   word|count(word)|
+-------+-----------+
|totally|          2|
|     is|          1|
|  Spark|          1|
|awesome|          1|
+-------+-----------+



### Dataframe + SQL

1. Create a DataFrame from the input sentence
2. Split and explode exactly as before
3. Register the result as a temporary view
4. Count and group using SQL


In [14]:
# Input sentence for the SQL variant, wrapped in a list so it becomes one row.
s = ["Spark is really really awesome!"]

# Split the sentence on spaces and explode the resulting list so that each
# word ends up in its own row of the 'word' column.
lines_df = (
    spark.createDataFrame([s], ['word'])
    .withColumn(
        'word',
        F.explode(F.split(F.col('word'), ' ')),
    )
)

# Expose the exploded words to Spark SQL under the view name 'lines'.
lines_df.createOrReplaceTempView('lines')

In [15]:
# Count the occurrences of each word by querying the 'lines' view with Spark SQL.
word_count_query = "select word, count(word) from lines group by word"
spark.sql(word_count_query).show()

+--------+-----------+
|    word|count(word)|
+--------+-----------+
|      is|          1|
|  really|          2|
|   Spark|          1|
|awesome!|          1|
+--------+-----------+



### RDD way

In [16]:
# Classic map/reduce word count using the lower-level RDD API:
# pair every token with 1, then sum the ones per token.
s = "Spark is really really awesome!"
(
    spark.sparkContext
    .parallelize(s.split())
    .map(lambda token: (token, 1))
    .reduceByKey(add)
    .collect()
)

[('Spark', 1), ('is', 1), ('really', 2), ('awesome!', 1)]