In [16]:
import findspark
findspark.init()
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from operator import add
from pyspark import StorageLevel


In [17]:
# Obtain the session used by every cell below; getOrCreate() hands back an
# already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('word_count')
    .getOrCreate()
)

# 1. Dataframe way


1. Create a DataFrame with a single column named 'word'.
2. Use F.split to turn the sentence into one column that holds the list of its words.
3. Use F.explode to turn each item of that list into its own row.
4. Group by 'word' and aggregate with the 'count' function.


In [42]:
# Input sentence, kept as a one-element list: the next cell wraps it in another
# list, so it becomes a single row with a single string column.
s = [
    "Spark is totally totally awesome!",
]

In [43]:

# Word count with the DataFrame API.
#
# Punctuation is stripped BEFORE grouping so that tokens such as "awesome!" and
# "awesome" are counted as the same word. Cleaning after the aggregation would
# leave several rows for the same cleaned word, each with a partial count.
words_df = (
    spark.createDataFrame([s], ['word'])
    # One row per whitespace-separated token.
    .withColumn('word', F.explode(F.split(F.col('word'), ' ')))
    # Drop a run of punctuation characters from the token.
    .withColumn('word', F.regexp_replace(F.col('word'), r"^(.*)[\!@#\$%&*\(\)_\-\+\=]+(.*)$", "$1$2"))
    .groupBy('word').agg(F.count('word'))
    # Keep the aggregated result around in case later cells reuse words_df.
    .persist(StorageLevel.MEMORY_AND_DISK)
)

# show() prints the table and returns None, so it must not be part of the
# assignment above; otherwise words_df would be None instead of a DataFrame.
words_df.show()

+-------+-----------+
|   word|count(word)|
+-------+-----------+
|totally|          2|
|     is|          1|
|  Spark|          1|
|awesome|          1|
+-------+-----------+



# Dataframe + SQL
1. Create a DataFrame.
2. Split and explode exactly as before.
3. Create a temporary view.
4. Count and group using SQL.



In [34]:
# Build a one-word-per-row DataFrame, then register it as the temporary view
# `lines` so that it can be queried with Spark SQL.
s = ["Spark is really really awesome!"]
lines_df = (
    spark.createDataFrame([s], ['word'])
    .withColumn('word', F.explode(F.split(F.col('word'), ' ')))
)
lines_df.createOrReplaceTempView('lines')


In [40]:
# Count the occurrences of each word by querying the `lines` view with plain SQL.
spark.sql(
    """select word, count(word) from lines group by word"""
).show()

+--------+-----------+
|    word|count(word)|
+--------+-----------+
|      is|          1|
|  really|          2|
|   Spark|          1|
|awesome!|          1|
+--------+-----------+



# RDD way (not recommended, in my opinion)




In [39]:
# Classic map/reduce word count on an RDD: pair every token with 1, then sum
# the 1s per token with reduceByKey.
s = "Spark is really really awesome!"
(
    spark.sparkContext
    .parallelize(s.split())
    .map(lambda token: (token, 1))
    .reduceByKey(add)
    .collect()
)

[('really', 2), ('Spark', 1), ('awesome!', 1), ('is', 1)]