# Count the 24 most frequent words in Shakespeare using Apache Spark

Follow this excellent guide to use PySpark and Jupyter on your Windows system: 
https://changhsinlee.com/install-pyspark-windows-jupyter/

In [1]:
import findspark # pip install findspark
# Directory where Spark is installed -- replace the placeholder with your own path.
# NOTE: a path copied from Windows Explorer contains "\", which Python treats as
# the start of an escape sequence in a normal string; write "/" instead.
spark_install_dir = "C:/YOUR-PATH-TO-SPARK"
findspark.init(spark_install_dir)

In [2]:
from operator import add

from pyspark import SparkConf, SparkContext

In [3]:
# Reuse the running SparkContext if this cell is executed again: constructing a
# second SparkContext in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# NOTE: when a context already exists, getOrCreate returns it unchanged, so the
# configuration passed here (including the app name) only applies on first creation.
spark = SparkContext.getOrCreate(SparkConf().setAppName("Wordcount"))

# Read the text file as an RDD of lines; insert your own filename here
contentRDD = spark.textFile("stripped_shakespeare.txt")

## Count, sort, and print the words

In [4]:
# `counts` is an RDD whose elements have the form (word, count)
counts = (
    contentRDD
    .flatMap(lambda line: line.split())      # one element per whitespace-separated token
    .map(lambda token: (token.lower(), 1))   # lower-cased word paired with an initial count of 1
    .reduceByKey(add)                        # add up the 1s for each distinct word
)

# collect() returns the RDD's elements as a Python list in local (driver) memory
output = counts.collect()

In [5]:
# Uncomment the loop below to print every (word, count) pair in `output`
# for (word, count) in output:
#     print("%s: %i" % (word, count))

In [6]:
# Order the (word, count) tuples from highest to lowest count.
# Work on a copy so that `output` keeps its original order.
sorted_output = list(output)
sorted_output.sort(key=lambda pair: pair[1], reverse=True)

In [7]:
# Show the leading entries of sorted_output, i.e. the most frequent words
top_n = 24
print(sorted_output[:top_n])

[('the', 27482), ('and', 25991), ('i', 19540), ('to', 18656), ('of', 17952), ('a', 14365), ('my', 12455), ('in', 10660), ('you', 10597), ('that', 10473), ('is', 9115), ('for', 7948), ('with', 7923), ('not', 7633), ('your', 6861), ('his', 6749), ('be', 6684), ('he', 5884), ('but', 5881), ('as', 5875), ('this', 5859), ('it', 5858), ('have', 5675), ('thou', 5138)]


## IMPORTANT! Stop the SparkContext when you are done

In [8]:
# Shut down the SparkContext bound to `spark`
spark.stop()