# Cluster Test 

WordCount is the "canary in the coal mine" for testing clusters because counting how frequently each word occurs in a text is "embarrassingly parallel". As a result, testing access to the cluster usually involves running a simple word count on a file in HDFS.

In [2]:
## Imports
import os 

from os import path
from operator import add 

In [3]:
## Helper Variables
HDFS = "hdfs://{}".format(os.environ['HDFS'])
USER = path.join(HDFS, "user", "ec2-user")

In [4]:
# Conduct the wordcount, loading shakespeare as a text RDD. 
text = sc.textFile(path.join(USER, "shakespeare.txt"))

# Map the split function and assign each word a token of 1 
text = text.flatMap(lambda s: s.split())
text = text.map(lambda s: (s, 1))

# Reduce the words by summing their counts, then sort by word count. 
text = text.reduceByKey(add)
text = text.sortBy(lambda s: s[1], ascending=False)

In [5]:
# Print out the most common words 
text.take(20)

[(u'the', 25816),
 (u'I', 20402),
 (u'and', 19254),
 (u'to', 17222),
 (u'of', 16535),
 (u'a', 13870),
 (u'my', 11280),
 (u'in', 10347),
 (u'you', 9267),
 (u'is', 8239),
 (u'that', 7980),
 (u'And', 7313),
 (u'not', 7257),
 (u'with', 7221),
 (u'his', 6712),
 (u'be', 6293),
 (u'your', 6236),
 (u'for', 5974),
 (u'have', 5438),
 (u'it', 5214)]