In [1]:
import re
from pyspark.sql import SparkSession

In [2]:
# Create PySpark Session
spark = SparkSession.builder.appName("LetterCount").getOrCreate()
# Read input file and store it as a Resilient Distributed Dataset
text = spark.sparkContext.textFile("./input.txt")
# Print first 10 lines
print(text.take(10))

['The Project Gutenberg EBook of The Complete Works of William Shakespeare, by', 'William Shakespeare', '', 'This eBook is for the use of anyone anywhere at no cost and with', 'almost no restrictions whatsoever.  You may copy it, give it away or', 're-use it under the terms of the Project Gutenberg License included', 'with this eBook or online at www.gutenberg.org', '', '** This is a COPYRIGHTED Project Gutenberg eBook, Details Below **', '**     Please follow the copyright guidelines in this file.     **']


In [3]:
def splitter(line):
    # Remove any non-words
    line = re.sub(r"^\W+|\W+$", "", line)
    # Split lines into separate words and convert all words to lowercase
    words = map(str.lower, re.split(r"\W+", line))
    # Replace non-letter characters with empty strings
    words = [re.sub(r'[^a-z]+', '', word) for word in words]
    # Remove all empty strings
    words = list(filter(None, words))
    # Return a list of the first character of each words
    return [word[0] for word in words]

# Create an RDD of first letter of each word in the input file
letters = text.flatMap(splitter)
print(letters.take(25))

['t', 'p', 'g', 'e', 'o', 't', 'c', 'w', 'o', 'w', 's', 'b', 'w', 's', 't', 'e', 'i', 'f', 't', 'u', 'o', 'a', 'a', 'a', 'n']


In [4]:
# Map Step

# Assign a count of 1 to each letter
mapped_letters = letters.map(lambda x: (x,1))
print(mapped_letters.take(10))

[('t', 1), ('p', 1), ('g', 1), ('e', 1), ('o', 1), ('t', 1), ('c', 1), ('w', 1), ('o', 1), ('w', 1)]


In [5]:
# Reduce Step
from operator import add

# Aggregate the total count of each letter
counts = mapped_letters.reduceByKey(add)
print(counts.take(10))

[('p', 28059), ('g', 21167), ('c', 34983), ('s', 75226), ('b', 46001), ('i', 62420), ('r', 15234), ('y', 25926), ('l', 32389), ('d', 39173)]


In [6]:
# Store letter_counts as a dictionary with starting letter as key and count as value
letter_counts = {item[0] : item[1] for item in counts.collect()}

In [7]:
# Save result as JSON
import json
with open("./output/letter_count.json", "w") as outfile:
    json.dump(letter_counts, outfile, indent = 4)