In [1]:
#Aim: count how many times each unique word appears in the book.
#The words should not contain punctuation.

#Functions used: map, flatMap, countByValue, reduceByKey, sortByKey
#RegEx is used to strip punctuation from the words.

In [2]:
from pyspark import SparkConf, SparkContext

# Build the configuration step by step: master "local", application name "WordCount".
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("WordCount")

# Reuse an already-running SparkContext if one exists; otherwise create one from `conf`.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Load the book as an RDD of text lines.
# NOTE: the name `input` shadows Python's built-in input(); it is kept because later cells refer to it.
input = sc.textFile("/FileStore/tables/Book")

# Fetch every line and print them one by one.
book_lines = input.collect()
for line in book_lines:
  print(line)

In [4]:
# flatMap flattens the per-line word lists, so every word becomes its own row.
words = input.flatMap(lambda line: line.split())  # split() with no argument splits on whitespace

split_words = words.collect()
for token in split_words:
  print(token)

In [5]:
# Count how often each unique word occurs in the book using the "countByValue" function.

wordCounts = words.countByValue()

# wordCounts is a dictionary of word -> count, not an RDD.
# A bare `type(...)` in the middle of a cell is evaluated and thrown away, so print it to show the type.
print(type(wordCounts))

for i in wordCounts.items():  # a dictionary is iterated as (key, value) pairs via d.items()
  print(i)

In [6]:
# Better way to print each key (word) together with its count.

for word, count in wordCounts.items():
  # Drop non-ASCII characters, then decode back to text. On Python 3 encode() returns
  # bytes, and printing bytes (or str(bytes)) shows b'...' instead of the word itself.
  cleanWord = word.encode('ascii', 'ignore').decode('ascii')
  if cleanWord:  # skip words that contained only non-ASCII characters
    print(cleanWord, count)

In [7]:
# Looking at the results above, we can see that the parsed words still contain punctuation.
# So the words need to be cleaned further.
# We will therefore use a regular expression.

import re
def normalizeWords(text):
  r"""Lower-case ``text`` and return its words with punctuation removed.

  A word is a maximal run of ``\w`` characters. The words are matched
  directly instead of splitting on ``\W+``, because splitting yields empty
  strings for blank lines and for lines that start or end with punctuation,
  and those empty strings would then be counted as a word.

  Args:
    text: a string, e.g. one line of the input file.

  Returns:
    A list of lower-cased words; an empty list when ``text`` has no words.
  """
  return re.findall(r'\w+', text.lower(), re.UNICODE)

In [8]:
# With the regex-based splitter defined, repeat the same procedure as above.
words = input.flatMap(normalizeWords)
print(type(words))  # printed explicitly: a bare `type(words)` mid-cell is evaluated and discarded
for i in words.collect():
  print(i)

In [9]:
# The words now look cleaner, without the punctuation marks.
wordCount = words.countByValue()
print(type(wordCount))  # Dictionary; printed because a bare expression mid-cell shows nothing
for i in wordCount.items():
  print(i)

In [10]:
# For better output: print each word next to its count.
for word, count in wordCount.items():
  # encode() returns bytes on Python 3, which would print as b'...';
  # decode back to text after the non-ASCII characters have been dropped.
  cleanWord = word.encode('ascii', 'ignore').decode('ascii')
  if cleanWord:  # skip entries that are empty after cleaning
    print(cleanWord, count)

In [11]:
# Order the (word, count) pairs from the highest count down to the lowest.

sorted_wordCount = list(wordCount.items())
sorted_wordCount.sort(key=lambda pair: pair[1], reverse=True)

In [12]:
# Display the (word, count) pairs sorted by count, highest first.
sorted_wordCount

In [13]:
# Another way to sort by word count, this time using RDD operations.

words = input.flatMap(normalizeWords)  # Repeated step: one element per word occurrence (not aggregated yet)

# STEP 1: Pair each word with the value 1, giving (word, 1) tuples.
wordPair = words.map(lambda x: (x, 1))

# take(5) fetches only the first five pairs instead of collecting the whole RDD just to slice it.
for i in wordPair.take(5):
  print(i)


In [14]:
# STEP 2: Add up the values of all pairs that share the same key (word).
wordCounts = wordPair.reduceByKey(lambda x, y: x + y)

# Preview the first five results; take(5) avoids collecting the full RDD just to slice it.
for i in wordCounts.take(5):
  print(i)  # each 'i' is a (word, count) tuple

In [15]:
# STEP 3: Swap every pair to (count, word) so the count acts as the key, then sort by that key, largest first.
countThenWord = wordCounts.map(lambda pair: (pair[1], pair[0]))
sortedWordCount = countThenWord.sortByKey(ascending=False)

# Each collected element is (count, word); print the word first, then its count.
for count, word in sortedWordCount.collect():
  print(word, '\t', count)