In [None]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the running SparkContext if one exists, otherwise start a new one.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext and take its SparkSession.
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Glue Job handle built on the same context; not used elsewhere in the visible cells.
job = Job(glueContext)

In [None]:
# Location of the input text; "BUCKET_NAME" looks like a placeholder — TODO confirm
# it is replaced with a real path before running.
bucket_name = "BUCKET_NAME"

# Read the input as an RDD of lines, then split every line on whitespace so the
# result is a flat RDD of individual tokens.
lines = sc.textFile(bucket_name)
words_rdd = lines.flatMap(lambda line: line.split())

# Preview the first five tokens.
words_rdd.take(5)

In [None]:
# Tally how often each distinct token occurs; countByValue() brings the result
# back to the driver as a mapping of token -> count.
word_counts = words_rdd.countByValue()
for word, count in word_counts.items():
    # Drop non-ASCII characters, then decode back to str: in Python 3 encode()
    # returns bytes, which would otherwise print as b'...'.
    clean_word = word.encode('ascii', 'ignore').decode('ascii')
    # Skip tokens that consisted only of non-ASCII characters.
    if clean_word:
        print(clean_word, count)





## We got some results, but they include punctuation, which adds noise to our result set

We can use a regular expression to lowercase the text and split it on non-word characters, which improves our analysis.

In [10]:
import re
def normalizeWords(lines):
    r"""Split one string of text into lowercase word tokens.

    The text is lowercased and split on runs of non-word characters
    (``\W+``), so punctuation and whitespace both act as separators.

    Args:
        lines: A single string of text to tokenize.

    Returns:
        list[str]: The lowercase tokens in order of appearance. Empty strings,
        which the split yields when the text is empty or starts/ends with a
        separator, are dropped so they are not counted as words.
    """
    tokens = re.split(r'\W+', lines.lower(), flags=re.UNICODE)
    # Leading/trailing separators produce '' entries; keep only real tokens.
    return [token for token in tokens if token]

# Input location passed to sc.textFile below; "BUCKET_NAME" looks like a placeholder — TODO confirm.
bucket_name = "BUCKET_NAME"




In [None]:
# Re-read the input and tokenize each line with normalizeWords instead of a plain whitespace split.
lines = sc.textFile(bucket_name)
words_rdd = lines.flatMap(normalizeWords)
# Preview the first five tokens.
words_rdd.take(5)

In [None]:
# Tally how often each distinct token occurs; countByValue() brings the result
# back to the driver as a mapping of token -> count.
word_counts = words_rdd.countByValue()
for word, count in word_counts.items():
    # Drop non-ASCII characters, then decode back to str: in Python 3 encode()
    # returns bytes, which would otherwise print as b'...'.
    clean_word = word.encode('ascii', 'ignore').decode('ascii')
    # Skip tokens that are empty once non-ASCII characters are removed.
    if clean_word:
        print(clean_word, count)


## Now, we also need to sort the RDD to find the most popular words in the book

To do that with `sortByKey`, we need a key-value RDD, because it sorts by key only. We swap each (word, count) pair into (count, word), so that the counts become the keys and the words become the values, and then sort by key.

In [None]:
# Count words by hand: pair every token with 1, then add up the 1s per token.
word_counts_manual = (
    words_rdd
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)
# Preview five (word, count) pairs.
word_counts_manual.take(5)

In [None]:
# Swap each (word, count) pair to (count, word) so the count becomes the key,
# then sort by key from the highest count to the lowest.
word_counts_manual_sorted = (
    word_counts_manual
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
)
# collect() pulls every (count, word) pair back to the driver for printing.
for count, word in word_counts_manual_sorted.collect():
    # Drop non-ASCII characters, then decode back to str: in Python 3 encode()
    # returns bytes, which would otherwise print as b'...'.
    clean_word = word.encode('ascii', 'ignore').decode('ascii')
    # Skip tokens that are empty once non-ASCII characters are removed.
    if clean_word:
        print(clean_word, count)