# Parsing Text and Cleaning

In [1]:
from pyspark.sql import SparkSession
from operator import add

In [2]:
# Obtain the notebook's SparkSession: reuse a running one if present,
# otherwise start a new session under the given application name.
spark = (
    SparkSession.builder
    .appName("Python Spark Session")
    .getOrCreate()
)

In [3]:
# Read the text file through the DataFrame reader, then drop down to the
# underlying RDD so the RDD API (map / flatMap / reduceByKey) can be used.
rdd_df = (
    spark.read
    .text("shake.txt")
    .rdd
)

In [4]:
# Each element of rdd_df is a record whose first field is the line's text;
# keep just that field so `lines` is an RDD of plain strings.
lines = rdd_df.map(
    lambda row: row[0]
)

In [5]:
# Pull every line back to the driver to eyeball the raw text.
# Fine for this 14-line sample; prefer lines.take(n) on larger inputs.
lines.collect()

['To be, or not to be, that is the question:',
 "Whether 'tis nobler in the mind to suffer",
 'The slings and arrows of outrageous fortune,',
 'Or to take arms against a sea of troubles',
 'And by opposing end them. To die—to sleep,',
 'No more; and by a sleep to say we end',
 'The heart-ache and the thousand natural shocks',
 "That flesh is heir to: 'tis a consummation",
 "Devoutly to be wish'd. To die, to sleep;",
 "To sleep, perchance to dream—ay, there's the rub:",
 'For in that sleep of death what dreams may come,',
 'When we have shuffled off this mortal coil,',
 "Must give us pause—there's the respect",
 'That makes calamity of so long life.']

In [6]:
# Number of lines read from the input file.
lines.count()

14

In [7]:
# Tokenise every line into words and flatten the per-line lists into one RDD.
# str.split() with no separator splits on any run of whitespace (spaces, tabs,
# ...) and never yields empty strings, whereas split(' ') emits '' for repeated
# spaces and leaves tab-separated words glued together.
splits = lines.flatMap(lambda text_line: text_line.split())

In [8]:
# Inspect the raw tokens produced by the split step (original casing and
# any attached punctuation are still present at this stage).
splits.collect()

['To',
 'be,',
 'or',
 'not',
 'to',
 'be,',
 'that',
 'is',
 'the',
 'question:',
 'Whether',
 "'tis",
 'nobler',
 'in',
 'the',
 'mind',
 'to',
 'suffer',
 'The',
 'slings',
 'and',
 'arrows',
 'of',
 'outrageous',
 'fortune,',
 'Or',
 'to',
 'take',
 'arms',
 'against',
 'a',
 'sea',
 'of',
 'troubles',
 'And',
 'by',
 'opposing',
 'end',
 'them.',
 'To',
 'die—to',
 'sleep,',
 'No',
 'more;',
 'and',
 'by',
 'a',
 'sleep',
 'to',
 'say',
 'we',
 'end',
 'The',
 'heart-ache',
 'and',
 'the',
 'thousand',
 'natural',
 'shocks',
 'That',
 'flesh',
 'is',
 'heir',
 'to:',
 "'tis",
 'a',
 'consummation',
 'Devoutly',
 'to',
 'be',
 "wish'd.",
 'To',
 'die,',
 'to',
 'sleep;',
 'To',
 'sleep,',
 'perchance',
 'to',
 'dream—ay,',
 "there's",
 'the',
 'rub:',
 'For',
 'in',
 'that',
 'sleep',
 'of',
 'death',
 'what',
 'dreams',
 'may',
 'come,',
 'When',
 'we',
 'have',
 'shuffled',
 'off',
 'this',
 'mortal',
 'coil,',
 'Must',
 'give',
 'us',
 "pause—there's",
 'the',
 'respect',
 'That

In [9]:
# Normalise tokens: fold to lower case and trim surrounding whitespace.
# NOTE: strip() with no argument removes whitespace only, so punctuation
# attached to a word (e.g. 'be,' or 'question:') is left in place.
lower_splits = splits.map(
    lambda word: word.lower().strip()
)

In [10]:
# Inspect the lower-cased, whitespace-trimmed tokens.
lower_splits.collect()

['to',
 'be,',
 'or',
 'not',
 'to',
 'be,',
 'that',
 'is',
 'the',
 'question:',
 'whether',
 "'tis",
 'nobler',
 'in',
 'the',
 'mind',
 'to',
 'suffer',
 'the',
 'slings',
 'and',
 'arrows',
 'of',
 'outrageous',
 'fortune,',
 'or',
 'to',
 'take',
 'arms',
 'against',
 'a',
 'sea',
 'of',
 'troubles',
 'and',
 'by',
 'opposing',
 'end',
 'them.',
 'to',
 'die—to',
 'sleep,',
 'no',
 'more;',
 'and',
 'by',
 'a',
 'sleep',
 'to',
 'say',
 'we',
 'end',
 'the',
 'heart-ache',
 'and',
 'the',
 'thousand',
 'natural',
 'shocks',
 'that',
 'flesh',
 'is',
 'heir',
 'to:',
 "'tis",
 'a',
 'consummation',
 'devoutly',
 'to',
 'be',
 "wish'd.",
 'to',
 'die,',
 'to',
 'sleep;',
 'to',
 'sleep,',
 'perchance',
 'to',
 'dream—ay,',
 "there's",
 'the',
 'rub:',
 'for',
 'in',
 'that',
 'sleep',
 'of',
 'death',
 'what',
 'dreams',
 'may',
 'come,',
 'when',
 'we',
 'have',
 'shuffled',
 'off',
 'this',
 'mortal',
 'coil,',
 'must',
 'give',
 'us',
 "pause—there's",
 'the',
 'respect',
 'that

In [11]:
# Tokens to discard. Matching is by exact equality, so ',' and '.' only
# remove tokens that consist solely of that character; a word with trailing
# punctuation such as 'be,' is NOT removed or altered by this filter.
prep = [
    'the',
    'a',
    ',',
    '.',
]

# Keep a token only if it is non-empty and is not one of the discarded entries.
tokens = lower_splits.filter(
    lambda word: word and word not in prep
)

In [12]:
# Inspect the tokens that survived the filter above.
tokens.collect()

['to',
 'be,',
 'or',
 'not',
 'to',
 'be,',
 'that',
 'is',
 'question:',
 'whether',
 "'tis",
 'nobler',
 'in',
 'mind',
 'to',
 'suffer',
 'slings',
 'and',
 'arrows',
 'of',
 'outrageous',
 'fortune,',
 'or',
 'to',
 'take',
 'arms',
 'against',
 'sea',
 'of',
 'troubles',
 'and',
 'by',
 'opposing',
 'end',
 'them.',
 'to',
 'die—to',
 'sleep,',
 'no',
 'more;',
 'and',
 'by',
 'sleep',
 'to',
 'say',
 'we',
 'end',
 'heart-ache',
 'and',
 'thousand',
 'natural',
 'shocks',
 'that',
 'flesh',
 'is',
 'heir',
 'to:',
 "'tis",
 'consummation',
 'devoutly',
 'to',
 'be',
 "wish'd.",
 'to',
 'die,',
 'to',
 'sleep;',
 'to',
 'sleep,',
 'perchance',
 'to',
 'dream—ay,',
 "there's",
 'rub:',
 'for',
 'in',
 'that',
 'sleep',
 'of',
 'death',
 'what',
 'dreams',
 'may',
 'come,',
 'when',
 'we',
 'have',
 'shuffled',
 'off',
 'this',
 'mortal',
 'coil,',
 'must',
 'give',
 'us',
 "pause—there's",
 'respect',
 'that',
 'makes',
 'calamity',
 'of',
 'so',
 'long',
 'life.']

In [13]:
# Pair every token with an initial count of 1 so the pairs can later be
# summed per token. Each pair is a two-element list: [token, 1].
token_list = tokens.map(
    lambda word: [word, 1]
)

In [14]:
# Inspect the [token, 1] pairs before aggregation.
token_list.collect()

[['to', 1],
 ['be,', 1],
 ['or', 1],
 ['not', 1],
 ['to', 1],
 ['be,', 1],
 ['that', 1],
 ['is', 1],
 ['question:', 1],
 ['whether', 1],
 ["'tis", 1],
 ['nobler', 1],
 ['in', 1],
 ['mind', 1],
 ['to', 1],
 ['suffer', 1],
 ['slings', 1],
 ['and', 1],
 ['arrows', 1],
 ['of', 1],
 ['outrageous', 1],
 ['fortune,', 1],
 ['or', 1],
 ['to', 1],
 ['take', 1],
 ['arms', 1],
 ['against', 1],
 ['sea', 1],
 ['of', 1],
 ['troubles', 1],
 ['and', 1],
 ['by', 1],
 ['opposing', 1],
 ['end', 1],
 ['them.', 1],
 ['to', 1],
 ['die—to', 1],
 ['sleep,', 1],
 ['no', 1],
 ['more;', 1],
 ['and', 1],
 ['by', 1],
 ['sleep', 1],
 ['to', 1],
 ['say', 1],
 ['we', 1],
 ['end', 1],
 ['heart-ache', 1],
 ['and', 1],
 ['thousand', 1],
 ['natural', 1],
 ['shocks', 1],
 ['that', 1],
 ['flesh', 1],
 ['is', 1],
 ['heir', 1],
 ['to:', 1],
 ["'tis", 1],
 ['consummation', 1],
 ['devoutly', 1],
 ['to', 1],
 ['be', 1],
 ["wish'd.", 1],
 ['to', 1],
 ['die,', 1],
 ['to', 1],
 ['sleep;', 1],
 ['to', 1],
 ['sleep,', 1],
 ['perchanc

In [15]:
# Word count: sum the per-token 1s with operator.add, then order the
# (token, total) pairs by total, largest first.
count = (
    token_list
    .reduceByKey(add)
    .sortBy(lambda pair: pair[1], ascending=False)
)

In [16]:
# Show the final (token, count) pairs, most frequent first.
count.collect()

[('to', 11),
 ('that', 4),
 ('and', 4),
 ('of', 4),
 ('be,', 2),
 ('or', 2),
 ('is', 2),
 ("'tis", 2),
 ('in', 2),
 ('by', 2),
 ('end', 2),
 ('sleep,', 2),
 ('sleep', 2),
 ('we', 2),
 ('not', 1),
 ('question:', 1),
 ('whether', 1),
 ('nobler', 1),
 ('mind', 1),
 ('suffer', 1),
 ('slings', 1),
 ('arrows', 1),
 ('outrageous', 1),
 ('fortune,', 1),
 ('take', 1),
 ('arms', 1),
 ('against', 1),
 ('sea', 1),
 ('troubles', 1),
 ('opposing', 1),
 ('them.', 1),
 ('die—to', 1),
 ('no', 1),
 ('more;', 1),
 ('say', 1),
 ('heart-ache', 1),
 ('thousand', 1),
 ('natural', 1),
 ('shocks', 1),
 ('flesh', 1),
 ('heir', 1),
 ('to:', 1),
 ('consummation', 1),
 ('devoutly', 1),
 ('be', 1),
 ("wish'd.", 1),
 ('die,', 1),
 ('sleep;', 1),
 ('perchance', 1),
 ('dream—ay,', 1),
 ("there's", 1),
 ('rub:', 1),
 ('for', 1),
 ('death', 1),
 ('what', 1),
 ('dreams', 1),
 ('may', 1),
 ('come,', 1),
 ('when', 1),
 ('have', 1),
 ('shuffled', 1),
 ('off', 1),
 ('this', 1),
 ('mortal', 1),
 ('coil,', 1),
 ('must', 1),
 (