Flatmap() can create many other multiple elements using a single one

In [26]:
import re
from pyspark import SparkConf, SparkContext

In [2]:
conf = SparkConf().setAppName("flat_map")

In [3]:
sc = SparkContext(conf=conf)

In [5]:
lines_rdd = sc.textFile("file:///var/lib/spark/jupyter/data/book.txt")

Below way of doing it is not that great

In [None]:
# words = lines_rdd.flatMap(lambda x:x.split())

In [23]:
# word_count = words.countByValue()

Slightly better way 

In [27]:
def parse_text(text):
    return re.compile(r'\W+', re.UNICODE).split(text.lower())

In [28]:
words = lines_rdd.flatMap(parse_text)

In [36]:
word_count = words.map(lambda x:(x,1)).reduceByKey(lambda x,y:x+y)

In [47]:
sorted_word = word_count.map(lambda (x,y):(y,x)).sortByKey("desc").map(lambda (x,y):(y,x))

In [49]:
result = sorted_word.collect()

In [51]:
for res in result:
    print(res[0], res[1])

(u'hats', 1)
(u'desirable', 1)
(u'four', 1)
(u'relationships', 1)
(u'extrapolate', 1)
(u'adaptive', 1)
(u'fondly', 1)
(u'announced', 1)
(u'force', 1)
(u'consistent', 1)
(u'electricians', 1)
(u'asia', 1)
(u'mailings', 1)
(u'disposable', 1)
(u'104', 1)
(u'obtained', 1)
(u'forum', 1)
(u'harass', 1)
(u'calculations', 1)
(u'astute', 1)
(u'criticism', 1)
(u'psychological', 1)
(u'conflicts', 1)
(u'achievable', 1)
(u'counts', 1)
(u'landscape', 1)
(u'catchy', 1)
(u'hospital', 1)
(u'strike', 1)
(u'breathe', 1)
(u'award', 1)
(u'93', 1)
(u'outdated', 1)
(u'divorced', 1)
(u'pursue', 1)
(u'roof', 1)
(u'exceptions', 1)
(u'addictive', 1)
(u'household', 1)
(u'appointment', 1)
(u'machine', 1)
(u'hot', 1)
(u'fudge', 1)
(u'mouths', 1)
(u'curiosity', 1)
(u'chump', 1)
(u'types', 1)
(u'attempt', 1)
(u'attracts', 1)
(u'teammates', 1)
(u'goodbye', 1)
(u'realizing', 1)
(u'salon', 1)
(u'rewarded', 1)
(u'digest', 1)
(u'edits', 1)
(u'401', 1)
(u'weeks', 1)
(u'versus', 1)
(u'affected', 1)
(u'prevented', 1)
(u'unemp

(u'outreach', 1)
(u'continuous', 1)
(u'released', 1)
(u'population', 1)
(u'12', 1)
(u'18', 1)
(u'summary', 1)
(u'transferred', 1)
(u'r', 1)
(u'fruits', 1)
(u'proofread', 1)
(u'russia', 1)
(u'published', 1)
(u'documentation', 1)
(u'san', 1)
(u'yoast', 1)
(u'diligence', 1)
(u'plunge', 1)
(u'repeatedly', 1)
(u'opposite', 1)
(u'supporting', 1)
(u'proud', 1)
(u'quantity', 1)
(u'laureate', 1)
(u'filed', 1)
(u'corner', 1)
(u'upgrade', 1)
(u'finalized', 1)
(u'nearly', 1)
(u'secondary', 1)
(u'reveal', 1)
(u'artist', 1)
(u'luxuries', 1)
(u'miles', 1)
(u'communicated', 1)
(u'ignored', 1)
(u'concert', 1)
(u'burst', 1)
(u'signs', 1)
(u'parking', 1)
(u'violates', 1)
(u'wonder', 1)
(u'receipt', 1)
(u'subsequent', 1)
(u'skyscraper', 1)
(u'perpetuity', 1)
(u'multiplied', 1)
(u'replacement', 1)
(u'reaction', 1)
(u'talented', 1)
(u'tuck', 1)
(u'trades', 1)
(u'landscaper', 1)
(u'delete', 1)
(u'nearby', 1)
(u'afterwards', 1)
(u'mails', 1)
(u'65', 1)
(u'graphics', 1)
(u'cares', 1)
(u'brainwashing', 1)
(u'ju

(u'rankings', 2)
(u'split', 2)
(u'luck', 2)
(u'independent', 2)
(u'series', 2)
(u'aids', 2)
(u'adapt', 2)
(u'wanted', 2)
(u'tags', 2)
(u'instagram', 2)
(u'convention', 2)
(u'wisdom', 2)
(u'horn', 2)
(u'ebay', 2)
(u'acquire', 2)
(u'though', 2)
(u'visitor', 2)
(u'advisors', 2)
(u'solutions', 2)
(u'sexism', 2)
(u'situations', 2)
(u'announce', 2)
(u'professionals', 2)
(u'despite', 2)
(u'acknowledgments', 2)
(u'decided', 2)
(u'projections', 2)
(u'disturbing', 2)
(u'cut', 2)
(u'wear', 2)
(u'patented', 2)
(u'refinement', 2)
(u'gloating', 2)
(u'101', 2)
(u'mitigation', 2)
(u'deeper', 2)
(u'xxx', 2)
(u'inertia', 2)
(u'complex', 2)
(u'workforce', 2)
(u'maximizing', 2)
(u'programs', 2)
(u'previous', 2)
(u'gave', 2)
(u'apart', 2)
(u'unnecessary', 2)
(u'statistic', 2)
(u'dropping', 2)
(u'discount', 2)
(u'census', 2)
(u'arms', 2)
(u'raises', 2)
(u'editor', 2)
(u'versions', 2)
(u'stages', 2)
(u'lately', 2)
(u'escape', 2)
(u'tie', 2)
(u'picture', 2)
(u'laws', 2)
(u'whoever', 2)
(u'polluted', 2)
(u'gov

(u'familiar', 4)
(u'favor', 4)
(u'decent', 4)
(u'quiz', 4)
(u'parts', 4)
(u'effect', 4)
(u'rift', 4)
(u'surge', 4)
(u'gain', 4)
(u'eat', 4)
(u'searched', 4)
(u'sleep', 5)
(u'above', 5)
(u'forums', 5)
(u'incubator', 5)
(u'travel', 5)
(u'vary', 5)
(u'went', 5)
(u'turning', 5)
(u'written', 5)
(u'rank', 5)
(u'second', 5)
(u'project', 5)
(u'seek', 5)
(u'recommended', 5)
(u'headline', 5)
(u'salespeople', 5)
(u'respond', 5)
(u'near', 5)
(u'launched', 5)
(u'sells', 5)
(u'meetup', 5)
(u'human', 5)
(u'yes', 5)
(u'county', 5)
(u'decisions', 5)
(u'post', 5)
(u'devices', 5)
(u'profits', 5)
(u'word', 5)
(u'clouds', 5)
(u'regardless', 5)
(u'cases', 5)
(u'hope', 5)
(u'placements', 5)
(u'whenever', 5)
(u'group', 5)
(u'whole', 5)
(u'fast', 5)
(u'sounds', 5)
(u'creatively', 5)
(u'review', 5)
(u'somewhere', 5)
(u'spare', 5)
(u'endeavor', 5)
(u'coffee', 5)
(u'relies', 5)
(u'minimize', 5)
(u'seems', 5)
(u'lets', 5)
(u'bigger', 5)
(u'outsource', 5)
(u'competition', 5)
(u'discussed', 5)
(u'backup', 5)
(u'imag

(u'basics', 12)
(u'budget', 12)
(u'network', 12)
(u'needed', 12)
(u'target', 12)
(u'emails', 12)
(u'everything', 12)
(u'realistic', 12)
(u'equipment', 12)
(u'freelancers', 12)
(u'sense', 12)
(u'scale', 12)
(u'move', 12)
(u'automated', 12)
(u'looking', 13)
(u'enjoy', 13)
(u'net', 13)
(u'tied', 13)
(u'lower', 13)
(u'bad', 13)
(u'interest', 13)
(u'support', 13)
(u'ensure', 13)
(u'happen', 13)
(u'price', 13)
(u'3', 13)
(u'engine', 13)
(u'myself', 13)
(u'ongoing', 13)
(u'5', 13)
(u'coverage', 13)
(u'put', 13)
(u'owners', 13)
(u'questions', 13)
(u'game', 13)
(u'send', 13)
(u'1', 13)
(u'unless', 13)
(u'avoid', 13)
(u'personally', 13)
(u'direct', 13)
(u'purchase', 13)
(u'days', 13)
(u'matter', 13)
(u'yet', 13)
(u'huge', 13)
(u'longer', 13)
(u'toward', 13)
(u'itself', 13)
(u'made', 13)
(u'estimate', 14)
(u'automatically', 14)
(u'effective', 14)
(u'require', 14)
(u'outside', 14)
(u'targeted', 14)
(u'read', 14)
(u'trends', 14)
(u'figure', 14)
(u'fact', 14)
(u'news', 14)
(u'face', 14)
(u'realize',

In [52]:
sc.stop()