# <center> Introduction to Spark In-memory Computing via Python PySpark </center>

## 1. Getting Started

Spark stores data in memory. This memory space is represented by the variable **sc** (SparkContext). 

In [2]:
# make sure kernel is pyspark
import sys
import os
import pyspark

In [3]:
print(os.environ['SPARK_ROOT'])
print(os.environ['SPARK_CONFIG_FILE'])
print(os.environ['SPARK_ROOT'])
print(os.environ['SPARK_MASTER_HOST'])
print(os.environ['SPARK_MASTER_PORT'])
print(os.environ['SPARK_MASTER_WEBUI_PORT'])

/software/spackages/linux-rocky8-x86_64/gcc-9.5.0/spark-3.1.1-3takuhnpd3av65aoge5ark5gligt6usb
/home/amann3/ondemand/data/sys/dashboard/batch_connect/sys/ood_jupyter_spark/output/04a1b626-170a-4ead-8f11-0b0661301574/spark-defaults.conf
/software/spackages/linux-rocky8-x86_64/gcc-9.5.0/spark-3.1.1-3takuhnpd3av65aoge5ark5gligt6usb
node0655.palmetto.clemson.edu
6304
3727


In [4]:
print(sc.getConf().getAll())

[('spark.app.startTime', '1685545238563'), ('spark.master', 'spark://node0655.palmetto.clemson.edu:6304'), ('spark.driver.memory', '2G'), ('spark.executor.memory', '13G'), ('spark.driver.port', '33247'), ('spark.executor.id', 'driver'), ('spark.app.name', 'pyspark-shell'), ('spark.ui.killEnabled', 'false'), ('spark.ui.proxyBase', '/proxy/app-20230531110040-0004'), ('spark.authenticate.secret', '2We2sHED'), ('spark.rdd.compress', 'True'), ('spark.ui.enabled', 'false'), ('spark.serializer.objectStreamReset', '100'), ('spark.driver.maxResultSize', '0'), ('spark.submit.pyFiles', ''), ('spark.submit.deployMode', 'client'), ('spark.driver.host', 'node0655.palmetto.clemson.edu'), ('spark.app.id', 'app-20230531110040-0004'), ('spark.ui.showConsoleProgress', 'true'), ('spark.authenticate', 'true'), ('spark.ui.reverseProxy', 'true')]


In [5]:
spark_app = "http://" + os.environ['SPARK_MASTER_HOST'] + ":" + os.environ['SPARK_MASTER_WEBUI_PORT'] + "/api/v1/applications"
print(spark_app)

http://node0655.palmetto.clemson.edu:3727/api/v1/applications


In [6]:
textFile = sc.textFile("/zfs/citi/complete-shakespeare.txt")

In [7]:
# prints pointer to the data somewhere in the spark cluster
print (textFile)

/zfs/citi/complete-shakespeare.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0


## 2. WordCount

In [8]:
textFile = sc.textFile("/zfs/citi/complete-shakespeare.txt")

In [9]:
textFile

/zfs/citi/complete-shakespeare.txt MapPartitionsRDD[3] at textFile at NativeMethodAccessorImpl.java:0

In [10]:
%%time
textFile.count()

[Stage 0:>                                                          (0 + 2) / 2]

CPU times: user 14.6 ms, sys: 4.06 ms, total: 18.6 ms
Wall time: 4.15 s


                                                                                

124796

In [11]:
wordcount = textFile.flatMap(lambda line: line.split(" ")) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda a, b: a + b)

In [12]:
wordcount

PythonRDD[9] at RDD at PythonRDD.scala:53

In [13]:
!rm -r ~/intro-to-spark
!mkdir ~/intro-to-spark

rm: cannot remove '/home/amann3/intro-to-spark': No such file or directory


In [14]:
wordcount.saveAsTextFile(os.path.join(os.environ['HOME'],"intro-to-spark/output-wordcount-01"))

                                                                                

In [15]:
!ls -l ~/intro-to-spark/output-wordcount-01

total 633
-rw-r--r-- 1 amann3 cuuser 534417 May 31 11:03 part-00000
-rw-r--r-- 1 amann3 cuuser 525830 May 31 11:03 part-00001
-rw-r--r-- 1 amann3 cuuser      0 May 31 11:03 _SUCCESS


In [16]:
!cat ~/intro-to-spark/output-wordcount-01/part-00000 \
    2>/dev/null | head -n 20

('The', 3977)
('Project', 85)
('EBook', 2)
('of', 15649)
('', 506672)
('Shakespeare', 45)
('is', 7874)
('use', 266)
('anyone', 4)
('anywhere', 4)
('at', 2227)
('no', 2439)
('restrictions', 2)
('whatsoever.', 3)
('may', 1341)
('it,', 529)
('give', 964)
('away', 294)
('re-use', 2)
('this', 4809)


**Step-by-step actions:**

In [17]:
!cat /zfs/citi/complete-shakespeare.txt \
    2>/dev/null | head -n 100

ï»¿The Project Gutenberg EBook of The Complete Works of William Shakespeare, by 
William Shakespeare

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org

** This is a COPYRIGHTED Project Gutenberg eBook, Details Below **
**     Please follow the copyright guidelines in this file.     **

Title: The Complete Works of William Shakespeare

Author: William Shakespeare

Posting Date: September 1, 2011 [EBook #100]
Release Date: January, 1994

Language: English


*** START OF THIS PROJECT GUTENBERG EBOOK COMPLETE WORKS--WILLIAM SHAKESPEARE ***




Produced by World Library, Inc., from their Library of the Future




This is the 100th Etext file presented by Project Gutenberg, and
is presented in cooperation with World Library, Inc., from their
Library of the Future and Shakespeare CDROMS.  Project

In [18]:
wordcount_step_01 = textFile.flatMap(lambda line: line.split(" "))

In [19]:
tmp = textFile.map(lambda line: line.split(" "))
tmp.take(20)

[['The',
  'Project',
  'Gutenberg',
  'EBook',
  'of',
  'The',
  'Complete',
  'Works',
  'of',
  'William',
  'Shakespeare,',
  'by',
  ''],
 ['William', 'Shakespeare'],
 [''],
 ['This',
  'eBook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with'],
 ['almost',
  'no',
  'restrictions',
  'whatsoever.',
  '',
  'You',
  'may',
  'copy',
  'it,',
  'give',
  'it',
  'away',
  'or'],
 ['re-use',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'Project',
  'Gutenberg',
  'License',
  'included'],
 ['with', 'this', 'eBook', 'or', 'online', 'at', 'www.gutenberg.org'],
 [''],
 ['**',
  'This',
  'is',
  'a',
  'COPYRIGHTED',
  'Project',
  'Gutenberg',
  'eBook,',
  'Details',
  'Below',
  '**'],
 ['**',
  '',
  '',
  '',
  '',
  'Please',
  'follow',
  'the',
  'copyright',
  'guidelines',
  'in',
  'this',
  'file.',
  '',
  '',
  '',
  '',
  '**'],
 [''],
 ['Title:', 'The', 'Complete', 'Works', 'of', 'William', 'S

In [20]:
wordcount_step_01

PythonRDD[14] at RDD at PythonRDD.scala:53

In [21]:
wordcount_step_01.take(20)

['The',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'The',
 'Complete',
 'Works',
 'of',
 'William',
 'Shakespeare,',
 'by',
 '',
 'William',
 'Shakespeare',
 '',
 'This',
 'eBook',
 'is',
 'for']

In [22]:
wordcount_step_02 = wordcount_step_01.map(lambda word: (word, 1))

In [23]:
wordcount_step_02.take(20)

[('The', 1),
 ('Project', 1),
 ('Gutenberg', 1),
 ('EBook', 1),
 ('of', 1),
 ('The', 1),
 ('Complete', 1),
 ('Works', 1),
 ('of', 1),
 ('William', 1),
 ('Shakespeare,', 1),
 ('by', 1),
 ('', 1),
 ('William', 1),
 ('Shakespeare', 1),
 ('', 1),
 ('This', 1),
 ('eBook', 1),
 ('is', 1),
 ('for', 1)]

In [24]:
wordcount_step_03 = wordcount_step_02.reduceByKey(lambda a, b: a + b)

In [26]:
wordcount_step_03.take(20)

[('', 506672),
 ('yard.', 3),
 ('mouse!', 1),
 ('Peace,', 83),
 ('peace;', 27),
 ('this', 4809),
 ('of', 15649),
 ('toasted', 3),
 ("There's", 216),
 ('gauntlet;', 1),
 ('giant.', 3),
 ('flown,', 1),
 ('bird!', 3),
 ("i'", 310),
 ('clout,', 1),
 ('clout!', 1),
 ('Hewgh!', 1),
 ('Give', 321),
 ('Edg.', 98),
 ('marjoram.', 1)]

### Challenge

- Augment the mapping process of WordCount with a function to filter out punctuations and capitalization from the unique words
  - Hint: The string module is helpful for removing punctuation.
  - Make sure your solution supports Python version 3.

In [28]:
import string

In [34]:
translator = str.maketrans('','',string.punctuation)
wordcount_enhanced = textFile.flatMap(lambda line: line.split(" ")) \
            .map(lambda word: (word.translate(translator).lower(), 1)) \
            .reduceByKey(lambda a, b: a + b)
print(wordcount_enhanced.take(50))

[Stage 12:>                                                         (0 + 2) / 2]

[('project', 329), ('gutenberg', 257), ('ebook', 16), ('of', 18289), ('shakespeare', 272), ('', 507028), ('this', 6894), ('is', 9621), ('use', 560), ('anyone', 7), ('anywhere', 8), ('at', 2522), ('no', 3807), ('restrictions', 2), ('whatsoever', 17), ('may', 1880), ('give', 1335), ('away', 859), ('reuse', 2), ('online', 4), ('details', 1), ('below', 58), ('copyright', 243), ('guidelines', 1), ('in', 11027), ('title', 88), ('posting', 5), ('1', 311), ('2011', 1), ('100', 2), ('january', 3), ('1994', 1), ('language', 37), ('start', 35), ('workswilliam', 2), ('produced', 4), ('world', 848), ('library', 233), ('inc', 224), ('presented', 18), ('cooperation', 1), ('cdroms', 1), ('are', 3874), ('placed', 10), ('public', 66), ('domain', 12), ('certain', 178), ('implications', 1), ('read', 205), ('version', 222)]


                                                                                

In [None]:
# wordcount = textFile.flatMap(lambda line: line.split(" ")) \
#             .map(lambda word: (word, 1)) \
#             .reduceByKey(lambda a, b: a + b)