In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.6.tgz
!tar xvf spark-2.4.4-bin-hadoop2.6.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.6"
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext(appName="PySpark_dataframe")

# Introduction to Resilient Distributed Datasets (RDD)

In [1]:
import pyspark

# Reuse the SparkContext started by the setup cell when there is one: calling
# the SparkContext constructor a second time in the same kernel raises
# ValueError. The app name only takes effect if a new context is created.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("PySpark_RDD"))

ModuleNotFoundError: No module named 'pyspark'

## Creating RDDs. How to do it?
RDD stands for Resilient Distributed Dataset. RDDs are collections of elements that are partitioned across multiple nodes so that they can be processed in parallel on a cluster. RDDs are immutable: once you create an RDD you cannot change it. RDDs are also fault tolerant, so in case of a failure they recover automatically.

* Parallelizing an existing collection of objects
* External datasets:
  * Files in HDFS
  * Objects in Amazon S3 bucket
  * Lines in a text file
* From existing RDDs

### Parallelized collection (parallelizing)
* *parallelize()* for creating RDDs from python list

In [None]:
# Distribute a Python list as an RDD
numRDD = sc.parallelize([1, 2, 3, 4])

In [None]:
# NOTE: a str is iterated element by element, so this RDD holds the individual
# characters of the string rather than one "Hello world" element.
helloRDD = sc.parallelize("Hello world")

In [None]:
# Confirm that parallelize() returned an RDD object
type(helloRDD)

pyspark.rdd.RDD

### From external datasets
* *textFile()* for creating RDDs from external dataset

In [None]:
# Build an RDD from a text file; the path is relative to the working directory
fileRDD = sc.textFile("sample_data/README.md")

In [None]:
# Confirm that textFile() returned an RDD object
type(fileRDD)

pyspark.rdd.RDD

## Understanding Partitioning in PySpark
* A partition is a logical division of a large distributed data set
* *parallelize()* method

In [None]:
# The second argument requests 6 partitions for the RDD
numRDD = sc.parallelize(range(10), 6)

In [None]:
# Number of partitions numRDD is split into
numRDD.getNumPartitions()

6

* textFile() method

In [None]:
# The second argument is the minimum number of partitions to split the file into
fileRDD = sc.textFile("sample_data/README.md", 6)

In [None]:
# Number of partitions fileRDD is split into
fileRDD.getNumPartitions()

6

Your Turn:
* Create an RDD named RDD from a list of words.
* Confirm the object created is RDD.


In [None]:
# Exercise template: replace each ____ placeholder before running this cell.
# Create an RDD from a list of words
RDD = sc.____(["Spark", "is", "a", "framework", "for", "Big Data processing"])

# Print out the type of the created object
print("The type of RDD is", ____(RDD))

* Print the file_path.
* Create an RDD named fileRDD from a file_path with the file name README.md.
* Print the type of the fileRDD created.

In [None]:
# Path of the file used in the next exercises, relative to the working directory
file_path = "sample_data/README.md"

In [None]:
# Exercise template: replace each ____ placeholder before running this cell.
# Print the file_path
print("The file_path is", ____)

# Create a fileRDD from file_path
fileRDD = sc.____(file_path)

# Check the type of fileRDD
print("The file type of fileRDD is", type(____))

* Find the number of partitions in the fileRDD RDD.
* Create an RDD named fileRDD_part from the file path but create 5 partitions.
* Confirm the number of partitions in the new fileRDD_part RDD.

In [None]:
# Exercise template: replace each ____ placeholder before running this cell.
# Check the number of partitions in fileRDD.
# getNumPartitions is a method: without the call parentheses the bound method
# object is printed instead of the partition count.
print("Number of partitions in fileRDD is", fileRDD.getNumPartitions())

# Create a fileRDD_part from file_path with 5 partitions
fileRDD_part = sc.textFile(____, ____)

# Check the number of partitions in fileRDD_part
print("Number of partitions in fileRDD_part is", fileRDD_part.____)

## Spark operations
spark_operations = Transformations + Actions
* Transformations create new RDDs
* Actions perform computation on the RDDs

### RDD Transformations

#### map() Transformation
map() transformation applies a function to all elements in the RDD

In [None]:
# Square every element of a small RDD.
RDD = sc.parallelize([1, 2, 3, 4])
RDD_map = RDD.map(lambda value: value ** 2)
# collect() is the action that brings the squared values back as a list
RDD_map.collect()

[1, 4, 9, 16]

#### filter() Transformation
Filter transformation returns a new RDD with only the elements that pass the condition

In [None]:
# Keep only the elements greater than 2.
RDD = sc.parallelize([1, 2, 3, 4])
RDD_filter = RDD.filter(lambda value: 2 < value)
# collect() is the action that returns the surviving elements
RDD_filter.collect()

[3, 4]

#### flatMap()
* flatMap() transformation returns multiple values for each element in the original RDD

In [None]:
# Split each sentence on single spaces and flatten the pieces into one RDD of words.
RDD = sc.parallelize(["hello World", "How are you"])
RDD_flatmap = RDD.flatMap(lambda sentence: sentence.split(" "))
# collect() is the action that returns the flattened words
RDD_flatmap.collect()

['hello', 'World', 'How', 'are', 'you']

#### union() Transformation
* union() transformation returns a new RDD containing the elements of both RDDs

In [None]:
# Select the log lines containing the word "error" and those containing the
# word "warnings" (whole-word match after splitting on whitespace), then
# combine both selections into a single RDD.
inputRDD = sc.textFile("logs.txt")
errorRDD = inputRDD.filter(lambda log_line: "error" in log_line.split())
warningsRDD = inputRDD.filter(lambda log_line: "warnings" in log_line.split())
combinedRDD = errorRDD.union(warningsRDD)

### RDD Actions

* Operations that return a value after running a computation on the RDD
* Basic RDD actions
  * collect()
  * take(N)
  * first()
  * count()

#### collect() and take() Actions
* collect() returns all the elements of the dataset as an array
* take(N) returns an array with the first N elements of the dataset

In [None]:
# Action: return every element of RDD_map as a list
RDD_map.collect()

[1, 4, 9, 16]

In [None]:
# Action: return only the first 2 elements of RDD_map
RDD_map.take(2)

[1, 4]

#### first() and count() Actions
* first() returns the first element of the RDD
* count() returns the number of elements in the RDD

In [None]:
# Action: return the first element of RDD_map
RDD_map.first()

1

In [None]:
# Action: number of elements in RDD_flatmap
RDD_flatmap.count()

5

### Your Turn

In [None]:
# RDD holding the integers 1 through 10 for the map() exercise
numbRDD = sc.parallelize(list(range(1, 11)))

* Create map() transformation that cubes all of the numbers in numbRDD.
* Collect the results in a numbers_all variable.
* Print the output from numbers_all variable.

In [None]:
# Exercise template: replace each ____ placeholder before running this cell.
# Create map() transformation to cube numbers
cubedRDD = numbRDD.map(lambda x: ____)

# Collect the results
numbers_all = cubedRDD.____()

# Print the numbers from numbers_all
for numb in ____:
	print(____)

* Create filter() transformation to select the lines containing the keyword Spark.
* How many lines in fileRDD_filter contain the keyword Spark?
* Print the first four lines of the resulting RDD.

In [None]:
# Load the text file used for the filter() exercise
fileRDD = sc.textFile('Pyspark_RDD_q1.txt')

In [None]:
# Exercise template: replace each ____ placeholder before running this cell.
# Filter the fileRDD to select lines with Spark keyword
fileRDD_filter = fileRDD.filter(lambda line: 'Spark' in ____)

# How many lines in fileRDD_filter contain the keyword Spark?
print("The total number of lines with the keyword Spark is", fileRDD_filter.____())

# Print the first four lines of fileRDD_filter
for line in fileRDD_filter.____(____): 
  print(line)

In [None]:
# Keep only the lines of fileRDD that contain the substring 'Spark'
fileRDD_filter = fileRDD.filter(lambda text: 'Spark' in text)

# Report how many lines matched
print("The total number of lines with the keyword Spark is", fileRDD_filter.count())

# Show the first four matching lines
for matched in fileRDD_filter.take(4):
    print(matched)

The total number of lines with the keyword Spark is 7
 'Examples for Learning Spark',
 'Examples for the Learning Spark book. These examples require a number of libraries and as such have long build files. We have also added a stand alone example with minimal dependencies and a small build file',
 'These examples have been updated to run against Spark 1.3 so they may',
 'be slightly different than the versions in your copy of "Learning Spark".',


## Pair RDDs

### Introduction to pair RDDs in PySpark
* Real life datasets are usually key/value pairs
* Each row is a key and maps to one or more values
* PairRDD is a special data structure to work with this kind of datasets
* PairRDD: Key is the identifier and value is data

### Creating pair RDDs
* Two common ways to create pairRDDs
  * From a list of key-value tuple
  * From a regularRDD
* Get the data into key/value form for paired RDD

In [None]:
# Build a pair RDD directly from a list of (key, value) tuples
my_tuple = [('Sam', 23), ('Mary', 34), ('Peter', 25)]
pairRDD_tuple = sc.parallelize(my_tuple)
# Action: show the pairs
pairRDD_tuple.collect()

[('Sam', 23), ('Mary', 34), ('Peter', 25)]

In [None]:
# Build a pair RDD from a regular RDD of "name age" strings.
my_list = ['Sam 23', 'Mary 34', 'Peter 25']
regularRDD = sc.parallelize(my_list)
# Split each string on single spaces once, then keep the first two fields as
# the (key, value) pair; the value stays a string.
pairRDD_RDD = (
    regularRDD
    .map(lambda text: text.split(' '))
    .map(lambda fields: (fields[0], fields[1]))
)
pairRDD_RDD.collect()

[('Sam', '23'), ('Mary', '34'), ('Peter', '25')]

### Transformations on pair RDDs
* All regular transformations work on pairRDD
* Have to pass functions that operate on key value pairs rather than on individual elements
* Examples of paired RDD Transformations
  * reduceByKey(func): Combine values with the same key
  * groupByKey(): Group values with the same key
  * sortByKey(): Return an RDD sorted by the key
  * join(): Join two pairRDDs based on their key

#### reduceByKey() transformation
* reduceByKey() transformation combines values with the same key
* It runs parallel operations for each key in the dataset
* It is a transformation and not action

In [None]:
# Sum the values that share the same key (player name).
regularRDD = sc.parallelize([
    ("Messi", 23),
    ("Ronaldo", 34),
    ("Neymar", 22),
    ("Messi", 24),
    ("Messi", 24),
])
pairRDD_reducebykey = regularRDD.reduceByKey(lambda running_total, value: running_total + value)
pairRDD_reducebykey.collect()

[('Ronaldo', 34), ('Messi', 71), ('Neymar', 22)]

#### sortByKey() transformation
* sortByKey() operation orders pairRDD by key
* It returns an RDD sorted by key in ascending or descending order

In [None]:
# Swap each (name, total) pair to (total, name) so the totals become the keys,
# then sort by key from largest to smallest.
pairRDD_reducebykey_rev = pairRDD_reducebykey.map(lambda name_total: (name_total[1], name_total[0]))
pairRDD_reducebykey_rev.sortByKey(ascending=False).collect()


[(71, 'Messi'), (34, 'Ronaldo'), (22, 'Neymar')]

#### groupByKey() transformation
* groupByKey() groups all the values with the same key in the pair RDD

In [None]:
# Group airport codes by country code.
airports = [("US", "JFK"), ("UK", "LHR"), ("FR", "CDG"), ("US", "SFO")]
regularRDD = sc.parallelize(airports)
pairRDD_group = regularRDD.groupByKey().collect()
# Each grouped value is an iterable; convert it to a list for printing.
for country, codes in pairRDD_group:
    print(country, list(codes))

US ['JFK', 'SFO']
UK ['LHR']
FR ['CDG']


#### join() transformation
* join() transformation joins the two pairRDDs based on their key

In [None]:
# Two pair RDDs that use the same keys (player names) with different values
RDD1 = sc.parallelize([("Messi", 34),("Ronaldo", 32),("Neymar", 24)])
RDD2 = sc.parallelize([("Ronaldo", 80),("Neymar", 120),("Messi", 100)])

In [None]:
# Join on key: each result element is (key, (value_from_RDD1, value_from_RDD2))
RDD1.join(RDD2).collect()

[('Neymar', (24, 120)), ('Ronaldo', (32, 80)), ('Messi', (34, 100))]

### Your Turn

* Create a pair RDD named Rdd with tuples (1,2),(3,4),(3,6),(4,5).
* Transform the Rdd with reduceByKey() into a pair RDD Rdd_Reduced by adding the values with the same key.
* Collect the contents of pair RDD Rdd_Reduced and iterate to print the output.

In [None]:
# Exercise template: replace each ____ placeholder before running this cell.
# Create PairRDD Rdd with key value pairs
Rdd = sc.parallelize([____])

# Apply reduceByKey() operation on Rdd
Rdd_Reduced = Rdd.reduceByKey(lambda x, y: ____)

# Iterate over the result and print the output
for num in Rdd_Reduced.____: 
  print("Key {} has {} Counts".format(____, num[1]))

In [None]:
# Pair RDD of (key, value) tuples; key 3 appears twice
Rdd = sc.parallelize([(1, 2), (3, 4), (3, 6), (4, 5)])

# Add up the values that share a key
Rdd_Reduced = Rdd.reduceByKey(lambda left, right: left + right)

# Collect the reduced pairs and print one line per key
for key, summed in Rdd_Reduced.collect():
    print("Key {} has {} Counts".format(key, summed))

* Sort the Rdd_Reduced RDD using the key in descending order.
* Collect the contents and iterate to print the output.

In [None]:
# Exercise template: replace each ____ placeholder before running this cell.
# Sort the reduced RDD with the key by descending order
Rdd_Reduced_Sort = Rdd_Reduced.____(ascending=False)

# Iterate over the result and print the output
for num in Rdd_Reduced_Sort.____():
  print("Key {} has {} Counts".format(____, num[1]))

In [None]:
# Order the reduced pairs by key, largest key first
Rdd_Reduced_Sort = Rdd_Reduced.sortByKey(ascending=False)

# Collect the sorted pairs and print one line per key
for key, summed in Rdd_Reduced_Sort.collect():
    print("Key {} has {} Counts".format(key, summed))

## Advanced RDD Actions

### reduce() action
* reduce(func) action is used for aggregating the elements of a regularRDD
* The function should be commutative (changing the order of the operands does not change the result) and associative
* An example of reduce() action in PySpark

In [None]:
# Aggregate all elements of the RDD into a single sum.
x = [1, 3, 4, 6]
RDD = sc.parallelize(x)
# Distinct lambda parameter names avoid shadowing the list `x` above.
RDD.reduce(lambda accumulated, element: accumulated + element)

14

### saveAsTextFile() action
* saveAsTextFile() action saves RDD into a text file inside a directory with each partition as a separate file

In [None]:
import shutil

# Spark refuses to write into an output directory that already exists, so
# remove the result of a previous run first; this keeps the cell re-runnable.
# Assumes the relative path resolves to the local working directory (local mode).
shutil.rmtree("tempFile", ignore_errors=True)
RDD.saveAsTextFile("tempFile")

* coalesce() method can be used to save RDD as a single text file

In [None]:
import shutil

# Spark refuses to write into an output directory that already exists, so
# remove the result of a previous run first; this keeps the cell re-runnable.
# Assumes the relative path resolves to the local working directory (local mode).
shutil.rmtree("tempFile2", ignore_errors=True)
# coalesce(1) merges the data into one partition so a single part file is written.
RDD.coalesce(1).saveAsTextFile("tempFile2")

### Action Operation on pair RDDs
* RDD actions available for PySpark pairRDDs
* PairRDD actions leverage the key-value data
* Few examples of pairRDD actions include
  * countByKey()
  * collectAsMap()

#### countByKey() action
* countByKey() is only available for RDDs of type (K, V)
* countByKey() action counts the number of elements for each key
* countByKey --> returns a dictionary
* Example of countByKey() on a simple list

In [None]:
# Count how many elements each key has and print one "key count" line per key.
rdd = sc.parallelize([("a", 5), ("b", 1), ("a", 1)])
key_counts = rdd.countByKey()
for key in key_counts:
    print(key, key_counts[key])

a 2
b 1


#### collectAsMap() action
* collectAsMap() returns the key-value pairs in the RDD as a dictionary
* Example of collectAsMap() on a simple tuple

In [None]:
# Action: return the pair RDD's contents as a dictionary mapping key to value
sc.parallelize([(1, 2), (3, 4)]).collectAsMap()

{1: 2, 3: 4}

## Your Turn:
* Use the countByKey() action on the Rdd to count the unique keys and assign the result to a variable total.
* What is the type of total?
* Iterate over the total and print the keys and their counts.

In [None]:
# Pair RDD for the countByKey() exercise. It is bound to `Rdd` because that is
# the name the exercise text and the cells that follow actually use; binding
# only `RDD` left those cells depending on an `Rdd` defined much earlier.
Rdd = sc.parallelize([(1, 2), (3, 4), (3, 6), (4, 5)])
# Keep the name this cell originally bound, in case another cell relies on it.
RDD = Rdd

In [None]:
# Exercise template: replace each ____ placeholder before running this cell.
# Count the number of elements per key in Rdd with countByKey()
total = Rdd.____()

# What is the type of total?
print("The type of total is", type(____))

# Iterate over the total and print the output
for k, v in total.____(): 
  print("key", ____, "has", ____, "counts")

In [None]:
# Count the number of elements per key in Rdd
total = Rdd.countByKey()

# Show what kind of object countByKey() returned
print("The type of total is", type(total))

# Print each key together with its count
for key in total:
    print("key", key, "has", total[key], "counts")

### William Shakespeare
The volume of unstructured data (log lines, images, binary files) in existence is growing dramatically, and PySpark is an excellent framework for analyzing this type of data through RDDs. 

In this 3 part exercise, you will write code that calculates the most common words from Complete Works of William Shakespeare.

Here are the brief steps for writing the word counting program:

* Create a base RDD from Complete_Shakespeare.txt file.
* Use RDD transformation to create a long list of words from each element of the base RDD.
* Remove stop words from your data.
* Create pair RDD where each element is a pair tuple of ('w', 1)
* Group the elements of the pair RDD by key (word) and add up their values.
* Swap the keys (words) and values (counts) so that the key is the count and the value is the word.
* Finally, sort the RDD by descending order and print the 10 most frequent words and their frequencies.

In this first exercise, you'll create a base RDD from Complete_Shakespeare.txt file and transform it to create a long list of words.

In [None]:
# Path of the input text, relative to the working directory
file_path = 'Complete_Shakespeare.txt'

* Create an RDD called baseRDD that reads lines from file_path.
* Transform the baseRDD into a long list of words and create a new splitRDD.
* Count the total words in splitRDD.

In [None]:
# Exercise template: replace each ____ placeholder before running this cell.
# Create a baseRDD from the file path
baseRDD = ____(file_path)

# Split the lines of baseRDD into words
splitRDD = baseRDD.____(lambda x: x.____())

# Count the total number of words
print("Total number of words in splitRDD:", splitRDD.____())

In [None]:
# Read the text file into an RDD of lines
baseRDD = sc.textFile(file_path)

# Break every line on whitespace and flatten the pieces into one RDD of words
splitRDD = baseRDD.flatMap(lambda text_line: text_line.split())

# Report how many words the text contains
print("Total number of words in splitRDD:", splitRDD.count())

### Remove stop words (reduce the dataset)
Stop words are common words that are often uninteresting. For example "I", "the", "a" etc., are stop words.

In [None]:
# Lower-case English stop words; the filter cells below lower-case each word
# before testing membership in this list.
stop_words  = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 'don', 'should', 'now']

* Convert the words in splitRDD to lower case and then remove the stop words listed in stop_words.
* Create a pair RDD tuple containing the word and the number 1 from each word element in splitRDD.
* Get the count of the number of occurrences of each word (word frequency) in the pair RDD using reduceByKey()

In [None]:
# Exercise template: replace each ____ placeholder before running this cell.
# Remove stop words: each word is lower-cased only for the comparison, so the
# words that remain keep their original case
splitRDD_no_stop = splitRDD.____(lambda x: x.lower() not in ____)

# Create a tuple of the word and 1
splitRDD_no_stop_words = splitRDD_no_stop.map(lambda w: (____, ____))

# Count of the number of occurrences of each word
resultRDD = splitRDD_no_stop_words.____(lambda x, y: x + y)

In [None]:
# Drop every word whose lower-cased form is a stop word. The lower-casing is
# used only for the comparison: the words that remain keep their original case.
splitRDD_no_stop = splitRDD.filter(lambda token: token.lower() not in stop_words)

# Pair every remaining word with the count 1
splitRDD_no_stop_words = splitRDD_no_stop.map(lambda token: (token, 1))

# Add up the 1s per word to get each word's number of occurrences
resultRDD = splitRDD_no_stop_words.reduceByKey(lambda count_a, count_b: count_a + count_b)

### Print word frequencies

* Print the first 10 words and their frequencies from the resultRDD.
* Swap the keys and values in the resultRDD.
* Sort the keys according to descending order.
* Print the top 10 most frequent words and their frequencies.

In [None]:
# Exercise template: replace each ____ placeholder before running this cell.
# Display the first 10 words and their frequencies
for word in resultRDD.____(10):
	print(word)

# Swap the keys and values so that the count becomes the key
resultRDD_swap = resultRDD.____(lambda x: (x[____], x[____]))

# Sort the keys in descending order
resultRDD_swap_sort = resultRDD_swap.____(ascending=False)

# Show the top 10 most frequent words and their frequencies
for word in resultRDD_swap_sort.____(____):
	print("{} has {} counts". format(____, word[0]))

In [None]:
# Peek at 10 (word, count) pairs from the result
for word_count in resultRDD.take(10):
    print(word_count)

# Turn (word, count) into (count, word) so the count becomes the key
resultRDD_swap = resultRDD.map(lambda word_count: (word_count[1], word_count[0]))

# Order by count, largest first
resultRDD_swap_sort = resultRDD_swap.sortByKey(ascending=False)

# Print the 10 most frequent words with their counts
for frequency, term in resultRDD_swap_sort.take(10):
    print("{} has {} counts".format(term, frequency))