In [2]:
# findspark.init() makes the local Spark installation importable, so it has to
# run before the pyspark imports below.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

# Reuse an already running SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# SparkContext: entry point for the low-level (RDD) API used in this notebook.
sc = spark.sparkContext

In [3]:
# Input data locations; both are relative paths, resolved against the
# notebook's working directory.
retailAll = '../data/retail-data/all/'
flightData2010 = '../data/flight-data/parquet/2010-summary.parquet'

### Resilient Distributed Datasets (RDDs)

-  **Spark operates on a per-partition basis when executing code**
-  Basically all DF Spark code compiles down to an RDD
-  When calling a DF transformation, the underlying logic becomes a set of RDD transformations
-  SparkContext is the entry point for low-level API functionality accessed through SparkSession
-  Main reason to use RDDs is for fine grained control over physical distribution of data (**custom partitioning of data**)
-  RDD performance is best via Scala/Java
<br>
-  RDDs:
    -  Caching:
        -  ability to cache or persist an RDD
        -  ability to specify a storage level [org.apache.spark.storage.StorageLevel] (combinations of memory only, disk only, and off heap)
    -  Checkpointing:
        -  saves RDD to disk so future computations on that RDD point to its partitions on disk rather than recomputing the RDD from the original source
        -  similar to caching, except a checkpoint is stored only on disk, whereas a cache can be kept in memory
        -  when checkpointed RDD is referenced it derives from checkpoint instead of source data, which helps improve performance and optimization
        <br>
-  Shared Variables:
    -  broadcast variables
    -  accumulators

### _RDD to DF Example_

In [3]:
# `.rdd` exposes the RDD of Row objects that backs a DataFrame
rdd1 = spark.range(0, 3).rdd
rdd1.collect()

[Row(id=0), Row(id=1), Row(id=2)]

In [4]:
# collect() brings the rows back to the driver; print them one per line
for i in spark.range(0, 3).rdd.collect():
    print(i)

Row(id=0)
Row(id=1)
Row(id=2)


In [5]:
# round trip: DataFrame -> RDD of Rows -> DataFrame
spark.range(0, 3).rdd.toDF().show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
+---+



### _Local Collection to RDD Example_

In [6]:
# Local Python list of words (split on single spaces, punctuation kept)
myCollection = (
    "Training on Python, Spark, Scala and on ML"
    .split(" ")
)


In [7]:
myCollection

['Training', 'on', 'Python,', 'Spark,', 'Scala', 'and', 'on', 'ML']

In [8]:
# distribute the local list as an RDD spread over 2 partitions
words = sc.parallelize(myCollection, numSlices=2)

In [9]:
words.getNumPartitions()

2

In [10]:
words.setName("myWords") # names this RDD (not the application); the name is what the Spark UI shows for it
for i in words.collect(): print(i)

Training
on
Python,
Spark,
Scala
and
on
ML


### _RDD Data Source Read Example_

### RDD Transformation Examples:

In [11]:
# distinct
print(words.distinct().count())

7


In [12]:
print(words.collect())
print(words.distinct().collect())

['Training', 'on', 'Python,', 'Spark,', 'Scala', 'and', 'on', 'ML']
['Spark,', 'Training', 'on', 'Python,', 'Scala', 'and', 'ML']


In [13]:
# filter
def startsWithS(individual):
    """Return True when the string `individual` begins with an uppercase "S"."""
    prefix = "S"
    return individual.startswith(prefix)

# keep only the words for which startsWithS returns True
print(words.filter(startsWithS).collect())


['Spark,', 'Scala']


In [14]:
# map: turn every word into a (word, first letter, starts-with-"S" flag) tuple
words2 = words.map(lambda w: (w, w[0], w.startswith("S")))

In [15]:
words2.collect()

[('Training', 'T', False),
 ('on', 'o', False),
 ('Python,', 'P', False),
 ('Spark,', 'S', True),
 ('Scala', 'S', True),
 ('and', 'a', False),
 ('on', 'o', False),
 ('ML', 'M', False)]

In [16]:
# keep the tuples whose third field (the starts-with-"S" flag) is truthy
print(words2.filter(lambda triple: triple[2]).collect())

[('Spark,', 'S', True), ('Scala', 'S', True)]


In [17]:
# sort: longest words first (the negated length is the ascending sort key)
print(words.sortBy(lambda w: -len(w)).collect())

['Training', 'Python,', 'Spark,', 'Scala', 'and', 'on', 'on', 'ML']


### RDD Actions Examples

In [18]:
# reduce: fold the numbers 1..20 into a single value by pairwise addition
print(sc.parallelize(range(1, 21)).reduce(lambda left, right: left + right))

210


In [19]:
words.collect()

['Training', 'on', 'Python,', 'Spark,', 'Scala', 'and', 'on', 'ML']

In [20]:
# countByValue
print(words.countByValue())

defaultdict(<class 'int'>, {'Training': 1, 'on': 2, 'Python,': 1, 'Spark,': 1, 'Scala': 1, 'and': 1, 'ML': 1})


In [21]:
# first
print(words.first())

Training


In [22]:
# take
print(words.take(3)) # first 3 elements, in the RDD's existing order
print(words.takeOrdered(3)) # 3 smallest elements in ascending order (uppercase sorts before lowercase)

['Training', 'on', 'Python,']
['ML', 'Python,', 'Scala']


### _RDD TXT Save (Uncompressed & Compressed) Example_

In [23]:
import shutil

# saveAsTextFile refuses to write into an existing directory, so remove any
# output left by a previous run. ignore_errors covers the first run, when the
# directory does not exist yet.
shutil.rmtree("words_atin", ignore_errors=True)

# writes one part-NNNNN text file per partition
words.saveAsTextFile("words_atin")

In [24]:
import os

# show what was written into the output directory
for i in os.listdir("words_atin"):
    print(i)

.part-00000.crc
.part-00001.crc
._SUCCESS.crc
part-00000
part-00001
_SUCCESS


In [25]:
import shutil

# saveAsTextFile refuses to write into an existing directory, so remove any
# output left by a previous run. ignore_errors covers the first run, when the
# directory does not exist yet.
shutil.rmtree("wordsCompressed_atin", ignore_errors=True)

# write the RDD as text with every part file gzip-compressed (part-NNNNN.gz)
words.saveAsTextFile(
    "wordsCompressed_atin",
    compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
)

In [26]:
import os

# show what was written into the compressed output directory
for i in os.listdir("wordsCompressed_atin/"):
    print(i)

.part-00000.gz.crc
.part-00001.gz.crc
._SUCCESS.crc
part-00000.gz
part-00001.gz
_SUCCESS


### _RDD Caching Example_

In [27]:
words.cache()

myWords ParallelCollectionRDD[26] at readRDDFromFile at PythonRDD.scala:274

### _RDD Checkpointing Example_

In [28]:
# directory under which Spark writes checkpoint files
spark.sparkContext.setCheckpointDir("checkpointRDD")

# checkpoint() only marks the RDD; the data is written when the next action
# runs on it. Without an action here the checkpoint would never be produced,
# because `words` is rebound to a new RDD further down.
words.checkpoint()
words.count()

# True once the checkpoint has actually been written
words.isCheckpointed()

## Distributed Shared Variables

-  Broadcast Variables:
    -  saves large value on all worker nodes without re-sending to cluster every time (ex: lookup table as function that fits in memory on each executor)
    -  avoids deserialization per task on the worker nodes every time variable is used
    -  shared immutable variables that are cached on every machine in cluster instead of serialized with every single task
    -  the cost of serializing data for every task can be quite expensive thus broadcast variables are a good alternative   
<br>
-  Accumulators:
    -  adds data together from all tasks into a shared result (ex: error logging counter and debugging)
    -  mutable variable that updates value via transformations and sends value to driver node in an efficient manner


In [4]:
# Local list of words used by the shared-variable examples
my_collection = (
    "Corporate Training for - Spark, Scala, Python"
    .split(" ")
)
# distribute the list as an RDD spread over 2 partitions
words = sc.parallelize(my_collection, numSlices=2)

### _Broadcast Example_:

In [6]:
# word -> value lookup table that gets broadcast to the executors
supplementalData = {
    "ATT": 1000,
    "Corporate": 200,
    "Training": -300,
    "days": 100,
}

In [9]:
type(supplementalData)

dict

In [10]:
# wrap the dict in a broadcast variable; tasks read it through `.value`
suppBroadcast = sc.broadcast(supplementalData)

In [12]:
suppBroadcast.value

{'ATT': 1000, 'Corporate': 200, 'Training': -300, 'days': 100}

In [14]:
type(suppBroadcast)

pyspark.broadcast.Broadcast

In [15]:
words.collect()

['Corporate', 'Training', 'for', '-', 'Spark,', 'Scala,', 'Python']

In [17]:
# Pair each word with its value from the broadcast dict; words without an entry get 0.
# `.value` is read inside the lambda so the closure carries the broadcast handle, not the dict itself.
words.map(lambda word: (word, suppBroadcast.value.get(word, 0))).collect()

[('Corporate', 200),
 ('Training', -300),
 ('for', 0),
 ('-', 0),
 ('Spark,', 0),
 ('Scala,', 0),
 ('Python', 0)]

In [18]:
# NOTE(review): this cell is identical to the previous one and returns the same pairs — likely an accidental duplicate.
words.map(lambda word: (word, suppBroadcast.value.get(word, 0))).collect()

[('Corporate', 200),
 ('Training', -300),
 ('for', 0),
 ('-', 0),
 ('Spark,', 0),
 ('Scala,', 0),
 ('Python', 0)]

### _Accumulator Example_:

In [34]:
print(flightData2010)

../data/flight-data/parquet/2010-summary.parquet


In [35]:
# Load the 2010 flight summary; the accumulator example below reads from `flights`.
# NOTE(review): the recorded run of this cell failed with "Path does not exist" —
# confirm the parquet file is present at flightData2010 (a relative path, resolved
# against the working directory) before running the cells that follow.
flights = spark.read.parquet(flightData2010)
flights.show()

AnalysisException: Path does not exist: file:/D:/22-Trngs/2-Confirmed/5-PySpark-56-hours-Geet/GH/Labs/data/flight-data/parquet/2010-summary.parquet

In [None]:
# Accumulator (initial value 0) that will hold the number of flights to or from China
accChina = spark.sparkContext.accumulator(0)

def accChinaFunc(flight_row, acc=None):
    """Add a row's flight count to the accumulator when the route touches China.

    Args:
        flight_row: row (or mapping) with "DEST_COUNTRY_NAME",
            "ORIGIN_COUNTRY_NAME" and "count" fields.
        acc: object with an ``add`` method that receives the count. Defaults
            to the notebook-level ``accChina`` accumulator.
    """
    if acc is None:
        acc = accChina
    destination = flight_row["DEST_COUNTRY_NAME"]
    origin = flight_row["ORIGIN_COUNTRY_NAME"]
    # One combined check so a row is counted once even when China is both
    # origin and destination; two independent ifs would add its count twice.
    if destination == "China" or origin == "China":
        acc.add(flight_row["count"])

# foreach is an action: it calls accChinaFunc once for every row of the flights DataFrame
flights.foreach(accChinaFunc)

In [None]:
accChina.value

In [None]:
# Local list of words for the key-value examples
myCollection = "Python Spark PySpark DataBricks Advanced Training".split(" ")
# distribute the list as an RDD spread over 2 partitions
words = sc.parallelize(myCollection, numSlices=2)

In [None]:
# turn every word into a (lowercased word, 1) pair
words.map(lambda w: (w.lower(), 1)).collect()

In [None]:
# keyBy: build (key, word) pairs where the key is the first letter of the lowercased word
keyword = words.keyBy(lambda w: w.lower()[0])
keyword.collect()

In [None]:
# mapValues: upper-case the value of every pair; the keys are passed through unchanged
keyword.mapValues(lambda w: w.upper()).collect()