## Cache
`cache()` stores the DataFrame's data across the worker nodes. It is a lazy operation, so the data is only cached once an action is called.

## Persist
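`persist()` is in essence the same as `cache()`, but it accepts a storage-level parameter that controls where the data is kept (for example memory and disk, or disk only).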


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import mean, sum, max, min, percentile, countDistinct, col

In [0]:
# Location of the diamonds sample CSV
path = '/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv'

# Read the file, taking the column names from the header row
df = spark.read.csv(path, header=True)

# Keep the ten columns of interest in their original order:
# the measurements are cast to float, the three categorical columns pass through as-is
df = df.select(
    F.col('carat').cast('float'),
    'cut', 'color', 'clarity',
    *[F.col(name).cast('float')
      for name in ('depth', 'table', 'price', 'x', 'y', 'z')],
)

In [0]:
# Importing Struct Types
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType

# Rows of (cut, product, store_id) used as a small lookup table keyed by cut
my_data1 = [
    ("Premium", "PPDQ1", "A"),
    ("Very Good", "PPDQ2", "A"),
    ("Ideal", "PPDQ1", "B"),
    ("Good", "PPDQ6", "E"),
    ("Fair", "PPDQ3", "D"),
]

# All three columns are nullable strings
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("cut", "product", "store_id")
])

stores = spark.createDataFrame(data=my_data1, schema=schema)

In [0]:
# Left join on 'cut': every row of df is kept and gets the store columns attached
df2 = df.join(stores, on='cut', how='left')

# Action that runs the join (no caching involved here)
df2.count()

In [0]:
# Render the joined DataFrame as the cell output (notebook display helper)
df2.display()

In [0]:
# Summarise price per (store_id, cut): total, average, and the count of 'cut' values
df3 = df2.groupBy('store_id', 'cut').agg(
    F.sum('price').alias('total'),
    F.mean('price').alias('avg_price'),
    F.count('cut').alias('ct'),
)

# Action that triggers the aggregation
df3.count()

In [0]:
# Left join of df with stores on 'cut', this time marked for caching
df2 = df.join(stores, on='cut', how='left').cache()

# cache() is lazy, so this count is the action that actually populates the cache
df2.count()

In [0]:
# Group the cached join result by store and cut ...
by_store_and_cut = df2.groupBy('store_id', 'cut')

# ... and compute total price, average price and the count of 'cut' values per group
df3 = by_store_and_cut.agg(
    F.sum('price').alias('total'),
    F.mean('price').alias('avg_price'),
    F.count('cut').alias('ct'),
)

# Action that triggers the aggregation
df3.count()

In [0]:
# Import the one name that is used instead of a wildcard import,
# so nothing else is silently pulled into the notebook namespace
from pyspark.storagelevel import StorageLevel

# Persist the aggregated data in memory and on disk; unlike cache(),
# persist() lets the storage level be chosen explicitly.
# Another option would be StorageLevel.DISK_ONLY.
df3.persist(StorageLevel.MEMORY_AND_DISK_2)

# persist() is lazy as well: this action is what materialises the data
df3.count()

In [0]:
%sql
-- Empty the Spark cache so the following cells start from an uncached state
clear cache

In [0]:
# Explicitly release the storage that df3.persist(...) requested
df3.unpersist()

In [0]:
# Load the people Delta table
people = spark.read.format('delta').load('/databricks-datasets/learning-spark-v2/people/people-10m.delta')

# Append the table to itself five times (six copies in total) to get a larger dataset
df = people
for copy_no in range(5):
    df = df.union(people)

print(f' Total Rows {df.count():,}')

 Total Rows 60,000,000


In [0]:
# Average salary for each (gender, birth month) pair.
# No action is called in this cell, so nothing is computed yet.
birth_month = F.month('birthDate').alias('birthMonth')
df2 = df.groupBy('gender', birth_month).agg(F.mean('salary').alias('avg_sal'))

In [0]:
# Keep only the groups whose average salary is above 72,600
df3 = df2.where(F.col('avg_sal') > 72600)

# Action: runs the uncached aggregation plus the filter
df3.count()

21

In [0]:
# Average salary per (gender, birth month), marked for caching
birth_month = F.month('birthDate').alias('birthMonth')
df2 = (
    df.groupBy('gender', birth_month)
      .agg(F.mean('salary').alias('avg_sal'))
      .cache()
)

# cache() is lazy: this count is the action that computes and stores the result
df2.count()

24

In [0]:
# Filter the cached aggregate down to groups with an average salary above 72,600
high_avg_salary = df2['avg_sal'] > 72600
df3 = df2.filter(high_avg_salary)

# Action: only the filter has to run on top of the cached df2
df3.count()

21

In [0]:
# Relative reduction when going from 45 to 8, i.e. (before - after) / before
# NOTE(review): presumably the run times in seconds without vs. with cache() — confirm
(45-8)/45

0.8222222222222222