In [None]:
# spark caching can be used to pull data sets into a cluster-wide in-memory cahche.
# This is very useful for accessing repeated data, such as querying a small "hot" dataset or when running an iterative algorithm
# there are two ways to persist RDDs in spark
#     cache()
#     persist()

# there are some advantages of RDD caching and persistance mechanism in spark
#     time efficient
#     cost efficient
#     lessen the execution time
    
# STORAGE TYPEs:
#     MEMORY_ONLY
#     MEMORY_AND_DISK
#     MEMORY_ONLY_SER
#     MEMOTY_AND_DISK_SER
#     DISK_ONLY
#     OFF_HEAP
    
    
    

In [None]:
# rdd unpersist

# spark automatically monitors cache usage on each node and drops our old data 
# partition in a least_recently-used(LRU) fashion

# If you would like to manually remove an RDD instead of waiting for it to fall out os the cache,
# use the RDD.unpersist() method


In [1]:
# import the spark context
from pyspark import SparkContext


In [2]:
# create SparkContext
sc=SparkContext.getOrCreate()
sc

22/10/31 09:39:16 WARN Utils: Your hostname, HP-G62 resolves to a loopback address: 127.0.1.1; using 192.168.18.113 instead (on interface enp3s0)
22/10/31 09:39:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/31 09:39:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [42]:
sales_rdd=sc.textFile("sales.csv")
sales_rdd.collect()

['Company,Person,Sales',
 'GOOG,Sam,200',
 'GOOG,Charlie,120',
 'GOOG,Frank,340',
 'MSFT,Tina,600',
 'MSFT,Amy,124',
 'MSFT,Vanessa,243',
 'FB,Carl,870',
 'FB,Sarah,350',
 'APPL,John,250',
 'APPL,Linda, 130',
 'APPL,Mike, 750',
 'APPL, Chris, 350']

In [43]:
type(sales_rdd)

pyspark.rdd.RDD

In [20]:
# getting total time taken for the count before cache enabled/
import timeit as t
start=t.default_timer()
sales_rdd.count()
sales_rdd.min()
sales_rdd.max()
sales_rdd.collect()
end=t.default_timer()
time_elapsed=(end-start)
time_elapsed

0.9322425360001034

In [21]:
sales_rdd.cache()

sales.csv MapPartitionsRDD[13] at textFile at NativeMethodAccessorImpl.java:0

In [22]:
# getting total time taken for the count after cache enabled/
import timeit as t
start=t.default_timer()
sales_rdd.count()
sales_rdd.min()
sales_rdd.max()
sales_rdd.collect()
end=t.default_timer()
time_elapsed=(end-start)
time_elapsed

0.9199051570003576

In [23]:
# unpersist the data from the momory
sales_rdd.unpersist()

sales.csv MapPartitionsRDD[13] at textFile at NativeMethodAccessorImpl.java:0

In [31]:
# getting total time taken for the count after unpersistance
import timeit as t
start=t.default_timer()
sales_rdd.count()
sales_rdd.min()
sales_rdd.max()
sales_rdd.collect()
end=t.default_timer()
time_elapsed=(end-start)
time_elapsed

0.5988162250000642

In [32]:
# loading pyspark for cache() and persist()
import pyspark

In [33]:
#caching the data in memory
sales_rdd.cache()

sales.csv MapPartitionsRDD[13] at textFile at NativeMethodAccessorImpl.java:0

In [35]:
# unpersisting the data from memory
sales_rdd.unpersist()

sales.csv MapPartitionsRDD[13] at textFile at NativeMethodAccessorImpl.java:0

In [36]:
# persisting teh data in disk
sales_rdd.persist(pyspark.StorageLevel.DISK_ONLY)


sales.csv MapPartitionsRDD[13] at textFile at NativeMethodAccessorImpl.java:0

In [37]:
# unpersisting the data from memory
sales_rdd.unpersist()

sales.csv MapPartitionsRDD[13] at textFile at NativeMethodAccessorImpl.java:0

In [45]:
sales_rdd.take(10)

['Company,Person,Sales',
 'GOOG,Sam,200',
 'GOOG,Charlie,120',
 'GOOG,Frank,340',
 'MSFT,Tina,600',
 'MSFT,Amy,124',
 'MSFT,Vanessa,243',
 'FB,Carl,870',
 'FB,Sarah,350',
 'APPL,John,250']