In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Hive-enabled session on YARN that every later cell uses.
spark = (
    SparkSession.builder
    .appName("SparkCaching2")
    .config("spark.sql.warehouse.dir", "/user/itv012857/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [3]:
# Peek at the beginning of the raw CSV on HDFS (no header row, six comma-separated fields).
! hadoop fs -head /public/trendytech/datasets/hotel_data.csv

1,John Doe,2023-05-01,2023-05-05,Standard,400.0
2,Jane Smith,2023-05-02,2023-05-06,Deluxe,600.0
3,Mark Johnson,2023-05-03,2023-05-08,Standard,450.0
4,Sarah Wilson,2023-05-04,2023-05-07,Executive,750.0
5,Emily Brown,2023-05-06,2023-05-09,Deluxe,550.0
6,Michael Davis,2023-05-07,2023-05-10,Standard,400.0
7,Samantha Thompson,2023-05-08,2023-05-12,Deluxe,600.0
8,William Lee,2023-05-10,2023-05-13,Standard,450.0
9,Amanda Harris,2023-05-11,2023-05-16,Executive,750.0
10,David Rodriguez,2023-05-12,2023-05-15,Deluxe,550.0
11,Linda Wilson,2023-05-14,2023-05-18,Standard,400.0
12,Robert Johnson,2023-05-15,2023-05-20,Deluxe,600.0
13,Sophia Anderson,2023-05-16,2023-05-21,Standard,450.0
14,James Smith,2023-05-17,2023-05-23,Executive,750.0
15,Olivia Brown,2023-05-19,2023-05-24,Deluxe,550.0
16,Michael Davis,2023-05-20,2023-05-25,Standard,400.0
17,Emily Thompson,2023-05-21,2023-05-27,Deluxe,600.0
18,William Lee,2023-05-23,2023-05-28,Standard,450.0
19,Ava Harris,2023-05-24,2023-05-30,Executive,750.0
20,Dan

In [4]:
# Check the file's size on HDFS in human-readable units; it is only a few KB.
! hadoop fs -ls -h /public/trendytech/datasets/hotel_data.csv

-rw-r--r--   3 itv005857 supergroup      5.6 K 2023-06-05 02:31 /public/trendytech/datasets/hotel_data.csv


### A. Testing the impact of caching using a DataFrame

In [5]:
# DDL-style schema for the header-less bookings CSV, in file column order.
hotel_bookings_schema = (
    "booking_id int, guest_name string, "
    "checkin_date date, checkout_date date, "
    "room_type string, total_price double"
)

In [6]:
# Load the bookings CSV with the explicit schema instead of inferring one.
hotel_bookings = (
    spark.read
    .schema(hotel_bookings_schema)
    .csv("/public/trendytech/datasets/hotel_data.csv")
)

In [7]:
# Preview the parsed rows in tabular form to sanity-check the load.
hotel_bookings.show()

+----------+-----------------+------------+-------------+---------+-----------+
|booking_id|       guest_name|checkin_date|checkout_date|room_type|total_price|
+----------+-----------------+------------+-------------+---------+-----------+
|         1|         John Doe|  2023-05-01|   2023-05-05| Standard|      400.0|
|         2|       Jane Smith|  2023-05-02|   2023-05-06|   Deluxe|      600.0|
|         3|     Mark Johnson|  2023-05-03|   2023-05-08| Standard|      450.0|
|         4|     Sarah Wilson|  2023-05-04|   2023-05-07|Executive|      750.0|
|         5|      Emily Brown|  2023-05-06|   2023-05-09|   Deluxe|      550.0|
|         6|    Michael Davis|  2023-05-07|   2023-05-10| Standard|      400.0|
|         7|Samantha Thompson|  2023-05-08|   2023-05-12|   Deluxe|      600.0|
|         8|      William Lee|  2023-05-10|   2023-05-13| Standard|      450.0|
|         9|    Amanda Harris|  2023-05-11|   2023-05-16|Executive|      750.0|
|        10|  David Rodriguez|  2023-05-

In [8]:
# Confirm the column names and types match the schema string supplied at load time.
hotel_bookings.printSchema()

root
 |-- booking_id: integer (nullable = true)
 |-- guest_name: string (nullable = true)
 |-- checkin_date: date (nullable = true)
 |-- checkout_date: date (nullable = true)
 |-- room_type: string (nullable = true)
 |-- total_price: double (nullable = true)



#### 1. Total number of bookings

In [9]:
# Baseline: count the rows before cache() has been called on anything.
hotel_bookings.count()

# Took 1 sec (no caching yet)

107

In [10]:
# Request caching of the bookings DataFrame. Per the timing notes on the next
# cells, the data is actually cached by the first count() that runs after this.
hotel_bookings_cached = hotel_bookings.cache()

In [11]:
# First action after cache(): this run pays for filling the cache as well as counting.
hotel_bookings_cached.count()

# First time - took 0.2 secs to cache and count

107

In [12]:
# Same count again: the cache is already populated, so this is the warm measurement.
hotel_bookings_cached.count()

# Second time - took only 41 msecs to read from cache and count

107

#### 2. Average Price by room type

In [17]:
# Average total_price per room_type via the DataFrame API.
# NOTE(review): this queries `hotel_bookings`, not `hotel_bookings_cached`. cache()
# was already called on `hotel_bookings` in an earlier cell, so this timing is
# probably not an uncached baseline - confirm which case this cell should measure.
(
    hotel_bookings
    .groupBy("room_type")
    .avg("total_price")
    .show()
)

# Took 0.4 secs

+---------+-----------------+
|room_type| avg(total_price)|
+---------+-----------------+
|Executive|            750.0|
|   Deluxe|575.5813953488372|
| Standard|            425.0|
+---------+-----------------+



### B. Caching a Spark external table

In [25]:
# List only the databases whose name contains "12857" (the user's own database).
(
    spark.sql("show databases")
    .filter("namespace like '%12857%'")
)

namespace
itv012857_db


In [27]:
# Make itv012857_db the current database so the unqualified table name used
# in the following cells resolves there.
spark.sql("use itv012857_db")

In [29]:
# Register the existing CSV file as a table in the current database; LOCATION
# points the table at the data in place rather than copying it.
# IF NOT EXISTS keeps this cell re-runnable: without it, a second run (or a fresh
# kernel against the same metastore) fails because the table already exists.
spark.sql("""CREATE TABLE IF NOT EXISTS hotel_bookings(booking_id int, guest_name string, checkin_date date,
                checkout_date date, room_type string, total_price double)
                USING CSV
                LOCATION '/public/trendytech/datasets/hotel_data.csv'
                """)

In [30]:
# Preview the table's rows; the notebook renders the returned DataFrame directly.
spark.sql("""SELECT * FROM hotel_bookings""")

booking_id,guest_name,checkin_date,checkout_date,room_type,total_price
1,John Doe,2023-05-01,2023-05-05,Standard,400.0
2,Jane Smith,2023-05-02,2023-05-06,Deluxe,600.0
3,Mark Johnson,2023-05-03,2023-05-08,Standard,450.0
4,Sarah Wilson,2023-05-04,2023-05-07,Executive,750.0
5,Emily Brown,2023-05-06,2023-05-09,Deluxe,550.0
6,Michael Davis,2023-05-07,2023-05-10,Standard,400.0
7,Samantha Thompson,2023-05-08,2023-05-12,Deluxe,600.0
8,William Lee,2023-05-10,2023-05-13,Standard,450.0
9,Amanda Harris,2023-05-11,2023-05-16,Executive,750.0
10,David Rodriguez,2023-05-12,2023-05-15,Deluxe,550.0


In [31]:
# Row count through the SQL table, run before the CACHE TABLE cell.
spark.sql("""SELECT count(1) FROM hotel_bookings""")

# Took 74 ms (before CACHE TABLE)

count(1)
107


In [32]:
# Average price per room type through the SQL table, run before the CACHE TABLE cell.
spark.sql("""SELECT room_type, avg(total_price) 
                FROM hotel_bookings
                GROUP BY room_type
                """)

# Took 1.7 sec (before CACHE TABLE)

room_type,avg(total_price)
Executive,750.0
Deluxe,575.5813953488372
Standard,425.0


In [33]:
# Cache the table so the two queries above can be repeated and their timings compared.
# NOTE(review): Spark SQL documents CACHE TABLE as eager unless LAZY is given, so the
# time noted below presumably includes reading the data - confirm for this Spark version.
spark.sql("cache table hotel_bookings")
# Took 0.3 secs

In [34]:
# Same row count as before, now run after CACHE TABLE.
spark.sql("""SELECT count(1) FROM hotel_bookings""")
# Took 48 ms (after CACHE TABLE; was 74 ms before)

count(1)
107


In [35]:
# Same aggregation as before, now run after CACHE TABLE.
spark.sql("""SELECT room_type, avg(total_price) 
                FROM hotel_bookings
                GROUP BY room_type
                """)

# Took 1.1 secs (after CACHE TABLE; was 1.7 sec before)

room_type,avg(total_price)
Executive,750.0
Deluxe,575.5813953488372
Standard,425.0


In [36]:
# Release the cached table before the session is shut down. IF EXISTS keeps this
# cleanup cell from failing when the table is absent (e.g. on a partial re-run).
spark.sql("uncache table if exists hotel_bookings")

In [37]:
# End the Spark session created at the top of the notebook.
spark.stop()