# https://www.youtube.com/watch?v=BUWKbCcqgmo&list=PL2IsFZBGM_IHCl9zhRVC1EXTomkEp_1zm&index=13

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [2]:
# Set the Spark/PySpark environment variables for this process in one update.
os.environ.update({
    'SPARK_HOME': r'C:\spark\spark-3.5.4-bin-hadoop3',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [3]:
# Session settings, applied to the builder in the order listed.
# NOTE(review): with master "local[*]" the executor and dynamic-allocation
# settings are presumably not applied by Spark -- confirm against the Spark
# configuration docs before relying on them.
_spark_settings = (
    ("spark.executor.memory", "16g"),
    ("spark.driver.memory", "16g"),
    ("spark.executor.cores", "4"),
    ("spark.sql.shuffle.partitions", "80"),
    ("spark.dynamicAllocation.enabled", "true"),
    ("spark.dynamicAllocation.minExecutors", "2"),
    ("spark.dynamicAllocation.initialExecutors", "24"),
    ("spark.dynamicAllocation.maxExecutors", "50"),
    ("spark.shuffle.service.enabled", "true"),
    ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
)

_builder = SparkSession.builder.appName("PySpark Zero to Hero").master("local[*]")
for _key, _value in _spark_settings:
    _builder = _builder.config(_key, _value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = _builder.getOrCreate()

In [4]:
# Path of the Parquet sales dataset that the cells below load.
# Hard-coded absolute Windows path (raw string so the backslashes are literal);
# change it to match the local copy of the dataset.
df_parquet_path = r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero\datasets\sales_data.parquet'

In [5]:
# Load the Parquet dataset into a DataFrame via the format-specific reader shorthand.
df_parquet = spark.read.parquet(df_parquet_path)

In [6]:
# Print the first 20 rows, truncating long cell values (the defaults, spelled out).
df_parquet.show(n=20, truncate=True)

+-------------------+----------+-----------+--------------------+-------+----------+
|      transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+-------------------+----------+-----------+--------------------+-------+----------+
|2017-11-24 21:00:00|1995601912| 2077350195|Walgreen       11-25| 197.23| 216510442|
|2017-11-24 21:00:00|1734117021|  644879053|unkn    ppd id: 7...|   8.58| 930259917|
|2017-11-24 21:00:00|1734117022|  847200066|Wal-Mart  ppd id:...|1737.26|1646415505|
|2017-11-24 21:00:00|1734117030| 1953761884|Home Depot     pp...|  384.5| 287177635|
|2017-11-24 21:00:00|1734117089| 1898522855| Target        11-25|  66.33|1855530529|
|2017-11-24 21:00:00|1734117117|  997626433|Sears  ppd id: 85...| 298.87| 957346984|
|2017-11-24 21:00:00|1734117123| 1953761884|unkn   ppd id: 15...|  19.55|  45522086|
|2017-11-24 21:00:00|1734117152| 1429095612|Ikea     arc id: ...|   9.39|1268541279|
|2017-11-24 21:00:00|1734117153|  847200066|unkn        Kings...|

In [7]:
# Path of the ORC sales dataset that the cells below load.
# Hard-coded absolute Windows path (raw string so the backslashes are literal);
# change it to match the local copy of the dataset.
df_orc_path = r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero\datasets\sales_data.orc'

In [8]:
# Load the ORC dataset into a DataFrame via the format-specific reader shorthand.
df_orc = spark.read.orc(df_orc_path)

In [9]:
# Print the first 20 rows, truncating long cell values (the defaults, spelled out).
# NOTE(review): the recorded output below shows transacted_at two hours earlier
# than the Parquet output for the same trx_id values -- presumably a timezone
# difference in how the two files were written or are read; confirm.
df_orc.show(n=20, truncate=True)

+-------------------+----------+-----------+--------------------+-------+----------+
|      transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+-------------------+----------+-----------+--------------------+-------+----------+
|2017-11-24 19:00:00|1995601912| 2077350195|Walgreen       11-25| 197.23| 216510442|
|2017-11-24 19:00:00|1734117021|  644879053|unkn    ppd id: 7...|   8.58| 930259917|
|2017-11-24 19:00:00|1734117022|  847200066|Wal-Mart  ppd id:...|1737.26|1646415505|
|2017-11-24 19:00:00|1734117030| 1953761884|Home Depot     pp...|  384.5| 287177635|
|2017-11-24 19:00:00|1734117089| 1898522855| Target        11-25|  66.33|1855530529|
|2017-11-24 19:00:00|1734117117|  997626433|Sears  ppd id: 85...| 298.87| 957346984|
|2017-11-24 19:00:00|1734117123| 1953761884|unkn   ppd id: 15...|  19.55|  45522086|
|2017-11-24 19:00:00|1734117152| 1429095612|Ikea     arc id: ...|   9.39|1268541279|
|2017-11-24 19:00:00|1734117153|  847200066|unkn        Kings...|

In [10]:
import time

def get_time(func):
    """Run ``func`` once immediately, print how long it took, and return ``func``.

    Meant to be used as a decorator on a zero-argument function: the decorated
    function executes at definition time and one line of the form
    ``Time taken: <elapsed> ms`` is printed.

    Args:
        func: Callable taking no arguments. Its return value is discarded.

    Returns:
        ``func`` itself, so the decorated name remains a callable instead of
        being rebound to ``None``.
    """
    def inner_get_time() -> str:
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump if the system clock changes.
        start_time = time.perf_counter()
        func()
        end_time = time.perf_counter()
        return f"Time taken: {(end_time - start_time) * 1000} ms"
    print(inner_get_time())
    # Hand the function back so the decorated name is still usable afterwards.
    return func

In [11]:
@get_time
def x():
    """Read the Parquet dataset and count all of its rows."""
    spark.read.parquet(df_parquet_path).count()

Time taken: 801.027774810791 ms


In [12]:
@get_time
def x():
    """Read the Parquet dataset and count rows after selecting only ``trx_id``."""
    spark.read.parquet(df_parquet_path).select('trx_id').count()

Time taken: 314.6545886993408 ms


In [13]:
@get_time
def y():
    """Read the ORC dataset and count all of its rows."""
    spark.read.orc(df_orc_path).count()

Time taken: 191.78485870361328 ms


In [14]:
@get_time
def y():
    """Read the ORC dataset and count rows after selecting only ``trx_id``."""
    spark.read.orc(df_orc_path).select('trx_id').count()

Time taken: 173.0647087097168 ms
