In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col

# Create (or reuse) the Spark session for this notebook. The spark-avro
# package is requested through spark.jars.packages because a later cell
# reads and writes the "avro" format.
# NOTE(review): the artifact coordinates (Scala 2.12 / 3.5.0) presumably have
# to match the installed Spark build — confirm before running elsewhere.
spark = (
    SparkSession.builder
    .appName("data formats")
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.5.0")
    .getOrCreate()
)

In [4]:
# Column names for the sample orders, in the same order as the tuple fields.
columns = ["order_id", "city", "product", "price", "order_date"]

# Five hand-written order rows; order_date is supplied as a plain string.
data = [
    ("ORD001", "Delhi", "Laptop", 45000, "2024-01-05"),
    ("ORD002", "Mumbai", "Mobile", 32000, "2024-01-06"),
    ("ORD003", "Bangalore", "Tablet", 30000, "2024-01-07"),
    ("ORD004", "Delhi", "Laptop", 55000, "2024-01-08"),
    ("ORD005", "Mumbai", "Tablet", 34000, "2024-01-09"),
]

# Only column names are given, so Spark infers each column's type from the data.
df = spark.createDataFrame(data, schema=columns)
df.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
+--------+---------+-------+-----+----------+



In [5]:
# Persist the orders as Parquet; "overwrite" replaces any previous output
# at this path so the cell can be re-run safely.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .save("data/parquet/orders")
)

In [6]:
# Read the Parquet output back and display it to confirm the round trip.
df_parquet = spark.read.format("parquet").load("data/parquet/orders")
df_parquet.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
+--------+---------+-------+-----+----------+




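Parquet is a columnar format: it excels at analytical queries that read a few columns across many rows, but it is a poor fit for row-at-a-time writes or frequent record-level updates.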

ORC is also a columnar format; it is useful for large historical datasets and fast aggregations, helped by its built-in indexes and column statistics.

In [7]:
# Persist the same orders as ORC; "overwrite" replaces any previous output
# at this path so the cell can be re-run safely.
(
    df.write
    .format("orc")
    .mode("overwrite")
    .save("data/orc/orders")
)

In [8]:
# Read the ORC output back and display it to confirm the round trip.
df_orc = spark.read.format("orc").load("data/orc/orders")
df_orc.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
+--------+---------+-------+-----+----------+



In [9]:
# Write the orders as Avro. The "avro" format comes from the external
# spark-avro package, so it has to be on the session's classpath.
(
    df.write
    .format("avro")
    .mode("overwrite")
    .save("data/avro/orders")
)

# Read the Avro output back and display it to confirm the round trip.
df_avro = (
    spark.read
    .format("avro")
    .load("data/avro/orders")
)
df_avro.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
+--------+---------+-------+-----+----------+

