In [1]:
# Sample orders used throughout the notebook.
# Each tuple is positional: (order id, city, product, price, order date string).
data = [
    ("ORD001", "Delhi",     "Laptop", 45000, "2024-01-05"),
    ("ORD002", "Mumbai",    "Mobile", 32000, "2024-01-06"),
    ("ORD003", "Bangalore", "Tablet", 30000, "2024-01-07"),
    ("ORD004", "Delhi",     "Laptop", 55000, "2024-01-08"),
    ("ORD005", "Mumbai",    "Tablet", 34000, "2024-01-09"),
]

In [2]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .getOrCreate()
)

# Parquet is a columnar data format used when
## 1. The dataset is large
## 2. Column-wise compression is required during query processing

In [4]:
# Column names, in the same positional order as the tuples in `data`.
# NOTE(review): "order_data" looks like a typo for "order_date" — confirm before
# renaming, because the name ends up in every file written from this DataFrame.
columns = ["order_id", "city", "product", "price", "order_data"]

# Only names are supplied, so Spark infers each column's type from the Python values.
df = spark.createDataFrame(data=data, schema=columns)
df.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_data|
+--------+---------+-------+-----+----------+
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
+--------+---------+-------+-----+----------+



In [5]:
# Write the orders as Parquet; "overwrite" replaces any output already at this path.
df.write.parquet("data/parquet/orders", mode="overwrite")

In [6]:
# Read the Parquet output back to verify the round trip.
# The rows may come back in a different order than they were written;
# add an explicit orderBy if the order matters.
df_parquet = spark.read.format("parquet").load("data/parquet/orders")
df_parquet.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_data|
+--------+---------+-------+-----+----------+
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
+--------+---------+-------+-----+----------+



# ORC
## Used: when the data requires very high compression
## Example: historical data from telecom, insurance, banking, etc. that runs into petabytes

In [7]:
# Write the same orders as ORC; "overwrite" replaces any output already at this path.
df.write.orc("data/orc/orders", mode="overwrite")

In [8]:
# Read the ORC output back to verify the round trip.
# As with any multi-file read, the row order is not guaranteed to match the input.
df_orc = spark.read.format("orc").load("data/orc/orders")
df_orc.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_data|
+--------+---------+-------+-----+----------+
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
+--------+---------+-------+-----+----------+



# AVRO

In [9]:
!pip uninstall -y pyspark
!pip install pyspark==3.5.1

Found existing installation: pyspark 4.0.1
Uninstalling pyspark-4.0.1:
  Successfully uninstalled pyspark-4.0.1
Collecting pyspark==3.5.1
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.0/317.0 MB 2.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting py4j==0.10.9.7 (from pyspark==3.5.1)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 200.5/200.5 kB 16.4 MB/s eta 0:00:00
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=f8b2dc57e84e7d774abcfb968a06210b6c05dea9207781dfb1b913fb7abcffc5
  Stored in directory: /root/.cache/pip/wheels/b1/91/5f/283b53010a801

In [1]:
# Re-declared after the kernel restart: the same sample orders as at the top of the notebook.
# Tuple layout: (order id, city, product, price, order date string).
data = [
    ("ORD001", "Delhi",     "Laptop", 45000, "2024-01-05"),
    ("ORD002", "Mumbai",    "Mobile", 32000, "2024-01-06"),
    ("ORD003", "Bangalore", "Tablet", 30000, "2024-01-07"),
    ("ORD004", "Delhi",     "Laptop", 55000, "2024-01-08"),
    ("ORD005", "Mumbai",    "Tablet", 34000, "2024-01-09"),
]

# Column names, positional with the tuples above.
# NOTE(review): "order_data" looks like a typo for "order_date" — confirm before renaming.
columns = ["order_id", "city", "product", "price", "order_data"]

In [2]:
from pyspark.sql import SparkSession
# Build a session that pulls in the external spark-avro package.
# The artifact version (3.5.1) matches the PySpark version pinned by the pip cell.
# NOTE(review): presumably this must run on a fresh kernel, since getOrCreate()
# would otherwise hand back an existing session — confirm.
spark = (
    SparkSession.builder
    .appName("AvroStable")
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.5.1")
    .getOrCreate()
)

In [3]:
# Build the orders DataFrame; column types are inferred from the Python values.
df = spark.createDataFrame(data=data, schema=columns)

In [4]:
# Print the DataFrame as a text table to confirm the rows before writing Avro.
df.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_data|
+--------+---------+-------+-----+----------+
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
+--------+---------+-------+-----+----------+



In [5]:
# Write the orders as Avro; "overwrite" replaces any output already at this path.
# NOTE(review): this is an absolute path, unlike the relative "data/..." paths used
# for Parquet and ORC — confirm whether that is intentional.
df.write.save("/content/avro_out", format="avro", mode="overwrite")