## Create a conda env with the same Python version (3.13) as the local installation, to avoid mismatched Python versions between the driver and the executors

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions

### Create SparkSession

In [None]:
# Build (or reuse) the session used by every cell below. The spark-avro
# package is pulled in through spark.jars.packages so the "avro" format
# is available to the reader/writer.
# NOTE(review): the artifact's Scala/Spark suffix (2.12 / 3.5.0) presumably
# has to match the installed Spark build — confirm against the local install.
spark = (
    SparkSession.builder
    .appName("Sample serialization formats")
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.5.0")
    .getOrCreate()
)


### Init data

In [3]:
# Three (name, age) sample rows shared by all the format examples below.
data = [
    ("Alice", 25),
    ("John", 15),
    ("Eva", 35),
]
columns = ["name", "age"]

# Only column names are supplied as the schema; no column types are given.
df = spark.createDataFrame(data, schema=columns)

### Parquet

In [None]:
# Write the sample DataFrame as Parquet, replacing any previous output.
df.write.mode("overwrite").parquet("sample.parquet")

# Read it back and print every row without truncating cell values.
parquet_df = spark.read.parquet("./sample.parquet")
parquet_df.show(truncate=False)

### ORC

In [None]:
# Write the sample DataFrame as ORC, replacing any previous output.
df.write.orc("sample.orc", mode="overwrite")

# Read it back and print every row without truncating cell values.
orc_df = spark.read.orc("sample.orc")
orc_df.show(truncate=False)

### Avro

In [None]:
# Write the sample DataFrame as Avro, replacing any previous output.
# The "avro" format comes from the spark-avro package configured on the session.
df.write.save("sample.avro", format="avro", mode="overwrite")

# Read it back and print every row without truncating cell values.
avro_df = spark.read.load("sample.avro", format="avro")
avro_df.show(truncate=False)