<a href="https://colab.research.google.com/github/anaferreira744/DE-DP-ADF/blob/main/spark/examples/06-write_partitioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Write
- .write
- .format (parquet, csv, json)
- options
- spark.sql.sources.partitionOverwriteMode dynamic

# Write Mode
- overwrite - The overwrite mode is used to overwrite the existing file, alternatively, you can use SaveMode.Overwrite
- append - To add the data to the existing file, alternatively, you can use SaveMode.Append
- ignore - Ignores write operation when the file already exists, alternatively, you can use SaveMode.Ignore.
- errorifexists or error - This is the default mode: if the target already exists, the write fails with an error. Alternatively, you can use SaveMode.ErrorIfExists.

# Partitioning
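Partitioning is the process of organizing data into multiple chunks based on some criteria (for example, the value of a column).
On disk, each partition is stored in its own sub-folder.
Partitioning can improve performance in Spark, because a query that filters on the partition column only has to read the matching sub-folders.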

# Setting up PySpark

In [None]:
# Install PySpark into the environment of the running kernel.
%pip install pyspark



In [1]:
from pyspark.sql import SparkSession

# Start a local Spark session for this notebook, or reuse the one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Spark Course')
    .getOrCreate()
)

# Preparing data

In [2]:
!pip install faker

Collecting faker
  Downloading Faker-33.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.0.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m1.3/1.9 MB[0m [31m39.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-33.0.0


In [3]:
from faker import Faker
from datetime import datetime

# Seed Faker's shared random generator so that re-running the notebook draws
# the same pseudo-random sequence instead of a brand-new sample every time.
Faker.seed(42)
fake = Faker()

# Build 50 fake user records; 'date' falls between 2024-05-01 and 2024-05-05
# and is later used to derive the partition column.
users = [
    {
        'date': fake.date_time_between_dates(datetime(2024, 5, 1), datetime(2024, 5, 5)),
        'name': fake.name(),
        'address': fake.address(),
        'email': fake.email(),
        'dob': fake.date_of_birth(),
        'phone': fake.phone_number()
    }
    for _ in range(50)
]

# Spark infers the schema from the list of dictionaries.
df = spark.createDataFrame(users)

# Show the first 10 rows without truncating long values.
df.show(10, False)


+-------------------------------------------------------+--------------------------+----------+---------------------------+----------------+---------------------+
|address                                                |date                      |dob       |email                      |name            |phone                |
+-------------------------------------------------------+--------------------------+----------+---------------------------+----------------+---------------------+
|4332 Miguel Junctions\nBrookstown, PA 36236            |2024-05-01 06:32:51.604388|1936-03-12|jennifercuevas@example.com |Charles Shepherd|598.984.9288x02240   |
|83050 Cervantes Station Apt. 929\nWest Stacey, CO 38055|2024-05-01 10:51:13.635554|1928-12-22|gmartinez@example.org      |Robin Richardson|(460)896-9529x1331   |
|779 Taylor Skyway\nLake Jason, NE 60009                |2024-05-01 17:37:11.144365|1995-08-07|brian12@example.net        |Heather Johnson |637.527.5684x809     |
|4774 Benjamin Fields\

# Writing as PARQUET



In [4]:
# Writing as PARQUET with no partitions

path = "/content/write_partitioning/parquet_no_partitions"

df.write.mode("overwrite").format("parquet").save(path)

!ls /content/write_partitioning/parquet_no_partitions

spark.read.format("parquet").load(path).count()

part-00000-6606d89c-7c4c-4ead-98d0-44a92881bce0-c000.snappy.parquet  _SUCCESS


50

In [5]:
# Writing as PARQUET with partitions
from pyspark.sql.functions import *

path = "/content/write_partitioning/parquet_with_partitions"

# Creating partition column
df = df.withColumn("date_part", date_format(col("date"), "yyyyMMdd"))

spark.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic") # enable dynamic partition overwrite - only overwrites partitions that are coming in the dataframe

(df#.where("date_part = '20240503'") #com dynamic e este filtro apenas é feito overwrite desta partição e não de todo
#o df. se tiver static faz overwrite de toda a df, por isso é necessário ter cuidado.
 .write
 .mode("overwrite")                                               # overwrites the entire path with the new data
 .partitionBy("date_part")                                        # partition the data by column - creates sub-folders for each partition
 .format("parquet")                                               # format of output
 .save(path))                                                     # path

!ls /content/write_partitioning/parquet_with_partitions

spark.read.format("parquet").load(path).count()

'date_part=20240501'  'date_part=20240502'  'date_part=20240503'  'date_part=20240504'


50

In [7]:
# Checking single partition
# Load one partition sub-folder directly instead of the whole dataset.
(spark
 .read
 .format("parquet")
 .load("/content/write_partitioning/parquet_with_partitions/date_part=20240502")
 .show())

+--------------------+--------------------+----------+--------------------+-----------------+--------------------+
|             address|                date|       dob|               email|             name|               phone|
+--------------------+--------------------+----------+--------------------+-----------------+--------------------+
|87668 Laura Field...|2024-05-02 06:49:...|1922-01-17|spencervirginia@e...|  Shawn Green PhD|   269-736-9834x2214|
|8681 Scott Vista ...|2024-05-02 21:27:...|1968-11-14|sandersmisty@exam...|     Dennis Smith|+1-716-407-0698x8...|
|5378 Mary Flat Ap...|2024-05-02 08:16:...|1922-11-11|fgarrison@example...|   Richard Hunter|   449-770-9227x3608|
|294 David Pike\nY...|2024-05-02 02:01:...|1975-07-18| kgraves@example.com|      Joseph Rios| +1-961-857-2971x066|
|Unit 8379 Box 811...|2024-05-02 09:15:...|1916-08-15| cthomas@example.net|    Paul Calderon|001-437-619-9595x...|
|6640 Alvarado Cro...|2024-05-02 16:47:...|2010-06-16|mrichards@example...|   Sa

# Writing as CSV

https://spark.apache.org/docs/3.5.1/sql-data-sources-csv.html

In [8]:
# Row count of the DataFrame that the following cells write out.
df.count()

50

In [9]:
path = "/content/write_partitioning/csv_no_partitioning/"

# write as csv
(df
  .write
  .format("csv")
  .mode("overwrite")
  .option("delimiter", "|")
  .option("header", True)
  .save(path))

# listing files in the folder
!ls /content/write_partitioning/csv_no_partitioning/

# read as csv
(spark
  .read
  .options(sep="|", multiLine=True, header=True)
  .csv(path)
  .count())

part-00000-600158a7-778a-448c-b5f6-2ad463075840-c000.csv  _SUCCESS


50

# Writing as JSON

https://spark.apache.org/docs/3.5.1/sql-data-sources-json.html

In [11]:
path = "/content/write_partitioning/json_no_partitioning/"

# write as json
(df
.write
.mode("overwrite")
.format("json")
.save(path))

# listing files in the folder
!ls /content/write_partitioning/json_no_partitioning/

# read as json
(spark
  .read
  .json(path)
  .count())

part-00000-e2f6b026-42a5-41dd-abb3-b0743acc83e2-c000.json  _SUCCESS


50

In [12]:
# Load the JSON files with the plain-text reader: every line of the files
# becomes one row in a single `value` column, so the raw JSON is visible.
(spark
  .read
  .format("text")
  .load(path)
  .show(n=10, truncate=False))

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                                                   |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"address":"4332 Miguel Junctions\nBrookstown, PA 36236","date":"2024-05-01T06:32:51.604Z","dob":"1936-03-12","email":"jennifercuevas@example.com","name":"Charles Shepherd","phone":"598.984.9288x02240","date_part":"20240501"}       |
|{"address":"83050 Cervantes Station Apt. 929\nWest Stacey, 

In [13]:
# reading json with the JSON reader - each field is parsed into its own column
spark.read.json(path).show(10, False)

+-------------------------------------------------------+------------------------+---------+----------+---------------------------+----------------+---------------------+
|address                                                |date                    |date_part|dob       |email                      |name            |phone                |
+-------------------------------------------------------+------------------------+---------+----------+---------------------------+----------------+---------------------+
|4332 Miguel Junctions\nBrookstown, PA 36236            |2024-05-01T06:32:51.604Z|20240501 |1936-03-12|jennifercuevas@example.com |Charles Shepherd|598.984.9288x02240   |
|83050 Cervantes Station Apt. 929\nWest Stacey, CO 38055|2024-05-01T10:51:13.635Z|20240501 |1928-12-22|gmartinez@example.org      |Robin Richardson|(460)896-9529x1331   |
|779 Taylor Skyway\nLake Jason, NE 60009                |2024-05-01T17:37:11.144Z|20240501 |1995-08-07|brian12@example.net        |Heather Johnso

In [14]:
# partition json data + saveAsTable

# Creating partition column
# (withColumn replaces a column of the same name, so recomputing it is harmless
# when df already carries date_part)
df = df.withColumn("date_part", date_format(col("date"), "yyyyMMdd"))

# write as a JSON-backed table, partitioned by date_part
(df.write
  .partitionBy("date_part")
  .mode("overwrite")
  .format("json")
  .saveAsTable("tbl_json_part"))

# read the table back and print the row count; without print() the value is
# computed but never shown, because it is not the last expression in the cell
print(spark.table("tbl_json_part").count())

# list the partitions registered for the table
spark.sql("show partitions tbl_json_part").show()

+------------------+
|         partition|
+------------------+
|date_part=20240501|
|date_part=20240502|
|date_part=20240503|
|date_part=20240504|
+------------------+



# Append Mode

In [17]:
# Writing as PARQUET with APPEND

path = "/content/write_partitioning/parquet_append"

df.write.mode("append").format("parquet").save(path)

!ls /content/write_partitioning/parquet_append

spark.read.format("parquet").load(path).count()

part-00000-37af8d5b-6d41-4d94-9d77-078b986533fb-c000.snappy.parquet
part-00000-389520c2-e085-46c1-a8c0-35f3232ac7b8-c000.snappy.parquet
part-00000-a874a84c-221f-4839-bff5-96dd32923148-c000.snappy.parquet
_SUCCESS


150