<a href="https://colab.research.google.com/github/carsofferrei/04_data_processing/blob/main/spark/examples/06-write_partitioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Write
- .write
- .format (parquet, csv, json)
- options
- spark.sql.sources.partitionOverwriteMode dynamic

# Write Mode
- overwrite - The overwrite mode is used to overwrite the existing file, alternatively, you can use SaveMode.Overwrite
- append - To add the data to the existing file, alternatively, you can use SaveMode.Append
- ignore - Ignores write operation when the file already exists, alternatively, you can use SaveMode.Ignore.
- errorifexists or error - This is a default option when the file already exists, it returns an error, alternatively, you can use SaveMode.ErrorIfExists.

# Partitioning
Process to organize the data into multiple chunks based on some criteria.
Partitions are organized in sub-folders.
Partitioning improves performance in Spark.

# Setting up PySpark

In [1]:
%pip install pyspark



In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local').appName('Spark Course').getOrCreate()

# Preparing data

In [3]:
!pip install faker

Collecting faker
  Downloading Faker-33.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.0.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-33.0.0


In [4]:
from faker import Faker
from datetime import datetime

fake = Faker()

users = []
for _ in range(50):
    user = {
        'date': fake.date_time_between_dates(datetime(2024, 5, 1), datetime(2024, 5, 5)),
        'name': fake.name(),
        'address': fake.address(),
        'email': fake.email(),
        'dob': fake.date_of_birth(),
        'phone': fake.phone_number()
    }
    users.append(user)

df = spark.createDataFrame(users)

df.show(10, False)


+---------------------------------------------------+--------------------------+----------+--------------------------+------------------+----------------------+
|address                                            |date                      |dob       |email                     |name              |phone                 |
+---------------------------------------------------+--------------------------+----------+--------------------------+------------------+----------------------+
|USNS Conway\nFPO AE 90390                          |2024-05-02 22:32:08.059274|2000-09-09|kathleenwhite@example.net |Brian Ramirez     |547.878.6454          |
|23306 Richards Loaf Apt. 472\nSeanport, OK 99390   |2024-05-03 07:52:11.502189|2005-04-29|caldwelljoanne@example.com|Danielle Myers    |751.218.3248          |
|5752 Oconnor Stravenue\nNorth David, NV 34467      |2024-05-04 17:05:22.245215|1922-06-21|michelle55@example.com    |Wesley Watkins PhD|+1-600-737-1209x55300 |
|6686 Burns Forks\nNormanfort, NV 

# Writing as PARQUET



In [5]:
# Writing as PARQUET with no partitions

path = "/content/write_partitioning/parquet_no_partitions"

df.write.mode("overwrite").format("parquet").save(path)

!ls /content/write_partitioning/parquet_no_partitions

spark.read.format("parquet").load(path).count() #indica o número de linhas que o parquet tem

part-00000-fb4cf8f3-a863-499a-bca1-6c995e9e9a84-c000.snappy.parquet  _SUCCESS


50

In [6]:
# Writing as PARQUET with partitions
from pyspark.sql.functions import *

path = "/content/write_partitioning/parquet_with_partitions"

# Creating partition column
df = df.withColumn("date_part", date_format(col("date"), "yyyyMMdd"))

spark.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic") # enable dynamic partition overwrite - only overwrites partitions that are coming in the dataframe

(df#.where("date_part = '20240503'")
 .write
 .mode("overwrite")                                               # overwrites the entire path with the new data
 .partitionBy("date_part")                                        # partition the data by column - creates sub-folders for each partition - se queremos particionar tem mesmo que ser assim para garantir boa performace
 .format("parquet")                                               # format of output
 .save(path))                                                     # path

!ls /content/write_partitioning/parquet_with_partitions

spark.read.format("parquet").load(path).count()

'date_part=20240501'  'date_part=20240502'  'date_part=20240503'  'date_part=20240504'


50

In [25]:
# Writing as PARQUET with partitions by YEAR. MONTH, DAY
from pyspark.sql.functions import *

path = "/content/write_partitioning/parquet_with_partitions_other"

# Creating partition column
df = df\
    .withColumn("year", date_format(col("date"), "yyyy"))\
    .withColumn("month", date_format(col("date"), "MM"))\
    .withColumn("day", date_format(col("date"), "dd"))

spark.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic") # enable dynamic partition overwrite - only overwrites partitions that are coming in the dataframe

(df#.where("date_part = '20240503'")
 .write
 .mode("overwrite")                                               # overwrites the entire path with the new data
 .partitionBy(["year", "month", "day"])                                        # partition the data by column - creates sub-folders for each partition - se queremos particionar tem mesmo que ser assim para garantir boa performace
 .format("parquet")                                               # format of output
 .save(path))                                                     # path

!ls /content/write_partitioning/parquet_with_partitions

spark.read.format("parquet").load(path).count()

'date_part=20240501'  'date_part=20240502'  'date_part=20240503'  'date_part=20240504'


50

In [None]:
# Duas formas diferentes de particionar:
# /parquet/20240502

# Spark way - hive-style
# /parquet/date_part=20240502

In [22]:
# Checking single partition
spark.read.parquet("/content/write_partitioning/parquet_with_partitions/date_part=20240502").show()


# OU:
delta = "/content/write_partitioning/parquet_with_partitions/date_part=20240502"
spark.read.format("parquet").load(delta).show()

# OU:
my_table = "/content/write_partitioning/parquet_with_partitions/date_part=20240502"
spark.read.format("delta").load(my_table).show()

+--------------------+--------------------+----------+--------------------+------------------+--------------------+
|             address|                date|       dob|               email|              name|               phone|
+--------------------+--------------------+----------+--------------------+------------------+--------------------+
|USNS Conway\nFPO ...|2024-05-02 22:32:...|2000-09-09|kathleenwhite@exa...|     Brian Ramirez|        547.878.6454|
|2511 Velasquez La...|2024-05-02 23:05:...|2019-07-18|  jose81@example.com|   Nathan Thompson|001-248-446-3967x...|
|0584 Hicks Road\n...|2024-05-02 05:10:...|2015-06-10|katherinebrown@ex...|Brittany Rodriguez|       (937)245-6806|
|PSC 8012, Box 861...|2024-05-02 11:00:...|2013-05-29|melanie83@example...|       Edwin Young|        784-244-3003|
|360 Nathan Trail ...|2024-05-02 18:27:...|1968-04-15|avilajacob@exampl...|     Regina Miller|        798-255-9495|
|7616 Harris Drive...|2024-05-02 04:11:...|1930-01-05|snydershawn@examp.

Py4JJavaError: An error occurred while calling o206.load.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: delta. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.ClassNotFoundException: delta.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:476)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:594)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:527)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 15 more


# Writing as CSV

https://spark.apache.org/docs/3.5.1/sql-data-sources-csv.html

In [8]:
df.count()

50

In [10]:
path = "/content/write_partitioning/csv_no_partitioning/"

# write as csv
(df
  .write
  .format("csv")
  .mode("overwrite")
  .option("delimiter", "|")
  .option("header", True)
  .save(path))

# listing files in the folder
!ls /content/write_partitioning/csv_no_partitioning/

# read as csv
(spark
  .read
  .options(sep="|", multiLine=True, header=True)
  .csv(path)
  .count())

part-00000-c07c7f8d-ac57-4e64-84dc-af94dfe98aa2-c000.csv  _SUCCESS


50

# Writing as JSON

https://spark.apache.org/docs/3.5.1/sql-data-sources-json.html

In [None]:
path = "/content/write_partitioning/json_no_partitioning/"

# write as json
(df
.write
.mode("overwrite")
.format("json")
.save(path))

# listing files in the folder
!ls /content/write_partitioning/json_no_partitioning/

# read as json
(spark
  .read
  .json(path)
  .count())

part-00000-4278fb78-1cc4-4622-9bb4-6cdadc50a8f2-c000.json  _SUCCESS


50

In [None]:
# reading json as text
spark.read.text(path).show(10, False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                                                 |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"address":"USCGC Garza\nFPO AE 93586","date":"2024-05-01T01:48:40.572Z","dob":"1985-03-17","email":"tinaperry@example.net","name":"Connie Miller","phone":"966-288-5241","date_part":"20240501"}                                     |
|{"address":"327 Harris Mall\nRichstad, TX 25133","date":"2024-05-02

In [None]:
# reading json as json
spark.read.json(path).show(10, False)

+-------------------------------------------------+------------------------+---------+----------+--------------------------+-------------------+--------------------+
|address                                          |date                    |date_part|dob       |email                     |name               |phone               |
+-------------------------------------------------+------------------------+---------+----------+--------------------------+-------------------+--------------------+
|USCGC Garza\nFPO AE 93586                        |2024-05-01T01:48:40.572Z|20240501 |1985-03-17|tinaperry@example.net     |Connie Miller      |966-288-5241        |
|327 Harris Mall\nRichstad, TX 25133              |2024-05-02T11:43:30.828Z|20240502 |1939-06-04|timothy69@example.net     |Bryan Cook         |(464)497-5312x67440 |
|Unit 0282 Box 3495\nDPO AA 47314                 |2024-05-04T00:33:05.895Z|20240504 |1992-07-30|qward@example.net         |Crystal Smith      |(430)385-0871x4879  |
|633

In [11]:
# partition json data + saveAsTable

# Creating partition column
df = df.withColumn("date_part", date_format(col("date"), "yyyyMMdd"))

# write as json
(df.write
  .partitionBy("date_part")
  .mode("overwrite")
  .format("json")
  .saveAsTable("tbl_json_part"))

# read as json
spark.table("tbl_json_part").count()

# read as json
spark.sql("show partitions tbl_json_part").show()

+------------------+
|         partition|
+------------------+
|date_part=20240501|
|date_part=20240502|
|date_part=20240503|
|date_part=20240504|
+------------------+



# Append Mode

In [14]:
# Writing as PARQUET with APPEND

path = "/content/write_partitioning/parquet_append"

df.write.mode("append").format("parquet").save(path)

!ls /content/write_partitioning/parquet_append

spark.read.format("parquet").load(path).count()

part-00000-0e63486c-ceb2-4091-8e36-87ec899929ec-c000.snappy.parquet
part-00000-5112712e-2fd9-48c4-ba92-b97a63d00f2a-c000.snappy.parquet
part-00000-dd17ce85-d320-4980-880c-a6f774e6efa7-c000.snappy.parquet
_SUCCESS


150