In [1]:
import shutil

from delta import configure_spark_with_delta_pip, DeltaTable
from pyspark.sql import SparkSession

# Builder for a local-master session that uses the Hive catalog implementation and
# registers Delta Lake's SQL extension and catalog class.
delta_session_builder = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the Delta helper finish configuring the builder, then create (or reuse) the session
# shared by every cell below.
spark_session = configure_spark_with_delta_pip(delta_session_builder).getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/23 06:20:07 WARN Utils: Your hostname, bartosz, resolves to a loopback address: 127.0.1.1; using 192.168.1.55 instead (on interface wlp0s20f3)
25/08/23 06:20:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/bartosz/.venvs/delta_spark_4/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/bartosz/.ivy2.5.2/cache
The jars for the packages stored in: /home/bartosz/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7382ed67-4f7d-4d0e-9078-0d808d0331f1;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in local-m2-cache
:: resolution report :: resolve 185ms :: artifacts dl 7ms


**Create input tables first**

In [3]:
# Remove the local warehouse and metastore directories left over from previous runs so the
# tables below are created from scratch. shutil.rmtree replaces the bare `rm -rf` line,
# which is not Python and only runs through IPython's automagic shell aliases.
# ignore_errors=True keeps the `rm -f` behaviour of silently skipping missing directories.
for leftover_dir in ('./spark-warehouse', './metastore_db/'):
    shutil.rmtree(leftover_dir, ignore_errors=True)

In [4]:
# The two Delta tables used by the demo; both are created with the same two-column schema.
tables = ['table_1', 'table_2']

for table_name in tables:
    print(f'Creating {table_name}')

    # Dropping first lets this cell be re-run without a "table already exists" failure.
    drop_statement = f'DROP TABLE IF EXISTS `default`.`{table_name}`'
    create_statement = f'''
              CREATE TABLE `default`.`{table_name}` (
                 number INT NOT NULL,
                letter STRING NOT NULL
              ) USING DELTA
            '''

    spark_session.sql(drop_statement)
    spark_session.sql(create_statement)


Creating table_1


25/08/23 06:20:21 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
25/08/23 06:20:21 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore bartosz@127.0.1.1
25/08/23 06:20:21 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
25/08/23 06:20:24 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`default`.`table_1` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
25/08/23 06:20:24 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
25/08/23 06:20:24 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist


Creating table_2


25/08/23 06:20:25 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`default`.`table_2` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.


# Transaction compensation

Let's see how the transaction compensation works by adding rows to the first of two tables:

In [5]:
from pyspark import Row

# Sample dataset: three (number, letter) rows.
data_to_write = spark_session.createDataFrame(
    [
        Row(number=1, letter='a'),
        Row(number=2, letter='b'),
        Row(number=3, letter='c'),
    ]
)

# Overwrite table_1 with the sample rows, then display what the table now holds.
overwrite_writer = data_to_write.write.format('delta').mode('overwrite')
overwrite_writer.insertInto('table_1')
spark_session.read.table('table_1').show()

25/08/23 06:20:50 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+------+------+
|number|letter|
+------+------+
|     3|     c|
|     1|     a|
|     2|     b|
+------+------+



Now suppose we want to write the same dataset to *table_2*, but the writer fails with some random exception.

In [6]:
# Simulate the writer for table_2 failing. The exception is caught and displayed instead of
# being left unhandled, so "Restart Kernel -> Run All" still reaches the compensation cells
# below while the reader sees exactly which error occurred.
try:
    raise RuntimeError('Some error occurred')
except RuntimeError as error:
    # Keep a reference: the name bound by `except ... as` is deleted when the handler exits.
    simulated_write_failure = error

print(f'{type(simulated_write_failure).__name__}: {simulated_write_failure}')

RuntimeError: Some error occurred

To apply **`Transaction compensation`** with Delta Lake, we need to:
* find the previous valid version of each correctly written table (table_1 in our case)
* run the `RESTORE` command to roll the table back to that previous version

In [7]:
# Production code would keep a list of the successfully written tables; for this simplistic
# use case a single hard-coded table is enough.
table_to_restore = '`default`.`table_1`'

# Make the table's commit history queryable from SQL through the `history` temporary view.
table_history = spark_session.sql(f'DESCRIBE HISTORY {table_to_restore}')
table_history.createOrReplaceTempView('history')

# Among the two newest commits, the lower version number is the one that preceded the
# latest write -- that is the version to roll back to.
previous_version_query = f'''
SELECT MIN(version) AS restored_version FROM (
    SELECT * FROM history v ORDER BY version DESC LIMIT 2
) vv
'''
restored_version_rows = spark_session.sql(previous_version_query).collect()
version_to_restore = restored_version_rows[0].restored_version

print(f'{table_to_restore} will be restored to {version_to_restore}')
# Last expression of the cell: the DataFrame returned by RESTORE is what the cell displays.
spark_session.sql(f'RESTORE {table_to_restore} TO VERSION AS OF {version_to_restore}')


`default`.`table_1` will be restored to 0


                                                                                

DataFrame[table_size_after_restore: bigint, num_of_files_after_restore: bigint, num_removed_files: bigint, num_restored_files: bigint, removed_files_size: bigint, restored_files_size: bigint]

In [8]:
# Check the outcome of the rollback by displaying the current content of table_1.
table_1_after_restore = spark_session.read.table('table_1')
table_1_after_restore.show()

+------+------+
|number|letter|
+------+------+
+------+------+



# Wrap-up
Let's wrap the *Transaction compensation* into more realistic code:

In [9]:
def restore_table_to_previous_version(table_to_restore: str):
    """Roll a Delta table back to the version that preceded its latest commit.

    The two most recent entries of ``DESCRIBE HISTORY`` are read and the table is restored
    to the lower of their version numbers. When the history holds a single commit, that
    commit's own version is used.

    The history is inspected through the DataFrame API rather than a temporary view, so
    calling this function does not create or overwrite a session-wide ``history`` view.

    Args:
        table_to_restore: Table name, interpolated as-is into the SQL statements (the
            notebook passes both ``table_1`` and the back-quoted ``default``.``table_1``).

    Raises:
        ValueError: If ``DESCRIBE HISTORY`` returns no rows for the table.
    """
    table_history = spark_session.sql(f'DESCRIBE HISTORY {table_to_restore}')

    # Only the two newest commits matter: the latest write and the one right before it.
    latest_commits = (
        table_history
        .select('version')
        .orderBy('version', ascending=False)
        .limit(2)
        .collect()
    )
    if not latest_commits:
        raise ValueError(f'No history found for {table_to_restore}; nothing to restore')

    version_to_restore = min(commit['version'] for commit in latest_commits)

    print(f'{table_to_restore} will be restored to {version_to_restore}')
    spark_session.sql(f'RESTORE {table_to_restore} TO VERSION AS OF {version_to_restore}')


In [10]:
from pyspark import Row

# Re-create the three-row sample dataset that the wrap-up writes to every table.
data_to_write = spark_session.createDataFrame(
    [
        Row(number=1, letter='a'),
        Row(number=2, letter='b'),
        Row(number=3, letter='c'),
    ]
)

# Tables written so far by the current multi-table "transaction"; these are the ones that
# have to be compensated when a later write fails.
successfully_written_tables = []

def write_data(table: str):
    """Write the sample dataset to ``table`` and compensate earlier writes on failure.

    On success the table name is recorded in ``successfully_written_tables``. When the
    write fails, the error is reported and every recorded table is rolled back with
    ``restore_table_to_previous_version``. The failure is deliberately not re-raised,
    so the caller can continue (here: display the tables afterwards).

    Args:
        table: Name of the target table. ``table_2`` always fails, to simulate a writer error.
    """
    try:
        if table == 'table_2':
            # Simulated writer failure used to trigger the compensation path.
            raise RuntimeError('Some random error')
        data_to_write.write.format('delta').mode('overwrite').insertInto(table)
        successfully_written_tables.append(table)
    except Exception as write_error:
        # `Exception` rather than a bare `except:` so KeyboardInterrupt/SystemExit still
        # propagate, and the error is reported instead of being silently swallowed.
        print(f'Writing to {table} failed with {write_error!r}; compensating previous writes')

        # Iterate over a copy and forget each table only once it has been restored.
        # RESTORE itself adds a new commit, so compensating an already restored table again
        # on a later failure would bring the rolled-back write back.
        for table_to_restore in list(successfully_written_tables):
            restore_table_to_previous_version(table_to_restore)
            successfully_written_tables.remove(table_to_restore)

# Write the tables in order: table_1 succeeds, table_2 fails and triggers the compensation.
for target_table in ('table_1', 'table_2'):
    write_data(target_table)

# Display both tables to check the outcome of the compensation.
for checked_table in ('table_1', 'table_2'):
    spark_session.read.table(checked_table).show()

                                                                                

table_1 will be restored to 2


                                                                                

+------+------+
|number|letter|
+------+------+
+------+------+

+------+------+
|number|letter|
+------+------+
+------+------+

