# Back up a Catalog
This notebook uses [DiscoverX](https://github.com/databrickslabs/discoverx) to clone all schemas and tables from a source catalog into a destination catalog using Delta Lake `CLONE`.
After cloning, it copies schema and table permissions to the destination and removes any tables or schemas in the destination that no longer exist in the source.

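Select the source and destination catalogs with the widgets below, then run all cells. The source must be one of `dev`, `staging`, or `prod`, and the destination must be named `<source>_backup` (for example `prod_backup`).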


In [0]:
%pip install dbl-discoverx
# Restart the Python process so the freshly installed package can be imported
# by the cells that follow.
dbutils.library.restartPython()

## Configure source and destination catalogs


In [0]:

# Build the widget choices: a placeholder entry followed by every catalog
# returned by SHOW CATALOGS.
catalogs = ['None Selected']
catalogs.extend(row.catalog for row in spark.sql("SHOW CATALOGS").collect())

# One combobox per catalog role; both start on the placeholder entry.
for widget_name, widget_label in (
    ("1.source_catalog", "1. Source Catalog"),
    ("2.destination_catalog", "2. Destination Catalog"),
):
    dbutils.widgets.combobox(widget_name, "None Selected", catalogs, label=widget_label)


In [0]:
# Read the current widget selections; later cells refer to these two names.
source_catalog, destination_catalog = (
    dbutils.widgets.get(widget_key)
    for widget_key in ("1.source_catalog", "2.destination_catalog")
)


In [0]:
# Fail fast on a bad selection, before any cell touches the destination.
allowed_catalogs = {'dev', 'staging', 'prod'}

# The source must be one of the known catalogs.
source_is_allowed = source_catalog in allowed_catalogs
if not source_is_allowed:
    raise ValueError(f'Source catalog must be one of {sorted(allowed_catalogs)}')

# Each source has exactly one permitted destination: "<source>_backup".
expected_dest = f'{source_catalog}_backup'
destination_matches = destination_catalog == expected_dest
if not destination_matches:
    raise ValueError(f'Destination catalog must be {expected_dest}')


## Clone all tables using DiscoverX


In [0]:

from discoverx import DX

# DiscoverX handle; used further down to select the source tables and map
# the clone function over them.
dx = DX()

def _quote_identifier(name):
    """Wrap ``name`` in backticks, doubling any backtick it already contains."""
    return "`" + str(name).replace("`", "``") + "`"


def clone_table(table_info):
    """Clone a single table into the destination catalog.

    Creates the destination schema if it does not exist, then runs
    ``CREATE OR REPLACE TABLE ... CLONE`` from the source table into the same
    schema and table name under the module-level ``destination_catalog``.

    Any exception raised by either statement is captured in the returned
    record rather than propagated, so one failing table (or schema) does not
    abort the processing of the remaining tables.

    Args:
        table_info: Object exposing ``catalog``, ``schema`` and ``table``
            attributes that identify the source table.

    Returns:
        dict: ``source`` and ``destination`` (backtick-quoted three-part
        names), ``success`` (bool) and ``info`` (``None`` on success,
        otherwise the text of the error).
    """
    quoted_schema = _quote_identifier(table_info.schema)
    quoted_table = _quote_identifier(table_info.table)
    source = f"{_quote_identifier(table_info.catalog)}.{quoted_schema}.{quoted_table}"
    destination_schema = f"{_quote_identifier(destination_catalog)}.{quoted_schema}"
    destination = f"{destination_schema}.{quoted_table}"

    outcome = {
        "source": source,
        "destination": destination,
        "success": True,
        "info": None,
    }
    try:
        # Schema creation sits inside the try so that a failure here is
        # reported per table instead of raising out of the whole map.
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {destination_schema}")
        spark.sql(f"""CREATE OR REPLACE TABLE {destination} CLONE {source}""")
    except Exception as err:
        outcome["success"] = False
        outcome["info"] = str(err)
    return outcome


# Select every schema and table of the source catalog, then run clone_table
# on each match and keep the per-table outcome records.
clone_pattern = f"{source_catalog}.*.*"
results = dx.from_tables(clone_pattern).map(clone_table)


## Show cloning results


In [0]:
from pyspark.sql.types import StructType,StructField,StringType,BooleanType
# Column layout of the clone report: (name, Spark type, nullable).
result_columns = [
    ("source", StringType(), True),
    ("destination", StringType(), True),
    ("success", BooleanType(), False),
    ("info", StringType(), True),
]
schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in result_columns
])

# Turn the per-table outcome records into a DataFrame and render it.
df = spark.createDataFrame(results, schema)
df.display()

## Migrate schema and table permissions
Copy the schema- and table-level grants recorded for the source catalog onto the matching objects in the destination catalog.

In [0]:
# Privileges recorded for the schemas of the source catalog.
schema_query = f"""
SELECT schema_name, grantee, privilege_type
FROM system.information_schema.schema_privileges
WHERE catalog_name = '{source_catalog}'
"""

# Privileges recorded for the tables of the source catalog.
table_query = f"""
SELECT table_schema, table_name, grantee, privilege_type
FROM system.information_schema.table_privileges
WHERE table_catalog = '{source_catalog}'
"""

schema_df = spark.sql(schema_query)
table_df = spark.sql(table_query)

# Re-target every source grant at the same object path under the destination
# catalog. Rows for information_schema are skipped.
grant_cmds = []
for row in schema_df.collect():
    if row.schema_name.lower() == 'information_schema':
        continue
    object_identifier = f"`{destination_catalog}`.`{row.schema_name}`"
    grant_cmds.append(f"GRANT {row.privilege_type} ON SCHEMA {object_identifier} TO `{row.grantee}`;")

for row in table_df.collect():
    if row.table_schema.lower() == 'information_schema':
        continue
    object_identifier = f"`{destination_catalog}`.`{row.table_schema}`.`{row.table_name}`"
    grant_cmds.append(f"GRANT {row.privilege_type} ON TABLE {object_identifier} TO `{row.grantee}`;")

# Apply the grants one at a time. A grant fails when its target does not
# exist in the destination (for example a table whose clone did not succeed),
# so record the failure and carry on instead of abandoning every grant that
# comes after it.
failed_grants = []
for cmd in grant_cmds:
    print(cmd)
    try:
        spark.sql(cmd)
    except Exception as err:
        failed_grants.append((cmd, str(err)))

# Surface everything that could not be applied so it can be followed up.
if failed_grants:
    print(f"{len(failed_grants)} of {len(grant_cmds)} grants failed:")
    for cmd, reason in failed_grants:
        print(f"  FAILED {cmd} -> {reason}")


## Remove stale tables from the destination catalog


In [0]:

# Determine tables present only in the destination catalog.
# NOTE(review): both listings come from information_schema; if the caller
# cannot see some source tables they would be treated as removed - confirm
# the notebook runs with full read access to the source catalog.
source_df = spark.sql(f"""
    SELECT table_schema, table_name
    FROM `{source_catalog}`.information_schema.tables
    WHERE table_schema NOT IN ('information_schema')
"""
)
dest_df = spark.sql(f"""
    SELECT table_schema, table_name
    FROM `{destination_catalog}`.information_schema.tables
    WHERE table_schema NOT IN ('information_schema')
"""
)
source_tables = {(r.table_schema, r.table_name) for r in source_df.collect()}
dest_tables = {(r.table_schema, r.table_name) for r in dest_df.collect()}

# Loop variables are named schema_name/table_name so they do not overwrite
# the `schema` StructType defined by the results cell.
obsolete_tables = dest_tables - source_tables
for schema_name, table_name in sorted(obsolete_tables):
    spark.sql(f"DROP TABLE IF EXISTS `{destination_catalog}`.`{schema_name}`.`{table_name}`")
    print(f'Dropped `{destination_catalog}`.`{schema_name}`.`{table_name}`')

# Drop destination schemas that are empty AND no longer exist in the source.
# A schema that still exists in the source is kept even when it holds no
# tables, so grants targeting it keep working on the next run.
source_schemas = {
    r.schema_name
    for r in spark.sql(
        f"SELECT schema_name FROM `{source_catalog}`.information_schema.schemata"
    ).collect()
}
dest_schema_df = spark.sql(f'SHOW SCHEMAS IN `{destination_catalog}`')
for row in dest_schema_df.collect():
    # The name column of SHOW SCHEMAS is read by attribute when available,
    # otherwise by position.
    schema_name = row.schemaName if hasattr(row, 'schemaName') else row[0]
    if schema_name == 'information_schema' or schema_name in source_schemas:
        continue
    if not spark.sql(f'SHOW TABLES IN `{destination_catalog}`.`{schema_name}`').collect():
        spark.sql(f'DROP SCHEMA IF EXISTS `{destination_catalog}`.`{schema_name}`')
        print(f'Dropped schema `{destination_catalog}`.`{schema_name}`')
