# Backup a Catalog
This notebook uses [DiscoverX](https://github.com/databrickslabs/discoverx) to clone all schemas and tables from a source catalog into a destination catalog using Delta Lake `CLONE`.
After cloning, it drops any tables in the destination that no longer exist in the source, and then drops any destination schemas that are left empty.

Specify the source and destination catalogs with the widgets below and run all cells.


In [None]:

%pip install dbl-discoverx
# Restart the Python process after the install; a later cell imports `discoverx`.
dbutils.library.restartPython()


In [None]:

%md
## Configure source and destination catalogs


In [None]:

# Source catalog: register a text widget, then read back whatever the user entered.
dbutils.widgets.text("1.source_catalog", "source_catalog")
source_catalog = dbutils.widgets.get("1.source_catalog")

# Destination catalog: same pattern as above.
dbutils.widgets.text("2.destination_catalog", "destination_catalog")
destination_catalog = dbutils.widgets.get("2.destination_catalog")


In [None]:

%md
## Clone all tables using DiscoverX


In [None]:

from discoverx import DX

# DiscoverX entry point; used below to iterate over the tables of the source catalog.
dx = DX()

# The clones are written into the destination catalog, so make sure it exists first.
create_catalog_stmt = f"CREATE CATALOG IF NOT EXISTS `{destination_catalog}`"
spark.sql(create_catalog_stmt)


def clone_table(table_info):
    """Clone a single source table into the destination catalog.

    The destination schema is created when missing, then the destination table
    is created (or replaced) as a `CLONE` of the source table, keeping the same
    schema and table name.

    Args:
        table_info: Object exposing `catalog`, `schema` and `table` attributes
            that identify the source table.

    Returns:
        dict: `source` and `destination` hold the backtick-quoted three-part
        table names, `success` is a bool, and `info` is `None` on success or
        the error message (as a string) on failure.
    """
    destination_schema = f"`{destination_catalog}`.`{table_info.schema}`"
    source_name = f"`{table_info.catalog}`.`{table_info.schema}`.`{table_info.table}`"
    destination_name = f"{destination_schema}.`{table_info.table}`"

    try:
        # Schema creation is inside the try block so that a failure here is
        # reported in the result dict, just like a failed clone, instead of
        # escaping the function.
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {destination_schema}")
        spark.sql(f"CREATE OR REPLACE TABLE {destination_name} CLONE {source_name}")
    except Exception as err:
        success, info = False, str(err)
    else:
        success, info = True, None

    return {
        "source": source_name,
        "destination": destination_name,
        "success": success,
        "info": info,
    }


# Run clone_table over every table matched by the "<source_catalog>.*.*" pattern
# and keep what the map call returns for the reporting cell below.
all_source_tables = f"{source_catalog}.*.*"
results = dx.from_tables(all_source_tables).map(clone_table)


In [None]:

%md
## Show cloning results


In [None]:
import json

# Pretty-print each clone result as indented JSON, one print call per result.
for clone_result in results:
    rendered = json.dumps(clone_result, indent=4)
    print(rendered)

In [None]:

%md
## Remove stale tables from the destination catalog


In [None]:

def _list_catalog_tables(catalog):
    """Return the set of (schema, table) name pairs listed in `catalog`'s information_schema.

    Entries that belong to the `information_schema` schema itself are excluded.
    """
    rows = spark.sql(f"""
    SELECT table_schema, table_name
    FROM `{catalog}`.information_schema.tables
    WHERE table_schema NOT IN ('information_schema')
"""
    ).collect()
    return {(row.table_schema, row.table_name) for row in rows}


# Determine tables present only in the destination catalog
source_tables = _list_catalog_tables(source_catalog)
dest_tables = _list_catalog_tables(destination_catalog)
obsolete_tables = dest_tables - source_tables

# Sorted so the drops (and their log lines) come out in a stable order.
for stale_schema, stale_table in sorted(obsolete_tables):
    spark.sql(f"DROP TABLE IF EXISTS `{destination_catalog}`.`{stale_schema}`.`{stale_table}`")
    print(f'Dropped `{destination_catalog}`.`{stale_schema}`.`{stale_table}`')

# Drop any empty schemas that remain
for schema_row in spark.sql(f'SHOW SCHEMAS IN `{destination_catalog}`').collect():
    # The output column name is not assumed: use `schemaName` when present,
    # otherwise fall back to the first column.
    schema_name = schema_row.schemaName if hasattr(schema_row, 'schemaName') else schema_row[0]
    if schema_name == 'information_schema':
        continue
    listed_tables = spark.sql(f'SHOW TABLES IN `{destination_catalog}`.`{schema_name}`').collect()
    # SHOW TABLES can also list session-scoped temporary views (flagged
    # `isTemporary`); they do not live in the schema, so they must not make an
    # otherwise empty schema look populated.
    persistent_tables = [
        table_row for table_row in listed_tables if not getattr(table_row, 'isTemporary', False)
    ]
    if not persistent_tables:
        spark.sql(f'DROP SCHEMA IF EXISTS `{destination_catalog}`.`{schema_name}`')
        print(f'Dropped schema `{destination_catalog}`.`{schema_name}`')
