In [1]:
from collections import defaultdict
import os

# =============================================================================
# CONFIGURATION
# =============================================================================

# ADLS source settings
ADLS_ACCOUNT = "stdretail"
ADLS_CONTAINER = "supermarket"
# Optional subfolder inside the container (leave empty for root)
ADLS_PATH_PREFIX = ""

# Lakehouse destination root
LAKEHOUSE_BASE = "Files/"

# Write behavior
WRITE_MODE = "overwrite"  # "append" or "overwrite"
DRY_RUN = False




StatementMeta(, 39ff9eb6-7bea-48eb-89f1-2ee54923c3c6, 3, Finished, Available, Finished)

In [2]:
def get_param(name, default):
    try:
        from notebookutils import mssparkutils
        return mssparkutils.env.getJobParameter(name, default)
    except Exception:
        return os.getenv(name, default)


def to_bool(value):
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() in ("1", "true", "yes", "y")


ADLS_ACCOUNT = get_param("ADLS_ACCOUNT", ADLS_ACCOUNT)
ADLS_CONTAINER = get_param("ADLS_CONTAINER", ADLS_CONTAINER)
ADLS_PATH_PREFIX = get_param("ADLS_PATH_PREFIX", ADLS_PATH_PREFIX)
LAKEHOUSE_BASE = get_param("LAKEHOUSE_BASE", LAKEHOUSE_BASE)
WRITE_MODE = get_param("WRITE_MODE", WRITE_MODE)
DRY_RUN = to_bool(get_param("DRY_RUN", DRY_RUN))

# Enable Fabric V-Order on write
spark.conf.set("spark.microsoft.fabric.optimizeWrite.enabled", "true")
spark.conf.set("spark.microsoft.fabric.vorder.enabled", "true")


def build_source_root(account, container, prefix):
    base = f"abfss://{container}@{account}.dfs.core.windows.net"
    prefix = prefix.strip("/")
    return f"{base}/{prefix}" if prefix else base


SOURCE_ROOT = build_source_root(ADLS_ACCOUNT, ADLS_CONTAINER, ADLS_PATH_PREFIX)


def get_fs():
    try:
        from notebookutils import mssparkutils
        return mssparkutils.fs
    except Exception:
        return dbutils.fs


FS = get_fs()


def is_dir(item):
    flag = getattr(item, "isDir", None)
    return flag() if callable(flag) else bool(flag)


def list_parquet_files(path):
    files = []
    for entry in FS.ls(path):
        if is_dir(entry):
            files.extend(list_parquet_files(entry.path))
        elif entry.path.lower().endswith(".parquet"):
            files.append(entry.path)
    return files


print(f"Scanning for Parquet files under {SOURCE_ROOT}...")
parquet_files = list_parquet_files(SOURCE_ROOT)
print(f"Found {len(parquet_files)} Parquet files.")

files_by_dir = defaultdict(list)
for path in parquet_files:
    parent_dir = path.rsplit("/", 1)[0]
    files_by_dir[parent_dir].append(path)

for src_dir, files in sorted(files_by_dir.items()):
    rel_dir = src_dir[len(SOURCE_ROOT):].lstrip("/")
    dest_dir = f"{LAKEHOUSE_BASE}/{rel_dir}" if rel_dir else LAKEHOUSE_BASE
    print(f"Copying {len(files)} file(s) from {src_dir} -> {dest_dir}")
    if DRY_RUN:
        continue
    df = spark.read.parquet(*files)
    (df.write
       .format("parquet")
       .mode(WRITE_MODE)
       .save(dest_dir))

print("Copy complete.")

StatementMeta(, 39ff9eb6-7bea-48eb-89f1-2ee54923c3c6, 4, Finished, Available, Finished)

Scanning for Parquet files under abfss://supermarket@stdretail.dfs.core.windows.net...
Found 256 Parquet files.
Copying 1 file(s) from abfss://supermarket@stdretail.dfs.core.windows.net/dim_customers -> Files//dim_customers
Copying 1 file(s) from abfss://supermarket@stdretail.dfs.core.windows.net/dim_distribution_centers -> Files//dim_distribution_centers
Copying 1 file(s) from abfss://supermarket@stdretail.dfs.core.windows.net/dim_geographies -> Files//dim_geographies
Copying 1 file(s) from abfss://supermarket@stdretail.dfs.core.windows.net/dim_products -> Files//dim_products
Copying 1 file(s) from abfss://supermarket@stdretail.dfs.core.windows.net/dim_stores -> Files//dim_stores
Copying 1 file(s) from abfss://supermarket@stdretail.dfs.core.windows.net/dim_trucks -> Files//dim_trucks
Copying 25 file(s) from abfss://supermarket@stdretail.dfs.core.windows.net/fact_ble_pings -> Files//fact_ble_pings
Copying 25 file(s) from abfss://supermarket@stdretail.dfs.core.windows.net/fact_dc_invent