In [0]:
from pyspark.sql.functions import current_timestamp
import re

# Bronze ingestion of the "lignes" CSV files into the Delta table
# odace.map.bronze_lignes, using Auto Loader ("cloudFiles").
input_path = "/Volumes/odace/map/files/transport/lignes/"
schema_location = "/Volumes/odace/map/schemas/_bronze_lignes_schema/"
checkpoint_location = "/Volumes/odace/map/checkpoints/_bronze_lignes_checkpoint/"

# Read the stream: semicolon-delimited CSV files with a header row.
df = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("delimiter", ";")
    .option("header", True)
    # Auto Loader does not take the plain CSV reader's `inferSchema` option;
    # column type inference is requested with `cloudFiles.inferColumnTypes`.
    # Without it, CSV columns are inferred as strings.
    # NOTE(review): a schema already persisted under `schema_location` is
    # presumably reused as-is, so inferred types would only apply to a fresh
    # schema location / target table - confirm before redeploying.
    .option("cloudFiles.inferColumnTypes", True)
    .option("cloudFiles.schemaLocation", schema_location)
    .load(input_path)
)

# Normalize column names: every character outside [a-zA-Z0-9] becomes an
# underscore, then the whole name is lowercased.
normalized_cols = [re.sub(r'[^a-zA-Z0-9]', '_', c).lower() for c in df.columns]
df = df.toDF(*normalized_cols)

# Add the ingestion timestamp column.
df = df.withColumn("ingestion_timestamp", current_timestamp())

# Append to the Delta table. The checkpoint tracks progress between runs.
(
    df.writeStream.format("delta")
    .option("checkpointLocation", checkpoint_location)
    .outputMode("append")
    .trigger(availableNow=True)
    .table("odace.map.bronze_lignes")
)

# Note: This cell runs a streaming query with the AvailableNow trigger: it ingests every file currently pending in the input path, then stops.
# Files added to the input path afterwards are ingested the next time the cell is run.
# The schema and checkpoint are stored in the specified volume paths.
# The column 'ingestion_timestamp' records when each row was ingested.
# All column names are normalized to lowercase and underscores.

In [0]:
from pyspark.sql.functions import current_timestamp
import re

# Bronze ingestion of the "gares" CSV files into the Delta table
# odace.map.bronze_gares, using Auto Loader ("cloudFiles").
input_path = "/Volumes/odace/map/files/transport/gares/"
schema_location = "/Volumes/odace/map/schemas/_bronze_gares_schema/"
checkpoint_location = "/Volumes/odace/map/checkpoints/_bronze_gares_checkpoint/"

# Read the stream: semicolon-delimited CSV files with a header row.
df = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("delimiter", ";")
    .option("header", True)
    # Auto Loader does not take the plain CSV reader's `inferSchema` option;
    # column type inference is requested with `cloudFiles.inferColumnTypes`.
    # Without it, CSV columns are inferred as strings.
    # NOTE(review): a schema already persisted under `schema_location` is
    # presumably reused as-is, so inferred types would only apply to a fresh
    # schema location / target table - confirm before redeploying.
    .option("cloudFiles.inferColumnTypes", True)
    .option("cloudFiles.schemaLocation", schema_location)
    .load(input_path)
)

# Normalize column names: every character outside [a-zA-Z0-9] becomes an
# underscore, then the whole name is lowercased.
normalized_cols = [re.sub(r'[^a-zA-Z0-9]', '_', c).lower() for c in df.columns]
df = df.toDF(*normalized_cols)

# Add the ingestion timestamp column.
df = df.withColumn("ingestion_timestamp", current_timestamp())

# Append to the Delta table. The checkpoint tracks progress between runs.
(
    df.writeStream.format("delta")
    .option("checkpointLocation", checkpoint_location)
    .outputMode("append")
    .trigger(availableNow=True)
    .table("odace.map.bronze_gares")
)

# Note: This cell runs a streaming query with the AvailableNow trigger: it ingests every file currently pending in the input path, then stops.
# Files added to the input path afterwards are ingested the next time the cell is run.
# The schema and checkpoint are stored in the specified volume paths.
# The column 'ingestion_timestamp' records when each row was ingested.
# All column names are normalized to lowercase and underscores.