In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType
from pyspark.sql import functions as F
from delta.tables import DeltaTable
from pyspark.sql.window import Window

StatementMeta(, af8ad3be-8fc5-492d-9c69-252f489c5d8e, 23, Finished, Available, Finished)

In [2]:
# Source APIs to scan, mapped to the columns pulled from each of their bronze tables.
# Tuples (rather than sets) keep the column order of the generated SELECT stable
# from one kernel to the next; set iteration order for strings is not guaranteed.
apis = {
    'esios': ('geo_id', 'geo_name'),
    'redata': ('geo_id', 'geo_name'),
    # 'omie':'extraction_date_parsed',
    # 'open_meteo':'date'
}

# Lakehouse identifiers passed to the session builders below.
bronze_lakehouse_id = "0fd09a67-0164-4fb6-838e-02a27c823afc"
silver_lakehouse_id = "cab645cc-8a6f-49ec-a404-a83729660d6f"

# NOTE(review): getOrCreate() hands back the already-active session when one
# exists, so spark_bronze and spark_silver may end up being the same object
# with the last-applied config — confirm this is the intended behavior.
# Spark session attached to Bronze
spark_bronze = (
    SparkSession.builder
    .appName("ReadFromBronze")
    .config("spark.fabric.lakehouse.name", bronze_lakehouse_id)
    .getOrCreate()
)
# Spark session attached to Silver
spark_silver = (
    SparkSession.builder
    .appName("WriteToSilver")
    .config("spark.fabric.lakehouse.name", silver_lakehouse_id)
    .getOrCreate()
)


# Fully qualified name of the silver dimension table written by this notebook.
silver_table_name = "lh_silver.slv_cod_geo_location"

# Placeholder; the extraction cell replaces it with the unioned bronze rows.
df_geo_locations = None

StatementMeta(, af8ad3be-8fc5-492d-9c69-252f489c5d8e, 4, Finished, Available, Finished)

In [6]:

# Collect the distinct (geo_id, geo_name) pairs from every bronze table of each API.

# The catalog listing does not depend on the API, so fetch it once up front.
all_tables = spark.catalog.listTables()  # returns list of Table objects

# Build the result from scratch so that re-running this cell does not union
# onto whatever a previous run left in df_geo_locations.
geo_frames = []
for api_name, fields in apis.items():
    name_pattern = f"brz_{api_name}_"
    # sorted() keeps the column order stable even if `fields` is a set
    select_list = ', '.join(sorted(fields))

    for t in all_tables:
        # Keep only the tables that belong to this API
        if not t.name.startswith(name_pattern):
            continue
        geo_frames.append(
            spark.sql(f"SELECT DISTINCT {select_list} FROM {t.name}")
        )

if not geo_frames:
    # Fail with a clear message instead of an AttributeError on None below
    raise ValueError(
        f"No bronze tables found for prefixes: "
        f"{', '.join(f'brz_{name}_' for name in apis)}"
    )

# Union by column name; DISTINCT was applied per table only, so duplicates
# across tables are still present at this point.
df_geo_locations = geo_frames[0]
for df in geo_frames[1:]:
    df_geo_locations = df_geo_locations.unionByName(df, allowMissingColumns=True)

df_geo_locations.head()

StatementMeta(, af8ad3be-8fc5-492d-9c69-252f489c5d8e, 8, Finished, Available, Finished)

Row(geo_id=17, geo_name='Galicia')

In [7]:
# Peek at the first 20 rows. The per-table results are unioned without any
# cross-table de-duplication, so the same geo_id can appear more than once here.
df_geo_locations.head(20)

StatementMeta(, af8ad3be-8fc5-492d-9c69-252f489c5d8e, 9, Finished, Available, Finished)

[Row(geo_id=17, geo_name='Galicia'),
 Row(geo_id=4, geo_name='Andalucía'),
 Row(geo_id=11, geo_name='Principado de Asturias'),
 Row(geo_id=18, geo_name='Islas Baleares'),
 Row(geo_id=17, geo_name='Galicia'),
 Row(geo_id=19, geo_name='Islas Canarias'),
 Row(geo_id=10, geo_name='País Vasco'),
 Row(geo_id=7, geo_name='Castilla-La Mancha'),
 Row(geo_id=4, geo_name='Andalucía'),
 Row(geo_id=11, geo_name='Principado de Asturias'),
 Row(geo_id=9, geo_name='Cataluña'),
 Row(geo_id=14, geo_name='Comunidad Foral de Navarra'),
 Row(geo_id=18, geo_name='Islas Baleares'),
 Row(geo_id=5, geo_name='Aragón'),
 Row(geo_id=15, geo_name='Comunidad Valenciana'),
 Row(geo_id=21, geo_name='Región de Murcia'),
 Row(geo_id=20, geo_name='La Rioja'),
 Row(geo_id=6, geo_name='Cantabria'),
 Row(geo_id=17, geo_name='Galicia'),
 Row(geo_id=10, geo_name='País Vasco')]

In [25]:
silver_table_name = "lh_silver.slv_cod_geo_location"


# Ensure the default row
# The key column is named location_id to match the schema that the load cell
# reads and writes (location_id, geo_id, geo_name).
default_row = [(0, 0, "Unknown")]  # (location_id, geo_id, geo_name)
df_default = spark_silver.createDataFrame(
    default_row, ["location_id", "geo_id", "geo_name"]
)

# Ask the catalog directly instead of probing with a bare `except:`, which
# would also hide unrelated failures (permissions, connectivity, interrupts).
table_exists = spark_silver.catalog.tableExists(silver_table_name)

if table_exists:
    df_existing = spark_silver.table(silver_table_name)
else:
    # Write initial table with default row
    (df_default.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(silver_table_name))
    print(f"Table {silver_table_name} created")

StatementMeta(, af8ad3be-8fc5-492d-9c69-252f489c5d8e, 27, Finished, Available, Finished)

Table lh_silver.slv_cod_geo_location created


In [32]:
# Load the geo-location dimension into Silver: create the table on the first
# run, otherwise append only the geo_ids that are not stored yet.

# 'Unknown' placeholder row (geo_id 0), added to the input if not present
df_unknown = spark_silver.createDataFrame([(0, "Unknown")], ["geo_id", "geo_name"])

# One row per geo_id, placeholder included. A single dropDuplicates after the
# union is enough; which row survives for a repeated geo_id is not defined.
df_geo_locations = (
    df_geo_locations.unionByName(df_unknown, allowMissingColumns=True)
    .dropDuplicates(["geo_id"])
)

# Global ordering by geo_id; an unpartitioned window moves all rows to a single
# partition, which is acceptable for a small dimension like this one.
id_window = Window.orderBy("geo_id")

if not spark_silver.catalog.tableExists(silver_table_name):
    # Assign incremental IDs starting from 0
    df_geo_locations = df_geo_locations.withColumn(
        "location_id", F.row_number().over(id_window) - 1
    )
    (
        df_geo_locations.select("location_id", "geo_id", "geo_name")
        .write.format("delta")
        .mode("overwrite")
        .saveAsTable(silver_table_name)
    )
    print(f"✅ Created {silver_table_name} with {df_geo_locations.count()} rows")

else:
    # Load existing table (same session as every other Silver access in this cell)
    delta_table = DeltaTable.forName(spark_silver, silver_table_name)
    df_existing = delta_table.toDF()

    # Highest ID in use; -1 for an empty table so the first new ID is 0,
    # matching the numbering of the create branch above.
    max_id = df_existing.agg(F.max("location_id")).collect()[0][0]
    if max_id is None:
        max_id = -1
    existing_geo_ids = [r["geo_id"] for r in df_existing.select("geo_id").collect()]

    # Filter new records only
    df_new = df_geo_locations.filter(~F.col("geo_id").isin(existing_geo_ids))

    # Count once and reuse the figure for both the guard and the message
    new_count = df_new.count()

    if new_count > 0:
        # Assign new IDs sequentially after the max existing ID. The column must
        # be called location_id (it is selected below), and it is cast to the
        # type already stored in the table so the append matches its schema.
        location_id_type = df_existing.schema["location_id"].dataType
        df_new = (
            df_new.withColumn("rn", F.row_number().over(id_window))
            .withColumn(
                "location_id",
                (F.col("rn") + F.lit(max_id)).cast(location_id_type),
            )
            .drop("rn")
        )
        (
            df_new.select("location_id", "geo_id", "geo_name")
            .write.format("delta")
            .mode("append")
            .saveAsTable(silver_table_name)
        )
        print(f"✅ Appended {new_count} new records")
    else:
        print("ℹ️ No new geo_ids to append")


StatementMeta(, af8ad3be-8fc5-492d-9c69-252f489c5d8e, 34, Finished, Available, Finished)

✅ Created lh_silver.slv_cod_geo_location with 32 rows


In [34]:
# Print the silver table sorted by location_id (at most 40 rows are shown).
spark_silver.table(silver_table_name).orderBy("location_id").show(40)


StatementMeta(, af8ad3be-8fc5-492d-9c69-252f489c5d8e, 36, Finished, Available, Finished)

+-----------+------+--------------------+
|location_id|geo_id|            geo_name|
+-----------+------+--------------------+
|          0|     0|             Unknown|
|          1|     1|            Portugal|
|          2|     2|             Francia|
|          3|     3|              España|
|          4|     4|           Andalucía|
|          5|     5|              Aragón|
|          6|     6|           Cantabria|
|          7|     7|  Castilla-La Mancha|
|          8|     8|     Castilla y León|
|          9|     9|            Cataluña|
|         10|    10|          País Vasco|
|         11|    11|Principado de Ast...|
|         12|    12|               Ceuta|
|         13|    13| Comunidad de Madrid|
|         14|    14|Comunidad Foral d...|
|         15|    15|Comunidad Valenciana|
|         16|    16|         Extremadura|
|         17|    17|             Galicia|
|         18|    18|      Islas Baleares|
|         19|    19|      Islas Canarias|
|         20|    20|            La