In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import re

# Get (or reuse) the active Spark session
spark = SparkSession.builder.getOrCreate()

# Catalog entries for every table registered under the lh_bronze schema
all_tables = spark.catalog.listTables("lh_bronze")

# Geographic areas and years for which a bronze table is expected
geo_limits = ["peninsula", "canarias", "baleares", "ceuta", "melilla"]
years = ["2023", "2024", "2025"]

# Exact table names to look for: one per (geo_limit, year) combination
expected_patterns = [
    f"brz_open_meteo_{geo}_{year}"
    for geo in geo_limits
    for year in years
]

# Keep only the catalog tables whose name is one of the expected names
# (catalog order is preserved)
matched_tables = [t.name for t in all_tables if t.name in expected_patterns]

print(f"Tablas encontradas ({len(matched_tables)}):")
for name in matched_tables:
    print(f" - {name}")

# Helper that derives the geo_limit label from a table name
def extract_geo_limit_from_table_name(table_name, known_geo_limits=None):
    """Return the first known geo_limit that appears in ``table_name``.

    The comparison is a case-insensitive substring match against the
    lower-cased table name; candidates are tried in list order and the
    first hit wins.

    Args:
        table_name: Name of the table to inspect.
        known_geo_limits: Optional iterable of geo_limit labels to search
            for. When omitted, the module-level ``geo_limits`` list is used,
            which keeps existing single-argument calls working unchanged.

    Returns:
        The matching geo_limit label, or ``"desconocido"`` if none matches.
    """
    candidates = geo_limits if known_geo_limits is None else known_geo_limits
    # Lower-case once instead of on every loop iteration
    normalized_name = table_name.lower()
    for geo in candidates:
        if geo in normalized_name:
            return geo
    return "desconocido"

# Read every matched bronze table, tag its rows with the geo_limit derived
# from the table name, and stack everything into a single DataFrame.
if not matched_tables:
    # Without this guard df_union would stay None and the display() call
    # below would fail with an unhelpful AttributeError.
    raise ValueError(
        "No se encontró ninguna tabla brz_open_meteo_* esperada en lh_bronze"
    )

df_union = None

for table in matched_tables:
    df = spark.read.table(f"lh_bronze.{table}")
    geo_limit = extract_geo_limit_from_table_name(table)

    # Same constant geo_limit value for every row of this table
    df = df.withColumn("geo_limit", lit(geo_limit))

    # The first table seeds the result; later ones are appended, matching
    # columns by name rather than by position
    if df_union is None:
        df_union = df
    else:
        df_union = df_union.unionByName(df)

display(df_union.limit(10))


StatementMeta(, cd2c2b91-3a83-457e-a301-c2dfb2808231, 3, Finished, Available, Finished)

Tablas encontradas (15):
 - brz_open_meteo_baleares_2023
 - brz_open_meteo_canarias_2023
 - brz_open_meteo_ceuta_2023
 - brz_open_meteo_melilla_2023
 - brz_open_meteo_peninsula_2023
 - brz_open_meteo_baleares_2024
 - brz_open_meteo_canarias_2024
 - brz_open_meteo_ceuta_2024
 - brz_open_meteo_melilla_2024
 - brz_open_meteo_peninsula_2024
 - brz_open_meteo_baleares_2025
 - brz_open_meteo_canarias_2025
 - brz_open_meteo_ceuta_2025
 - brz_open_meteo_melilla_2025
 - brz_open_meteo_peninsula_2025


SynapseWidget(Synapse.DataFrame, 04eac09d-7a80-4d67-9d0c-c2beea18aa66)

In [2]:
# Row count of the unioned bronze data before any cleaning step
initial_row_count = df_union.count()
print(f"Total de filas iniciales: {initial_row_count:,}")

StatementMeta(, cd2c2b91-3a83-457e-a301-c2dfb2808231, 4, Finished, Available, Finished)

Total de filas iniciales: 5,080


In [3]:
# Show the column names of the unioned DataFrame
print(df_union.columns)

StatementMeta(, cd2c2b91-3a83-457e-a301-c2dfb2808231, 5, Finished, Available, Finished)

['territorio', 'date', 'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'ingestion_date', 'geo_limit']


In [4]:
from pyspark.sql import functions as F

# Drop the columns whose values are all null.
# F.count(<column>) counts only non-null values, so a count of 0 means the
# column is entirely null.
non_null_counts = (
    df_union
    .select([F.count(F.col(name)).alias(name) for name in df_union.columns])
    .first()
    .asDict()
)

# Columns that hold at least one non-null value
cols_to_keep = [name for name, non_nulls in non_null_counts.items() if non_nulls > 0]

# New DataFrame without the fully-null columns
df_no_null_cols = df_union.select(cols_to_keep)

dropped_cols = [name for name in df_union.columns if name not in cols_to_keep]
print(f"Columnas eliminadas: {dropped_cols}")
print(f"Total de columnas finales: {len(df_no_null_cols.columns)}")

# Preview the result
display(df_no_null_cols.limit(5))

StatementMeta(, cd2c2b91-3a83-457e-a301-c2dfb2808231, 6, Finished, Available, Finished)

Columnas eliminadas: []
Total de columnas finales: 9


SynapseWidget(Synapse.DataFrame, 2b3cd102-5daf-42d4-89bf-7c45f66975bd)

In [6]:
from pyspark.sql import functions as F

# Derive the "year" and "month" columns from the "date" column
year_col = F.year("date")
month_col = F.month("date")

df_no_null_cols = df_no_null_cols.withColumn("year", year_col).withColumn("month", month_col)

display(df_no_null_cols.limit(3))

StatementMeta(, cd2c2b91-3a83-457e-a301-c2dfb2808231, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 36f2476b-e579-41d1-b032-7b76283e0bad)

In [7]:
# Remove duplicate rows, comparing every column except "ingestion_date", so
# rows that differ only in that column count as duplicates.
# The subset is derived from the DataFrame's actual columns instead of a
# hard-coded list: an earlier cell may have dropped fully-null columns, and a
# fixed list would then reference a column that no longer exists.
dedup_cols = [c for c in df_no_null_cols.columns if c != "ingestion_date"]
df_clean = df_no_null_cols.dropDuplicates(dedup_cols)

display(df_clean.limit(5))
print(f"Total después de limpiar duplicados: {df_clean.count():,}")

StatementMeta(, cd2c2b91-3a83-457e-a301-c2dfb2808231, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e91f2f15-19b9-4dbf-b9ad-32e0e22ba15d)

Total después de limpiar duplicados: 5,080


In [8]:
# Drop the columns considered unnecessary and/or holding a single unique value.
# NOTE(review): once "date" is dropped, only the "year" and "month" columns
# derived from it remain — confirm day-level granularity is not needed downstream.
columns_to_drop = ["territorio", "date"]
df_clean = df_clean.drop(*columns_to_drop)

display(df_clean.limit(2))

StatementMeta(, cd2c2b91-3a83-457e-a301-c2dfb2808231, 10, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, dca6108f-a52c-474e-ab33-61f4293ff0f8)

In [9]:
# renombramos las columnas para mejor comprensión
from pyspark.sql.functions import col

# Source column name -> shorter, more readable name
renamed_columns = {
    "temperature_2m_max": "max_temp",
    "temperature_2m_min": "min_temp",
    "temperature_2m_mean": "mean_temp",
    "wind_speed_10m_max": "wind_speed",
}

for old_name, new_name in renamed_columns.items():
    df_clean = df_clean.withColumnRenamed(old_name, new_name)

display(df_clean.limit(2))

StatementMeta(, cd2c2b91-3a83-457e-a301-c2dfb2808231, 11, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, c1f4dcf0-b9e0-497c-bf73-d6eb6f2acf3e)

In [10]:
# Persist the cleaned data as a table in the Silver layer, overwriting any
# existing contents
df_clean.write.saveAsTable("lh_silver.slv_open_meteo_day_cleaned", mode="overwrite")
print("✅ La tabla lh_silver.slv_open_meteo_day_cleaned se ha creado correctamente tras la limpieza.")

StatementMeta(, cd2c2b91-3a83-457e-a301-c2dfb2808231, 12, Finished, Available, Finished)

✅ La tabla lh_silver.slv_open_meteo_day_cleaned se ha creado correctamente tras la limpieza.
