In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import re

# Get (or reuse) the active Spark session
spark = SparkSession.builder.getOrCreate()

# Every table registered in the lh_bronze schema
all_tables = spark.catalog.listTables("lh_bronze")

# Territories (geo_limits) and years for which a bronze table is expected
geo_limits = ["peninsula", "canarias", "baleares", "ceuta", "melilla"]
years = ["2023", "2024", "2025"]

# Expected table names: one per (geo_limit, year) combination
expected_patterns = [
    f"brz_open_meteo_{geo}_{year}"
    for geo in geo_limits
    for year in years
]

# Keep the catalog tables whose name is exactly one of the expected names
# (the catalog listing order is preserved)
matched_tables = []
for table_info in all_tables:
    if table_info.name in expected_patterns:
        matched_tables.append(table_info.name)

print(f"✅ Tablas encontradas ({len(matched_tables)}):")
for table_name in matched_tables:
    print(f" - {table_name}")

# Helper to derive the geo_limit from a table name
def extract_geo_limit_from_table_name(table_name, known_geo_limits=None):
    """Return the first known geo_limit that appears in ``table_name``.

    The match is a case-insensitive substring search, performed in the order
    in which the candidate geo_limits are listed.

    Args:
        table_name: Name of the table to inspect.
        known_geo_limits: Optional iterable of lowercase geo_limit names to
            look for. When omitted (``None``), the notebook-level
            ``geo_limits`` list is used, which keeps existing one-argument
            calls working unchanged.

    Returns:
        The first candidate found inside the lowercased table name, or the
        string ``"desconocido"`` when none of them matches.
    """
    candidates = geo_limits if known_geo_limits is None else known_geo_limits
    # Lowercase once instead of on every loop iteration
    lowered_name = table_name.lower()
    for geo in candidates:
        if geo in lowered_name:
            return geo
    return "desconocido"

# Read every matched bronze table, tag its rows with the geo_limit derived from
# the table name, and stack all of them into a single DataFrame.
if not matched_tables:
    # Fail early with an actionable message: without any table df_union would
    # stay None and the display() call below would raise an opaque AttributeError.
    raise ValueError(
        "No se encontró ninguna de las tablas brz_open_meteo_* esperadas en lh_bronze; "
        "no hay nada que unir."
    )

df_union = None

for table in matched_tables:
    df = spark.read.table(f"lh_bronze.{table}")
    geo_limit = extract_geo_limit_from_table_name(table)
    df = df.withColumn("geo_limit", lit(geo_limit))

    # Remove the 'territorio' column when present; DataFrame.drop is a no-op
    # for column names that are not in the schema, so no existence check is needed.
    df = df.drop('territorio')

    # unionByName matches columns by name rather than by position
    df_union = df if df_union is None else df_union.unionByName(df)

# Show a small sample of the combined data
display(df_union.limit(10))


StatementMeta(, 345c8f4b-9113-4e6a-97a7-6bd431130e43, 23, Finished, Available, Finished)

✅ Tablas encontradas (15):
 - brz_open_meteo_baleares_2023
 - brz_open_meteo_canarias_2023
 - brz_open_meteo_ceuta_2023
 - brz_open_meteo_melilla_2023
 - brz_open_meteo_peninsula_2023
 - brz_open_meteo_baleares_2024
 - brz_open_meteo_canarias_2024
 - brz_open_meteo_ceuta_2024
 - brz_open_meteo_melilla_2024
 - brz_open_meteo_peninsula_2024
 - brz_open_meteo_baleares_2025
 - brz_open_meteo_canarias_2025
 - brz_open_meteo_ceuta_2025
 - brz_open_meteo_melilla_2025
 - brz_open_meteo_peninsula_2025


SynapseWidget(Synapse.DataFrame, f726a79e-0e06-44d7-a32b-603f70fe4235)

In [22]:
# Print the distinct values of the geo_limit column
cols = ['geo_limit']

for c in cols:
    # Collect the distinct values once and derive the count from that list;
    # a separate .distinct().count() would run a second Spark job for the same answer.
    uniques = [row[c] for row in df_union.select(c).distinct().collect()]
    count_uniques = len(uniques)
    print(f"\n📌 Valores únicos en '{c}': {count_uniques}")
    print(uniques)

StatementMeta(, 345c8f4b-9113-4e6a-97a7-6bd431130e43, 24, Finished, Available, Finished)


📌 Valores únicos en 'geo_limit': 5
['baleares', 'canarias', 'ceuta', 'melilla', 'peninsula']


In [23]:
# Normalise "peninsula" to "peninsular" in geo_limit so it lines up with the other tables
df_union = df_union.replace("peninsula", "peninsular", subset=["geo_limit"])

StatementMeta(, 345c8f4b-9113-4e6a-97a7-6bd431130e43, 25, Finished, Available, Finished)

In [24]:
# Row count of the unioned DataFrame before any cleaning
initial_row_count = df_union.count()
print(f"Total de filas iniciales: {initial_row_count:,}")

StatementMeta(, 345c8f4b-9113-4e6a-97a7-6bd431130e43, 26, Finished, Available, Finished)

Total de filas iniciales: 5,080


In [25]:
# List the column names of the unioned DataFrame
column_names = df_union.columns
print(column_names)

StatementMeta(, 345c8f4b-9113-4e6a-97a7-6bd431130e43, 27, Finished, Available, Finished)

['date', 'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'ingestion_date', 'geo_limit']


In [26]:
from pyspark.sql import functions as F

# Drop the columns in which every value is null.
# F.count(column) only counts non-null values, so a count of 0 marks an all-null column.
count_exprs = [F.count(F.col(name)).alias(name) for name in df_union.columns]
non_null_counts = df_union.select(count_exprs).first().asDict()

# Columns with at least one non-null value are kept
cols_to_keep = [name for name, non_nulls in non_null_counts.items() if non_nulls > 0]
removed_cols = [name for name in df_union.columns if name not in cols_to_keep]

# New DataFrame without the all-null columns
df_no_null_cols = df_union.select(cols_to_keep)

print(f"Columnas eliminadas: {removed_cols}")
print(f"Total de columnas finales: {len(df_no_null_cols.columns)}")

# Show a small sample of the result
display(df_no_null_cols.limit(5))

StatementMeta(, 345c8f4b-9113-4e6a-97a7-6bd431130e43, 28, Finished, Available, Finished)

Columnas eliminadas: []
Total de columnas finales: 8


SynapseWidget(Synapse.DataFrame, 3c260195-698f-4e38-8ce1-2e5b8f4bb404)

In [27]:
from pyspark.sql import functions as F

# Derive the "year" and "month" columns from the "date" column
date_col = F.col("date")
df_no_null_cols = df_no_null_cols.withColumn("year", F.year(date_col))
df_no_null_cols = df_no_null_cols.withColumn("month", F.month(date_col))

display(df_no_null_cols.limit(3))

StatementMeta(, 345c8f4b-9113-4e6a-97a7-6bd431130e43, 29, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, fbdf5162-89fe-4dea-a88b-35c7314c86cc)

In [28]:
# Remove duplicate rows, comparing only the date, the measurement columns and geo_limit.
# ingestion_date (and the derived year/month columns) are left out of the comparison.
dedup_key_cols = [
    'date',
    'temperature_2m_max',
    'temperature_2m_min',
    'temperature_2m_mean',
    'precipitation_sum',
    'wind_speed_10m_max',
    'geo_limit',
]
df_clean = df_no_null_cols.dropDuplicates(dedup_key_cols)

display(df_clean.limit(5))
rows_after_dedup = df_clean.count()
print(f"Total después de limpiar duplicados: {rows_after_dedup:,}")

StatementMeta(, 345c8f4b-9113-4e6a-97a7-6bd431130e43, 30, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 917f37bd-4067-4140-af3b-46a94f6a28a9)

Total después de limpiar duplicados: 5,070


In [29]:
# Drop the columns considered unnecessary and/or that hold a single unique value
unneeded_cols = ["date", "ingestion_date"]
df_clean = df_clean.drop(*unneeded_cols)

display(df_clean.limit(2))

StatementMeta(, 345c8f4b-9113-4e6a-97a7-6bd431130e43, 31, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e2b95e26-0b43-4eb1-bfef-f5ebf02e11a2)

In [30]:
# Rename the measurement columns to shorter, easier-to-read names
from pyspark.sql.functions import col

# original column name -> new column name
column_renames = {
    "temperature_2m_max": "max_temp",
    "temperature_2m_min": "min_temp",
    "temperature_2m_mean": "mean_temp",
    "wind_speed_10m_max": "wind_speed",
}
for old_name, new_name in column_renames.items():
    df_clean = df_clean.withColumnRenamed(old_name, new_name)

display(df_clean.limit(2))

StatementMeta(, 345c8f4b-9113-4e6a-97a7-6bd431130e43, 32, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 047eecb3-2fa2-4b43-b1d8-a8b01b0a67f5)

In [31]:
# Print the distinct values of the geo_limit column after cleaning
cols = ['geo_limit']

for c in cols:
    # Collect the distinct values once and derive the count from that list;
    # a separate .distinct().count() would run a second Spark job for the same answer.
    uniques = [row[c] for row in df_clean.select(c).distinct().collect()]
    count_uniques = len(uniques)
    print(f"\n📌 Valores únicos en '{c}': {count_uniques}")
    print(uniques)

StatementMeta(, 345c8f4b-9113-4e6a-97a7-6bd431130e43, 33, Finished, Available, Finished)


📌 Valores únicos en 'geo_limit': 5
['baleares', 'canarias', 'ceuta', 'melilla', 'peninsular']


In [32]:
# Persist the cleaned data to the Silver layer, overwriting any existing table of the same name.
# NOTE(review): the table name ends in "cleanedd" (double "d") — looks like a typo;
# confirm with downstream consumers before renaming.
silver_table = "lh_silver.slv_open_meteo_day_cleanedd"
df_clean.write.mode("overwrite").saveAsTable(silver_table)
print(f"✅ La tabla {silver_table} se ha creado correctamente tras la limpieza.")

StatementMeta(, 345c8f4b-9113-4e6a-97a7-6bd431130e43, 34, Finished, Available, Finished)

✅ La tabla lh_silver.slv_open_meteo_day_cleanedd se ha creado correctamente tras la limpieza.
