In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, lit, monotonically_increasing_id, row_number,
    year, month, dayofmonth, quarter, dayofweek, date_format,
    when, concat_ws, floor
)
from pyspark.sql.window import Window

StatementMeta(, 47c73c09-62e2-4f61-9de1-7307e52ed9b8, 3, Finished, Available, Finished)

In [2]:
# ============================================
# CONFIGURATION
# ============================================
LAKEHOUSE_SILVER = "lh_silver"
LAKEHOUSE_GOLD = "lh_golden"

spark = SparkSession.builder.appName("Gold_Dimensions").getOrCreate()

print("=" * 80)
print("🗂️  CREATING GOLD DIMENSIONS (MONTHLY DATA ONLY)")
print("=" * 80)
print(f"📂 Source: {LAKEHOUSE_SILVER}")
print(f"📂 Target: {LAKEHOUSE_GOLD}\n")

StatementMeta(, 47c73c09-62e2-4f61-9de1-7307e52ed9b8, 4, Finished, Available, Finished)

🗂️  CREATING GOLD DIMENSIONS (MONTHLY DATA ONLY)
📂 Source: lh_silver
📂 Target: lh_golden



In [3]:
# ============================================
# 1. DIM_DATE
# ============================================
print("\n" + "=" * 80)
print("📅 1. CREATING dim_date")
print("=" * 80)

# ✅ SOLO TABLAS _month (eliminado _day y union)
df_dates = spark.table(f"{LAKEHOUSE_SILVER}.slv_redata_generacion_estructura_generacion_month") \
    .select("datetime") \
    .distinct()

print(f"  📊 Unique dates extracted: {df_dates.count():,}")

# Create date dimension
dim_date = df_dates.select(
    col("datetime").alias("date"),
    year("datetime").alias("year"),
    month("datetime").alias("month"),
    dayofmonth("datetime").alias("day"),
    quarter("datetime").alias("quarter"),
    (floor((month("datetime") - 1) / 6) + 1).cast("integer").alias("semester"),
    dayofweek("datetime").alias("day_of_week_num"),
    date_format("datetime", "EEEE").alias("day_of_week_name"),
    date_format("datetime", "MMMM").alias("month_name"),
    when(dayofweek("datetime").isin([1, 7]), True).otherwise(False).alias("is_weekend")
).distinct()

# Add surrogate key
window = Window.orderBy("date")
dim_date = dim_date.withColumn("date_key", row_number().over(window))

# Reorder columns
dim_date = dim_date.select(
    "date_key", "date", "year", "month", "day", "quarter", "semester",
    "day_of_week_num", "day_of_week_name", "month_name", "is_weekend"
)

print(f"  ✅ Records generated: {dim_date.count():,}")
print(f"  📋 Columns: {len(dim_date.columns)}")

# Write
dim_date.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(f"{LAKEHOUSE_GOLD}.dim_date")

print("  💾 Table dim_date created\n")

StatementMeta(, 47c73c09-62e2-4f61-9de1-7307e52ed9b8, 5, Finished, Available, Finished)


📅 1. CREATING dim_date
  📊 Unique dates extracted: 34
  ✅ Records generated: 34
  📋 Columns: 11
  💾 Table dim_date created



In [4]:
# ============================================
# 2. DIM_GEOGRAPHY
# ============================================
print("=" * 80)
print("🌍 2. CREATING dim_geography")
print("=" * 80)

# ✅ SOLO TABLAS _month (eliminado _day y union)
df_geo = spark.table(f"{LAKEHOUSE_SILVER}.slv_redata_generacion_estructura_generacion_month") \
    .select("geo_id", "geo_name") \
    .distinct()

print(f"  📊 Unique geographies: {df_geo.count():,}")

# Add surrogate key
window = Window.orderBy("geo_id")
dim_geography = df_geo.withColumn("geography_key", row_number().over(window))

# Reorder - only geo_id and geo_name
dim_geography = dim_geography.select(
    "geography_key", 
    "geo_id", 
    "geo_name"
)

print(f"  ✅ Records generated: {dim_geography.count():,}")

# Show sample to verify no duplicates
print("\n  📋 Sample records:")
dim_geography.orderBy("geo_id").show(10, truncate=False)

# Write
dim_geography.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(f"{LAKEHOUSE_GOLD}.dim_geography")

print("  💾 Table dim_geography created\n")


StatementMeta(, 47c73c09-62e2-4f61-9de1-7307e52ed9b8, 6, Finished, Available, Finished)

🌍 2. CREATING dim_geography
  📊 Unique geographies: 20
  ✅ Records generated: 20

  📋 Sample records:
+-------------+------+----------------------+
|geography_key|geo_id|geo_name              |
+-------------+------+----------------------+
|1            |4     |Andalucía             |
|2            |5     |Aragón                |
|3            |6     |Cantabria             |
|4            |7     |Castilla-La Mancha    |
|5            |8     |Castilla y León       |
|6            |9     |Cataluña              |
|7            |10    |País Vasco            |
|8            |11    |Principado de Asturias|
|9            |13    |Comunidad de Madrid   |
|10           |14    |Comunidad de Navarra  |
+-------------+------+----------------------+
only showing top 10 rows

  💾 Table dim_geography created



In [5]:
# ============================================
# 3. DIM_TECHNOLOGY
# ============================================
print("=" * 80)
print("⚡ 3. CREATING dim_technology")
print("=" * 80)

# ✅ PASO 1: Obtener categorías SOLO de tabla _month
df_base = spark.table(f"{LAKEHOUSE_SILVER}.slv_redata_generacion_estructura_generacion_month") \
    .select("series_type", "series_attribute_type") \
    .distinct()

print(f"  📊 Technologies in generation structure: {df_base.count():,}")

# ✅ PASO 2: Obtener emisiones SOLO de tabla _month
df_emissions = spark.table(f"{LAKEHOUSE_SILVER}.slv_redata_generacion_estructura_generacion_emisiones_asociadas_month") \
    .select("series_type", col("series_attribute_type").alias("emissions_attr")) \
    .distinct()

print(f"  📊 Technologies in emissions table: {df_emissions.count():,}")

# PASO 3: JOIN para combinar ambas fuentes
dim_technology = df_base.join(df_emissions, "series_type", "left")

# PASO 4: Crear columnas finales
dim_technology = dim_technology.withColumn(
    "category",
    col("series_attribute_type")
).withColumn(
    "has_co2_emissions",
    when(col("emissions_attr") == "Con emisiones de CO2 eq.", True)
    .when(col("emissions_attr") == "Sin emisiones de CO2 eq.", False)
    .otherwise(None)  # NULL para casos sin info
)

# Drop auxiliary columns
dim_technology = dim_technology.drop("series_attribute_type", "emissions_attr")

print(f"  ✅ Technologies processed: {dim_technology.count():,}")

# Add surrogate key
window = Window.orderBy("series_type")
dim_technology = dim_technology.withColumn("technology_key", row_number().over(window))

# Reorder
dim_technology = dim_technology.select(
    "technology_key", 
    "series_type", 
    "category", 
    "has_co2_emissions"
)

print(f"  ✅ Records generated: {dim_technology.count():,}")

# Verify distribution
print("\n  📊 Distribution by category:")
dim_technology.groupBy("category").count().orderBy("category").show()

print("\n  📊 CO2 emissions distribution:")
dim_technology.groupBy("has_co2_emissions").count().show()

print("\n  ✅ Technologies WITH emissions:")
dim_technology.filter(col("has_co2_emissions") == True) \
    .select("series_type", "category") \
    .orderBy("series_type") \
    .show(20, truncate=False)

print("\n  ❌ Technologies WITHOUT emissions:")
dim_technology.filter(col("has_co2_emissions") == False) \
    .select("series_type", "category") \
    .orderBy("series_type") \
    .show(20, truncate=False)

# Write
dim_technology.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(f"{LAKEHOUSE_GOLD}.dim_technology")

print("  💾 Table dim_technology created\n")

StatementMeta(, 47c73c09-62e2-4f61-9de1-7307e52ed9b8, 7, Finished, Available, Finished)

⚡ 3. CREATING dim_technology
  📊 Technologies in generation structure: 18
  📊 Technologies in emissions table: 16
  ✅ Technologies processed: 18
  ✅ Records generated: 18

  📊 Distribution by category:
+----------------+-----+
|        category|count|
+----------------+-----+
|Generación total|    1|
|    No-Renovable|    9|
|       Renovable|    8|
+----------------+-----+


  📊 CO2 emissions distribution:
+-----------------+-----+
|has_co2_emissions|count|
+-----------------+-----+
|             NULL|    1|
|             true|    9|
|            false|    8|
+-----------------+-----+


  ✅ Technologies WITH emissions:
+----------------------+------------+
|series_type           |category    |
+----------------------+------------+
|Carbón                |No-Renovable|
|Ciclo combinado       |No-Renovable|
|Cogeneración          |No-Renovable|
|Fuel + Gas            |No-Renovable|
|Motores diésel        |No-Renovable|
|Residuos no renovables|Renovable   |
|Residuos no renovables|No-Ren

In [6]:
# ============================================
# FINAL SUMMARY
# ============================================
print("=" * 80)
print("📊 SUMMARY OF CREATED DIMENSIONS")
print("=" * 80)

for dim_table in ["dim_date", "dim_geography", "dim_technology"]:
    count = spark.table(f"{LAKEHOUSE_GOLD}.{dim_table}").count()
    cols = len(spark.table(f"{LAKEHOUSE_GOLD}.{dim_table}").columns)
    print(f"  ✅ {dim_table}: {count:,} records, {cols} columns")

print("\n" + "=" * 80)
print("✅ DIMENSIONS COMPLETED (MONTHLY DATA)")
print("=" * 80)

StatementMeta(, 47c73c09-62e2-4f61-9de1-7307e52ed9b8, 8, Finished, Available, Finished)

📊 SUMMARY OF CREATED DIMENSIONS
  ✅ dim_date: 34 records, 11 columns
  ✅ dim_geography: 20 records, 3 columns
  ✅ dim_technology: 18 records, 4 columns

✅ DIMENSIONS COMPLETED (MONTHLY DATA)
