In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, lit, monotonically_increasing_id, row_number,
    year, month, dayofmonth, quarter, dayofweek, date_format,
    when, concat_ws, floor
)
from pyspark.sql.window import Window

StatementMeta(, 47c73c09-62e2-4f61-9de1-7307e52ed9b8, 3, Finished, Available, Finished)

In [2]:
# --------------------------------------------
# Notebook configuration
# --------------------------------------------
# Lakehouse names: the dimension cells read from the silver lakehouse and
# write their tables to the gold lakehouse.
LAKEHOUSE_SILVER = "lh_silver"
LAKEHOUSE_GOLD = "lh_golden"

# getOrCreate() returns the already-active session when one exists, otherwise
# it starts a new one under this application name.
spark = (
    SparkSession.builder
    .appName("Gold_Dimensions")
    .getOrCreate()
)

# Start-up banner: an 80-character rule above and below the title, followed
# by the source and target lakehouse names.
banner_rule = "=" * 80
print(banner_rule)
print("üóÇÔ∏è  CREATING GOLD DIMENSIONS (MONTHLY DATA ONLY)")
print(banner_rule)
print(f"üìÇ Source: {LAKEHOUSE_SILVER}")
print(f"üìÇ Target: {LAKEHOUSE_GOLD}\n")

StatementMeta(, 47c73c09-62e2-4f61-9de1-7307e52ed9b8, 4, Finished, Available, Finished)

üóÇÔ∏è  CREATING GOLD DIMENSIONS (MONTHLY DATA ONLY)
üìÇ Source: lh_silver
üìÇ Target: lh_golden



In [3]:
# --------------------------------------------
# 1. DIM_DATE
# --------------------------------------------
section_rule = "=" * 80
print("\n" + section_rule)
print("üìÖ 1. CREATING dim_date")
print(section_rule)

# Only the monthly (_month) table is read; the daily (_day) table and the
# union with it were removed.
generation_month = spark.table(
    f"{LAKEHOUSE_SILVER}.slv_redata_generacion_estructura_generacion_month"
)
df_dates = generation_month.select("datetime").distinct()

print(f"  üìä Unique dates extracted: {df_dates.count():,}")

# Calendar attributes derived from every distinct `datetime` value.
# NOTE(review): if `datetime` is a timestamp column, Spark resolves the
# year/month/day parts in the session time zone — confirm the silver column
# type and the session time zone so month boundaries are not shifted.
source_dt = col("datetime")
date_attributes = [
    source_dt.alias("date"),
    year(source_dt).alias("year"),
    month(source_dt).alias("month"),
    dayofmonth(source_dt).alias("day"),
    quarter(source_dt).alias("quarter"),
    # Months 1-6 map to semester 1, months 7-12 to semester 2.
    (floor((month(source_dt) - 1) / 6) + 1).cast("integer").alias("semester"),
    dayofweek(source_dt).alias("day_of_week_num"),
    date_format(source_dt, "EEEE").alias("day_of_week_name"),
    date_format(source_dt, "MMMM").alias("month_name"),
    # dayofweek() numbers Sunday as 1 and Saturday as 7.
    when(dayofweek(source_dt).isin([1, 7]), True).otherwise(False).alias("is_weekend"),
]
date_rows = df_dates.select(*date_attributes).distinct()

# Surrogate key: consecutive integers assigned in ascending date order.
# The window has no partitioning, so Spark sorts all rows in one partition.
window = Window.orderBy("date")
dim_date = date_rows.withColumn("date_key", row_number().over(window)).select(
    "date_key",
    "date",
    "year",
    "month",
    "day",
    "quarter",
    "semester",
    "day_of_week_num",
    "day_of_week_name",
    "month_name",
    "is_weekend",
)

print(f"  ‚úÖ Records generated: {dim_date.count():,}")
print(f"  üìã Columns: {len(dim_date.columns)}")

# Replace the gold table, including its schema, with the fresh dimension.
(
    dim_date.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{LAKEHOUSE_GOLD}.dim_date")
)

print("  üíæ Table dim_date created\n")

StatementMeta(, 47c73c09-62e2-4f61-9de1-7307e52ed9b8, 5, Finished, Available, Finished)


üìÖ 1. CREATING dim_date
  üìä Unique dates extracted: 34
  ‚úÖ Records generated: 34
  üìã Columns: 11
  üíæ Table dim_date created



In [4]:
# ============================================
# 2. DIM_GEOGRAPHY
# ============================================
# Builds the geography dimension (geography_key, geo_id, geo_name) from the
# distinct geo_id / geo_name pairs of the monthly generation table and
# overwrites the gold table with it.
print("=" * 80)
print("üåç 2. CREATING dim_geography")
print("=" * 80)

# Only the monthly (_month) table is read; the daily (_day) table and the
# union with it were removed.
df_geo = spark.table(f"{LAKEHOUSE_SILVER}.slv_redata_generacion_estructura_generacion_month") \
    .select("geo_id", "geo_name") \
    .distinct()

# Adding the surrogate key below does not add or remove rows, so this single
# count serves both progress messages.
geo_count = df_geo.count()
print(f"  üìä Unique geographies: {geo_count:,}")

# Guard the natural key: distinct() works on (geo_id, geo_name) pairs, so a
# geo_id that appears with two different names would get two rows and two
# surrogate keys. Fail before overwriting the gold table instead of relying
# on a visual check of the sample below.
duplicated_geo_ids = (
    df_geo.groupBy("geo_id")
    .count()
    .filter(col("count") > 1)
    .take(5)
)
if duplicated_geo_ids:
    raise ValueError(
        "dim_geography: geo_id is not unique in the source; examples: "
        f"{[row['geo_id'] for row in duplicated_geo_ids]}"
    )

# Surrogate key: consecutive integers assigned in ascending geo_id order.
# The window has no partitioning, so Spark sorts all rows in one partition.
window = Window.orderBy("geo_id")
dim_geography = df_geo.withColumn("geography_key", row_number().over(window))

# Final column order - only the key, geo_id and geo_name
dim_geography = dim_geography.select(
    "geography_key",
    "geo_id",
    "geo_name"
)

print(f"  ‚úÖ Records generated: {geo_count:,}")

# Show a sample for visual inspection
print("\n  üìã Sample records:")
dim_geography.orderBy("geo_id").show(10, truncate=False)

# Replace the gold table, including its schema, with the fresh dimension.
dim_geography.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(f"{LAKEHOUSE_GOLD}.dim_geography")

print("  üíæ Table dim_geography created\n")


StatementMeta(, 47c73c09-62e2-4f61-9de1-7307e52ed9b8, 6, Finished, Available, Finished)

üåç 2. CREATING dim_geography
  üìä Unique geographies: 20
  ‚úÖ Records generated: 20

  üìã Sample records:
+-------------+------+----------------------+
|geography_key|geo_id|geo_name              |
+-------------+------+----------------------+
|1            |4     |Andalucía             |
|2            |5     |Aragón                |
|3            |6     |Cantabria             |
|4            |7     |Castilla-La Mancha    |
|5            |8     |Castilla y León       |
|6            |9     |Cataluña              |
|7            |10    |País Vasco            |
|8            |11    |Principado de Asturias|
|9            |13    |Comunidad de Madrid   |
|10           |14    |Comunidad de Navarra  |
+-------------+------+----------------------+
only showing top 10 rows

  üíæ Table dim_geography created



In [5]:
# ============================================
# 3. DIM_TECHNOLOGY
# ============================================
# Builds the technology dimension (technology_key, series_type, category,
# has_co2_emissions) by combining the monthly generation-structure table with
# the monthly associated-emissions table, then overwrites the gold table.
print("=" * 80)
print("‚ö° 3. CREATING dim_technology")
print("=" * 80)

# STEP 1: technology categories, taken from the monthly (_month) table only.
df_base = spark.table(f"{LAKEHOUSE_SILVER}.slv_redata_generacion_estructura_generacion_month") \
    .select("series_type", "series_attribute_type") \
    .distinct()

print(f"  üìä Technologies in generation structure: {df_base.count():,}")

# STEP 2: emissions attribute per technology, from the monthly (_month)
# emissions table only. The attribute is renamed so it does not clash with
# the category column after the join.
df_emissions = spark.table(f"{LAKEHOUSE_SILVER}.slv_redata_generacion_estructura_generacion_emisiones_asociadas_month") \
    .select("series_type", col("series_attribute_type").alias("emissions_attr")) \
    .distinct()

print(f"  üìä Technologies in emissions table: {df_emissions.count():,}")

# STEP 3: left join so technologies without an emissions row are kept.
dim_technology = df_base.join(df_emissions, "series_type", "left")

# STEP 4: build the final columns.
dim_technology = dim_technology.withColumn(
    "category",
    col("series_attribute_type")
).withColumn(
    "has_co2_emissions",
    when(col("emissions_attr") == "Con emisiones de CO2 eq.", True)
    .when(col("emissions_attr") == "Sin emisiones de CO2 eq.", False)
    .otherwise(None)  # NULL when there is no emissions information
)

# Drop auxiliary columns
dim_technology = dim_technology.drop("series_attribute_type", "emissions_attr")

# The surrogate key and column reorder below do not add or remove rows, so
# this single count serves both progress messages.
technology_count = dim_technology.count()
print(f"  ‚úÖ Technologies processed: {technology_count:,}")

# Guard the natural key: a series_type with more than one category or more
# than one emissions attribute fans out in the join above and would receive
# several surrogate keys. Fail before overwriting the gold table.
duplicated_series = (
    dim_technology.groupBy("series_type")
    .count()
    .filter(col("count") > 1)
    .take(5)
)
if duplicated_series:
    raise ValueError(
        "dim_technology: series_type is not unique after joining emissions; "
        f"examples: {[row['series_type'] for row in duplicated_series]}"
    )

# Surrogate key: consecutive integers assigned in ascending series_type order.
# The window has no partitioning, so Spark sorts all rows in one partition.
window = Window.orderBy("series_type")
dim_technology = dim_technology.withColumn("technology_key", row_number().over(window))

# Final column order
dim_technology = dim_technology.select(
    "technology_key",
    "series_type",
    "category",
    "has_co2_emissions"
)

print(f"  ‚úÖ Records generated: {technology_count:,}")

# Verify distribution
print("\n  üìä Distribution by category:")
dim_technology.groupBy("category").count().orderBy("category").show()

print("\n  üìä CO2 emissions distribution:")
dim_technology.groupBy("has_co2_emissions").count().show()

# Rows whose flag is NULL appear in neither of the two listings below.
print("\n  ‚úÖ Technologies WITH emissions:")
dim_technology.filter(col("has_co2_emissions")) \
    .select("series_type", "category") \
    .orderBy("series_type") \
    .show(20, truncate=False)

print("\n  ‚ùå Technologies WITHOUT emissions:")
dim_technology.filter(~col("has_co2_emissions")) \
    .select("series_type", "category") \
    .orderBy("series_type") \
    .show(20, truncate=False)

# Replace the gold table, including its schema, with the fresh dimension.
dim_technology.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(f"{LAKEHOUSE_GOLD}.dim_technology")

print("  üíæ Table dim_technology created\n")

StatementMeta(, 47c73c09-62e2-4f61-9de1-7307e52ed9b8, 7, Finished, Available, Finished)

‚ö° 3. CREATING dim_technology
  üìä Technologies in generation structure: 18
  üìä Technologies in emissions table: 16
  ‚úÖ Technologies processed: 18
  ‚úÖ Records generated: 18

  üìä Distribution by category:
+----------------+-----+
|        category|count|
+----------------+-----+
|Generación total|    1|
|    No-Renovable|    9|
|       Renovable|    8|
+----------------+-----+


  üìä CO2 emissions distribution:
+-----------------+-----+
|has_co2_emissions|count|
+-----------------+-----+
|             NULL|    1|
|             true|    9|
|            false|    8|
+-----------------+-----+


  ‚úÖ Technologies WITH emissions:
+----------------------+------------+
|series_type           |category    |
+----------------------+------------+
|Carbón                |No-Renovable|
|Ciclo combinado       |No-Renovable|
|Cogeneración          |No-Renovable|
|Fuel + Gas            |No-Renovable|
|Motores diésel        |No-Renovable|
|Residuos no renovables|Renovable   |
|Resid

In [6]:
# --------------------------------------------
# FINAL SUMMARY
# --------------------------------------------
summary_rule = "=" * 80
print(summary_rule)
print("üìä SUMMARY OF CREATED DIMENSIONS")
print(summary_rule)

# Read each dimension back from the gold lakehouse and report its size, so
# the figures reflect what was actually written rather than the in-memory
# DataFrames.
for dim_table in ("dim_date", "dim_geography", "dim_technology"):
    written_table = spark.table(f"{LAKEHOUSE_GOLD}.{dim_table}")
    record_count = written_table.count()
    column_count = len(written_table.columns)
    print(f"  ‚úÖ {dim_table}: {record_count:,} records, {column_count} columns")

print("\n" + summary_rule)
print("‚úÖ DIMENSIONS COMPLETED (MONTHLY DATA)")
print(summary_rule)

StatementMeta(, 47c73c09-62e2-4f61-9de1-7307e52ed9b8, 8, Finished, Available, Finished)

üìä SUMMARY OF CREATED DIMENSIONS
  ‚úÖ dim_date: 34 records, 11 columns
  ‚úÖ dim_geography: 20 records, 3 columns
  ‚úÖ dim_technology: 18 records, 4 columns

‚úÖ DIMENSIONS COMPLETED (MONTHLY DATA)
