In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, coalesce

StatementMeta(, 84dc1672-c5f3-4b95-8b23-da532d192b93, 3, Finished, Available, Finished)

In [2]:
# ============================================
# CONFIGURATION
# ============================================
LAKEHOUSE_SILVER = "lh_silver"
LAKEHOUSE_GOLD = "lh_golden"

spark = SparkSession.builder.appName("Gold_Facts").getOrCreate()

print("=" * 80)
print("üìä CREATING GOLD FACT TABLES (MONTHLY DATA ONLY)")
print("=" * 80)
print(f"üìÇ Source: {LAKEHOUSE_SILVER}")
print(f"üìÇ Target: {LAKEHOUSE_GOLD}\n")

StatementMeta(, 84dc1672-c5f3-4b95-8b23-da532d192b93, 4, Finished, Available, Finished)

üìä CREATING GOLD FACT TABLES (MONTHLY DATA ONLY)
üìÇ Source: lh_silver
üìÇ Target: lh_golden



In [3]:
# ============================================
# LOAD DIMENSIONS
# ============================================
print("üîó Loading dimensions...")
dim_date = spark.table(f"{LAKEHOUSE_GOLD}.dim_date")
dim_geography = spark.table(f"{LAKEHOUSE_GOLD}.dim_geography")
dim_technology = spark.table(f"{LAKEHOUSE_GOLD}.dim_technology")
print(f"  ‚úÖ dim_date: {dim_date.count():,}")
print(f"  ‚úÖ dim_geography: {dim_geography.count():,}")
print(f"  ‚úÖ dim_technology: {dim_technology.count():,}\n")

StatementMeta(, 84dc1672-c5f3-4b95-8b23-da532d192b93, 5, Finished, Available, Finished)

üîó Loading dimensions...
  ‚úÖ dim_date: 34
  ‚úÖ dim_geography: 20
  ‚úÖ dim_technology: 18



In [4]:
# ============================================
# 1. FACT_GENERATION_MONTH
# ============================================
print("=" * 80)
print("‚ö° 1. CREATING fact_generation_month")
print("=" * 80)

# Read monthly generation data
df_gen_month = spark.table(f"{LAKEHOUSE_SILVER}.slv_redata_generacion_estructura_generacion_month") \
    .select("datetime", "geo_id", "series_type", "value", "percentage")

print(f"  üìä Records in Silver: {df_gen_month.count():,}")

# Join with dimensions
fact_gen_month = df_gen_month \
    .join(dim_date, df_gen_month.datetime == dim_date.date, "left") \
    .join(dim_geography, df_gen_month.geo_id == dim_geography.geo_id, "left") \
    .join(dim_technology, df_gen_month.series_type == dim_technology.series_type, "left") \
    .select(
        col("date_key"),
        col("geography_key"),
        col("technology_key"),
        col("value").alias("generation_mwh"),
        col("percentage").alias("generation_percentage"),
        year("datetime").alias("year")
    )

# Validate nulls
nulls_date = fact_gen_month.filter(col("date_key").isNull()).count()
nulls_geo = fact_gen_month.filter(col("geography_key").isNull()).count()
nulls_tech = fact_gen_month.filter(col("technology_key").isNull()).count()

if nulls_date > 0 or nulls_geo > 0 or nulls_tech > 0:
    print(f"  ‚ö†Ô∏è  Nulls detected - date: {nulls_date}, geo: {nulls_geo}, tech: {nulls_tech}")
else:
    print(f"  ‚úÖ No nulls in foreign keys")

print(f"  ‚úÖ Final records: {fact_gen_month.count():,}")

# Write with partitioning
fact_gen_month.write.format("delta").mode("overwrite") \
    .partitionBy("year") \
    .option("overwriteSchema", "true") \
    .saveAsTable(f"{LAKEHOUSE_GOLD}.fact_generation_month")

print("  üíæ Table fact_generation_month created (partitioned by year)\n")


StatementMeta(, 84dc1672-c5f3-4b95-8b23-da532d192b93, 6, Finished, Available, Finished)

‚ö° 1. CREATING fact_generation_month
  üìä Records in Silver: 5,186
  ‚úÖ No nulls in foreign keys
  ‚úÖ Final records: 5,559
  üíæ Table fact_generation_month created (partitioned by year)



In [5]:
# ============================================
# FINAL SUMMARY
# ============================================
print("=" * 80)
print("üìä SUMMARY OF CREATED FACTS")
print("=" * 80)

tables = [
    ("fact_generation_month", True),
]

for table_name, is_partitioned in tables:
    df_check = spark.table(f"{LAKEHOUSE_GOLD}.{table_name}")
    count = df_check.count()
    cols = len(df_check.columns)
    partition_info = " (partitioned)" if is_partitioned else ""
    print(f"  ‚úÖ {table_name}: {count:,} records, {cols} columns{partition_info}")

print("\n" + "=" * 80)
print("‚úÖ STAR SCHEMA COMPLETED (MONTHLY DATA)")
print("=" * 80)
print("\nüìå NEXT STEPS:")
print("  1Ô∏è‚É£  Create Semantic Model in Power BI")
print("  2Ô∏è‚É£  Define relationships between tables")
print("  3Ô∏è‚É£  Create DAX measures (Total Generation, % Renewable, etc.)")
print("  4Ô∏è‚É£  Develop visualizations")

StatementMeta(, 84dc1672-c5f3-4b95-8b23-da532d192b93, 7, Finished, Available, Finished)

üìä SUMMARY OF CREATED FACTS
  ‚úÖ fact_generation_month: 5,559 records, 6 columns (partitioned)

‚úÖ STAR SCHEMA COMPLETED (MONTHLY DATA)

üìå NEXT STEPS:
  1Ô∏è‚É£  Create Semantic Model in Power BI
  2Ô∏è‚É£  Define relationships between tables
  3Ô∏è‚É£  Create DAX measures (Total Generation, % Renewable, etc.)
  4Ô∏è‚É£  Develop visualizations
