# Gold - Star Schema

### 1.0 - Configuration

In [0]:
%run ./00_utils

### 2.0 - Dimension: Locations

In [0]:
# Build the locations dimension from silver.locations, renaming the source
# `id` / `name` columns to `location_id` / `location_name`.
# NOTE(review): no de-duplication is applied here — this presumes
# silver.locations already holds one row per id; confirm upstream.
dim_locations_table = f"{CATALOG}.gold.dim_locations"

df_dim_locations = spark.sql(f"""
    SELECT
        id as location_id,
        name as location_name,
        locality,
        latitude,
        longitude,
        country_code,
        country_name,
        owner_name,
        provider_name,
        timezone
    FROM {CATALOG}.silver.locations
""")

# Replace the gold table's contents on every run.
df_dim_locations.write.mode("overwrite").saveAsTable(dim_locations_table)

# Count the table that was just written instead of calling
# df_dim_locations.count(): the DataFrame is lazy, so counting it would re-run
# the query against silver and would not necessarily match what was saved.
saved_rows = spark.table(dim_locations_table).count()
print(f"Saved {saved_rows} rows to gold.dim_locations")

### 3.0 - Dimension: Parameters

In [0]:
# Build the parameters dimension: a straight projection of four columns from
# silver.parameters.
dim_parameters_table = f"{CATALOG}.gold.dim_parameters"

df_dim_parameters = spark.sql(f"""
    SELECT
        parameter_id,
        parameter_name,
        parameter_units,
        parameter_display_name
    FROM {CATALOG}.silver.parameters
""")

# Replace the gold table's contents on every run.
df_dim_parameters.write.mode("overwrite").saveAsTable(dim_parameters_table)

# Count the table that was just written instead of calling
# df_dim_parameters.count(): the DataFrame is lazy, so counting it would re-run
# the query against silver and would not necessarily match what was saved.
saved_rows = spark.table(dim_parameters_table).count()
print(f"Saved {saved_rows} rows to gold.dim_parameters")

### 4.0 - Dimension: Sensors

In [0]:
# Build the sensors dimension: the distinct (sensor_id, sensor_name) pairs
# found in silver.sensors.
# NOTE(review): DISTINCT is over the pair, so a sensor_id that appears with
# more than one sensor_name would yield several rows — confirm this cannot
# happen in silver.
dim_sensors_table = f"{CATALOG}.gold.dim_sensors"

df_dim_sensors = spark.sql(f"""
    SELECT DISTINCT
        sensor_id,
        sensor_name
    FROM {CATALOG}.silver.sensors
""")

# Replace the gold table's contents on every run.
df_dim_sensors.write.mode("overwrite").saveAsTable(dim_sensors_table)

# Count the table that was just written instead of calling
# df_dim_sensors.count(): the DataFrame is lazy, so counting it would re-run
# the DISTINCT query against silver and would not necessarily match what was
# saved.
saved_rows = spark.table(dim_sensors_table).count()
print(f"Saved {saved_rows} rows to gold.dim_sensors")

### 5.0 - Dimension: Date

In [0]:
# Build the date dimension from the dates that actually occur in
# silver.measurements. `date_id` is the calendar date of `datetime_utc`; the
# remaining columns are calendar attributes derived from the same timestamp.
# Rows with a NULL datetime_utc are excluded, so there is no NULL date_id row.
dim_date_table = f"{CATALOG}.gold.dim_date"

df_dim_date = spark.sql(f"""
    SELECT DISTINCT
        CAST(DATE(datetime_utc) AS DATE) as date_id,
        YEAR(datetime_utc) as year,
        MONTH(datetime_utc) as month,
        DAY(datetime_utc) as day,
        DAYOFWEEK(datetime_utc) as day_of_week,
        DAYNAME(datetime_utc) as day_name,
        WEEKOFYEAR(datetime_utc) as week_of_year,
        QUARTER(datetime_utc) as quarter
    FROM {CATALOG}.silver.measurements
    WHERE datetime_utc IS NOT NULL
""")

# Replace the gold table's contents on every run.
df_dim_date.write.mode("overwrite").saveAsTable(dim_date_table)

# Count the table that was just written instead of calling
# df_dim_date.count(): the DataFrame is lazy, so counting it would re-run the
# DISTINCT over silver.measurements and would not necessarily match what was
# saved.
saved_rows = spark.table(dim_date_table).count()
print(f"Saved {saved_rows} rows to gold.dim_date")

### 6.0 - Fact: Measurements

In [0]:
# Build the measurements fact table from silver.measurements. The source
# `sensors_id` / `locations_id` columns are renamed to `sensor_id` /
# `location_id`, and `date_id` is the calendar date of `datetime_utc`.
# NOTE(review): this query has no `datetime_utc IS NOT NULL` filter while the
# dim_date build does, so any row with a NULL datetime_utc gets a NULL date_id
# that matches no dim_date row — confirm whether such rows can exist.
# NOTE(review): no parameter key is selected here, so gold.dim_parameters
# cannot be joined directly to this fact table — confirm whether
# silver.measurements (or silver.sensors) exposes one.
fact_measurements_table = f"{CATALOG}.gold.fact_measurements"

df_fact_measurements = spark.sql(f"""
    SELECT
        sensors_id as sensor_id,
        locations_id as location_id,
        CAST(DATE(datetime_utc) AS DATE) as date_id,
        datetime_utc,
        value,
        ingested_at
    FROM {CATALOG}.silver.measurements
""")

# Replace the gold table's contents on every run.
df_fact_measurements.write.mode("overwrite").saveAsTable(fact_measurements_table)

# Count the table that was just written instead of calling
# df_fact_measurements.count(): the DataFrame is lazy, so counting it would
# re-scan silver.measurements and would not necessarily match what was saved.
saved_rows = spark.table(fact_measurements_table).count()
print(f"Saved {saved_rows} rows to gold.fact_measurements")

### 7.0 - Verify Star Schema

In [0]:
# Sanity-check the gold layer: preview five rows of every table, then show all
# row counts side by side in a single result.
gold_tables = (
    "dim_locations",
    "dim_parameters",
    "dim_sensors",
    "dim_date",
    "fact_measurements",
)

for table_name in gold_tables:
    print(f"{table_name} samples")
    preview_query = f"SELECT * FROM {CATALOG}.gold.{table_name} LIMIT 5"
    spark.sql(preview_query).display()

print("Row Counts")
row_counts_query = f"""
    SELECT 'dim_locations' as table_name, COUNT(*) as rows FROM {CATALOG}.gold.dim_locations
    UNION ALL
    SELECT 'dim_parameters', COUNT(*) FROM {CATALOG}.gold.dim_parameters
    UNION ALL
    SELECT 'dim_sensors', COUNT(*) FROM {CATALOG}.gold.dim_sensors
    UNION ALL
    SELECT 'dim_date', COUNT(*) FROM {CATALOG}.gold.dim_date
    UNION ALL
    SELECT 'fact_measurements', COUNT(*) FROM {CATALOG}.gold.fact_measurements
"""
spark.sql(row_counts_query).display()