# Silver - Transformations

### 1.0 - Configuration

In [0]:
%run ./00_utils

### 2.0 - Silver Locations

In [0]:
from pyspark.sql import functions as F

# Silver locations: keep only the most recently ingested row per location id
# (QUALIFY ROW_NUMBER() ... ORDER BY ingested_at DESC = 1) and flatten the JSON
# string columns (coordinates, country, owner, provider, datetimeLast) into
# typed scalar columns.
df_locations_silver = spark.sql(f"""
    SELECT 
        id,
        name,
        locality,
        timezone,
        isMobile,
        isMonitor,
        CAST(get_json_object(coordinates, '$.latitude') AS DOUBLE) as latitude,
        CAST(get_json_object(coordinates, '$.longitude') AS DOUBLE) as longitude,
        get_json_object(country, '$.code') as country_code,
        get_json_object(country, '$.name') as country_name,
        get_json_object(owner, '$.name') as owner_name,
        get_json_object(provider, '$.name') as provider_name,
        CAST(get_json_object(datetimeLast, '$.utc') AS TIMESTAMP) as last_update_utc,
        ingested_at
    FROM {CATALOG}.{SCHEMA}.locations
    QUALIFY ROW_NUMBER() OVER (PARTITION BY id ORDER BY ingested_at DESC) = 1
""")

# Data quality: remove invalid coordinates (missing, or outside the valid
# latitude [-90, 90] / longitude [-180, 180] ranges).
df_locations_clean = df_locations_silver.filter(
    (F.col("latitude").isNotNull()) & 
    (F.col("longitude").isNotNull()) &
    (F.col("latitude").between(-90, 90)) &
    (F.col("longitude").between(-180, 180))
)

locations_target = f"{CATALOG}.silver.locations"

# Every DataFrame.count() is a Spark action that re-executes the query above.
# Count the pre-filter frame exactly once, and read the number of saved rows
# back from the table that was just written instead of recomputing the
# filtered frame (the original print triggered three extra full executions).
locations_total = df_locations_silver.count()
df_locations_clean.write.mode("overwrite").saveAsTable(locations_target)
locations_saved = spark.table(locations_target).count()

print(f"Saved {locations_saved} locations to silver (removed {locations_total - locations_saved} invalid)")

### 3.0 - Silver Sensors

In [0]:
# Silver sensors: one row per (sensor, location).
# The CTE keeps the most recently ingested row per location id and parses the
# `sensors` JSON string into an array of structs; the outer query explodes that
# array into one row per sensor, drops entries without a sensor id, and
# de-duplicates the projected columns.
silver_sensors_table = f"{CATALOG}.silver.sensors"

sensors_query = f"""
    WITH parsed_locations AS (
        SELECT 
            id as location_id,
            from_json(sensors, 'ARRAY<STRUCT<id: INT, name: STRING, parameter: STRUCT<id: INT, name: STRING, units: STRING, displayName: STRING>>>') as sensors_array,
            ingested_at
        FROM {CATALOG}.{SCHEMA}.locations
        QUALIFY ROW_NUMBER() OVER (PARTITION BY id ORDER BY ingested_at DESC) = 1
    )
    SELECT DISTINCT
        sensor.id as sensor_id,
        sensor.name as sensor_name,
        sensor.parameter.id as parameter_id,
        location_id
    FROM parsed_locations
    LATERAL VIEW explode(sensors_array) AS sensor
    WHERE sensor.id IS NOT NULL
"""
df_sensors_silver = spark.sql(sensors_query)

# Replace the silver table with the freshly derived sensor rows.
(
    df_sensors_silver.write
    .mode("overwrite")
    .saveAsTable(silver_sensors_table)
)

sensors_saved = df_sensors_silver.count()
print(f"Saved {sensors_saved} sensors to silver")

### 4.0 - Silver Parameters

In [0]:
# Silver parameters: the distinct measured parameters (id, name, units,
# display name) found inside the `sensors` JSON array of the source locations
# table. Entries without a parameter id are dropped.
# NOTE(review): unlike the sensors query, this scans every ingested row (there
# is no latest-row-per-location QUALIFY), and DISTINCT covers all four columns,
# so a parameter whose name/units/displayName changed between ingests would
# produce several rows for the same parameter_id — confirm this is intended.
silver_parameters_table = f"{CATALOG}.silver.parameters"

parameters_query = f"""
    WITH parsed_locations AS (
        SELECT 
            from_json(sensors, 'ARRAY<STRUCT<id: INT, name: STRING, parameter: STRUCT<id: INT, name: STRING, units: STRING, displayName: STRING>>>') as sensors_array
        FROM {CATALOG}.{SCHEMA}.locations
    )
    SELECT DISTINCT
        sensor.parameter.id as parameter_id,
        sensor.parameter.name as parameter_name,
        sensor.parameter.units as parameter_units,
        sensor.parameter.displayName as parameter_display_name
    FROM parsed_locations
    LATERAL VIEW explode(sensors_array) AS sensor
    WHERE sensor.parameter.id IS NOT NULL
"""
df_parameters_silver = spark.sql(parameters_query)

# Replace the silver table with the freshly derived parameter rows.
(
    df_parameters_silver.write
    .mode("overwrite")
    .saveAsTable(silver_parameters_table)
)

parameters_saved = df_parameters_silver.count()
print(f"Saved {parameters_saved} parameters to silver")

### 5.0 - Silver Measurements

In [0]:
# Silver measurements: project the measurement columns and flatten the JSON
# string columns (`datetime`, `coordinates`) into a typed UTC timestamp and
# latitude/longitude doubles.
df_measurements_silver = spark.sql(f"""
    SELECT 
        sensors_id,
        locations_id,
        value,
        CAST(get_json_object(datetime, '$.utc') AS TIMESTAMP) as datetime_utc,
        CAST(get_json_object(coordinates, '$.latitude') AS DOUBLE) as latitude,
        CAST(get_json_object(coordinates, '$.longitude') AS DOUBLE) as longitude,
        ingested_at
    FROM {CATALOG}.{SCHEMA}.measurements
""")

# Data quality: drop rows missing the value, timestamp, sensor id or location
# id, and rows with a negative value.
# NOTE(review): the `value >= 0` rule also discards readings of any parameter
# that can legitimately be negative (e.g. a temperature sensor, if present) —
# confirm against the parameters actually ingested.
# NOTE(review): no de-duplication is applied here; if the source table can hold
# the same reading from several ingests, duplicates reach silver — confirm.
df_measurements_clean = df_measurements_silver.filter(
    (F.col("value").isNotNull()) &
    (F.col("datetime_utc").isNotNull()) &
    (F.col("sensors_id").isNotNull()) &
    (F.col("locations_id").isNotNull()) &
    (F.col("value") >= 0)  # Keep non-negative readings only (zero is allowed)
)

measurements_target = f"{CATALOG}.silver.measurements"

# Every DataFrame.count() is a Spark action that re-reads and re-parses the
# source table. Count the pre-filter frame exactly once, and read the number of
# saved rows back from the table that was just written instead of recomputing
# the filtered frame (the original print triggered three extra full scans).
measurements_total = df_measurements_silver.count()
df_measurements_clean.write.mode("overwrite").saveAsTable(measurements_target)
measurements_saved = spark.table(measurements_target).count()

print(f"Saved {measurements_saved} measurements to silver (removed {measurements_total - measurements_saved} invalid)")

### 6.0 - Verify Silver Tables

In [0]:
# Sanity check: show up to five sample rows from each silver table, in the
# order locations, sensors, parameters, measurements.
for label, table in (
    ("Locations", "locations"),
    ("Sensors", "sensors"),
    ("Parameters", "parameters"),
    ("Measurements", "measurements"),
):
    print(f"{label} samples")
    spark.sql(f"SELECT * FROM {CATALOG}.silver.{table} LIMIT 5").display()

# One summary result with the row count of every silver table.
print("Row Counts")
row_counts_query = f"""
    SELECT 'locations' as table_name, COUNT(*) as rows FROM {CATALOG}.silver.locations
    UNION ALL
    SELECT 'parameters', COUNT(*) FROM {CATALOG}.silver.parameters
    UNION ALL
    SELECT 'sensors', COUNT(*) FROM {CATALOG}.silver.sensors
    UNION ALL
    SELECT 'measurements', COUNT(*) FROM {CATALOG}.silver.measurements
"""
spark.sql(row_counts_query).display()