In [0]:
%run ./01-config

In [0]:
%run ./02-setup

In [0]:
from pyspark.sql import SparkSession
from datetime import datetime
from pyspark.sql.functions import lit, current_timestamp, col, regexp_replace, upper, sha2, concat_ws
import requests

class Gold():
    def __init__(self):
        """Read settings from ``Config`` and set the names used by the Gold layer.

        ``Config`` is not defined in this cell; it is presumably supplied by the
        ``%run ./01-config`` notebook (verify). A summary of the resolved
        settings is printed once initialization finishes.
        """
        print("Loading configuration settings...")
        conf = Config()

        # Names of the source tables in the Silver database.
        self.dim_borough_population = "dim_borough_population"
        self.dim_restaurant = "dim_restaurant"
        self.dim_violation = "dim_violation"
        self.fact_restaurant_inspection = "fact_restaurant_inspections"

        # Base storage location under which Gold tables are written.
        self.gold_path = conf.storage_account + "gold_db/"
        self.catalog_name = conf.catalog_name
        self.silver_db_name = "silver_db"  # database the source tables are read from
        self.db_name = "gold_db"  # database the Gold table and views are created in

        print("✅ Configuration Loaded:")
        # The value printed here is the Gold path; the label used to say "Silver Layer".
        print(f"   - Storage Path for Gold Layer: {self.gold_path}")
        print(f"   - Catalog Name: {self.catalog_name}")
        print(f"   - Database Name: {self.db_name}")
        print("🚀 Gold Ingestion Initialized Successfully! 🎯\n")

    def table_exists(self, gold_table_name):
        """Check if a Delta table exists in the catalog."""
        try:
            spark.table(gold_table_name)
            return True
        except:
            return False
    
    def inspection_detail(self):
        """Load NYC detailed inspection data into the Gold layer with incremental loading.

        Joins the Silver inspection fact table to the restaurant and violation
        dimensions (keeping rows whose dimension ``end_date`` is null and whose
        borough is not 'NOT AVAILABLE'), adds a SHA-256 ``unique_hash`` over all
        selected columns plus an ``ingestion_timestamp``, drops duplicate
        hashes, and then:

        * if the Gold table already exists, merges the rows into it whenever at
          least one row is not already present in the table;
        * otherwise creates the Delta table at the Gold storage path and
          performs the first full load.

        Uses the notebook-global ``spark`` session. Returns nothing; progress is
        reported via ``print``.
        """
        # Function-scope import: the file only imports ``datetime`` from this module.
        from datetime import timezone

        print("🔹 Starting gold layer data ingestion for NYC Detalied inspection data...")

        gold_table_name = f"{self.catalog_name}.{self.db_name}.inspection_detail"
        inspection_detail_storage_path = self.gold_path + "inspection_detail/"
        # Naive UTC timestamp: the same value ``datetime.utcnow()`` returned, without
        # depending on that API, which is deprecated since Python 3.12.
        ingestion_timestamp = datetime.now(timezone.utc).replace(tzinfo=None)

        silver_data = spark.sql(f"""
                                    SELECT
                                    r.borough,
                                    r.street_name,
                                    r.restaurant_id,
                                    r.restaurant_name,
                                    r.cuisine_type,
                                    r.longitude,
                                    r.latitude,
                                    i.violation_code,
                                    v.violation_description,
                                    i.inspection_date,
                                    i.inspection_type,
                                    i.critical_flag,
                                    i.score,
                                    i.grade,
                                    i.action
                                    FROM {self.catalog_name}.{self.silver_db_name}.{self.fact_restaurant_inspection} i
                                    JOIN {self.catalog_name}.{self.silver_db_name}.{self.dim_restaurant} r 
                                        ON i.restaurant_id = r.restaurant_id
                                    JOIN {self.catalog_name}.{self.silver_db_name}.{self.dim_violation} v
                                        ON i.violation_code = v.violation_code
                                    where r.borough <> 'NOT AVAILABLE' and r.end_date is null and v.end_date is null
                                """)

        # Every selected column feeds the row hash; the timestamp is excluded so an
        # unchanged row hashes the same on every run. The loop variable is not
        # called ``col`` to avoid shadowing the imported pyspark ``col`` function.
        key_columns = [name for name in silver_data.columns if name != "ingestion_timestamp"]

        # NOTE(review): concat_ws skips NULL inputs, so two rows whose NULLs sit in
        # different columns can produce the same hash. Left unchanged because a new
        # hash definition would no longer match hashes already stored in the table.
        df = (
            silver_data
            .withColumn("ingestion_timestamp", lit(ingestion_timestamp))
            .withColumn(
                "unique_hash",
                sha2(concat_ws("|", *[silver_data[name] for name in key_columns]), 256),
            )
            .dropDuplicates(["unique_hash"])
        )

        if self.table_exists(gold_table_name):
            print(f"📌 Table {gold_table_name} exists. Performing incremental load...")

            df.createOrReplaceTempView("temp_view")

            existing_df = spark.table(gold_table_name).drop("ingestion_timestamp")
            print(f"🔍 Existing record count: {existing_df.count()}")
            silver_df = df.drop("ingestion_timestamp")
            # exceptAll compares columns by position, so line the table's columns
            # up with the incoming frame by name before diffing.
            new_df = silver_df.exceptAll(existing_df.select(*silver_df.columns))
            new_records_count = new_df.count()
            print(f"🆕 New records to insert: {new_records_count}")

            if new_records_count > 0:
                # NOTE(review): the merge source is the full incoming set, not just
                # the new rows, so rows that already exist also match and get their
                # ingestion_timestamp refreshed by the MATCHED clause — confirm that
                # this is intended.
                spark.sql(f"""
                                MERGE INTO {gold_table_name} AS TARGET
                                USING temp_view AS SOURCE
                                ON TARGET.unique_hash = SOURCE.unique_hash
                                WHEN MATCHED AND TARGET.INGESTION_TIMESTAMP < SOURCE.INGESTION_TIMESTAMP THEN 
                                UPDATE SET *
                                WHEN NOT MATCHED THEN 
                                INSERT *
                            """)
                print(f"✅ {new_records_count} new records inserted into {gold_table_name}.")
            else:
                print("✅ No new records to insert. Data is already up-to-date.")

        else:
            print(f"🛠️ Table {gold_table_name} does not exist. Creating Table...")
            spark.sql(f"""
                           CREATE TABLE IF NOT EXISTS {gold_table_name} (
                                borough STRING,
                                street_name STRING,
                                restaurant_id INT,
                                restaurant_name STRING,
                                cuisine_type STRING,
                                longitude DOUBLE,
                                latitude DOUBLE,
                                violation_code STRING,
                                violation_description STRING,
                                inspection_date DATE,
                                inspection_type STRING,
                                critical_flag STRING,
                                score INT,
                                grade STRING,
                                action STRING,
                                unique_hash STRING,
                                ingestion_timestamp TIMESTAMP)
                            USING DELTA
                            LOCATION '{inspection_detail_storage_path}'
                        """)
            print(f"✅ Table {gold_table_name} created successfully!")
            print("Performing first-time load...")

            df.write.mode("overwrite").saveAsTable(gold_table_name)
            # Count what was written instead of re-running the join/hash/dedup
            # pipeline a second time just for the log line.
            loaded_rows = spark.table(gold_table_name).count()
            print(f"✅ First-time load completed! {loaded_rows} rows loaded successfully to {gold_table_name}")

    def inspection_summary(self):
        """Create or replace the per-borough inspection summary view in the Gold database.

        The view aggregates the Gold ``inspection_detail`` table by borough:
        distinct restaurants, row count of non-null ``restaurant_id`` (exposed
        as ``total_inspections``), average score, critical and total violation
        counts, and the percentage of critical rows. Uses the notebook-global
        ``spark`` session.
        """
        print("🔹 Starting gold layer view creation for NYC summarized inspection data...")

        catalog = self.catalog_name
        database = self.db_name
        gold_view_name = f"{catalog}.{database}.inspection_summary_view"

        # NULLIF guards the percentage against division by zero.
        view_statement = f"""
            CREATE OR REPLACE VIEW {gold_view_name} AS
            SELECT
                borough,
                COUNT(DISTINCT restaurant_id) AS total_restaurants,
                COUNT(restaurant_id) AS total_inspections,
                AVG(score) AS avg_inspection_score,
                SUM(CASE WHEN critical_flag = 'CRITICAL' THEN 1 ELSE 0 END) AS critical_violations,
                SUM(CASE WHEN violation_code <> 'NO VIOLATIONS' THEN 1 ELSE 0 END) AS total_violations,
                (SUM(CASE WHEN critical_flag = 'CRITICAL' THEN 1 ELSE 0 END) * 100.0 / NULLIF(COUNT(restaurant_id), 0)) AS pct_critical_violations
            FROM {catalog}.{database}.inspection_detail
            GROUP BY borough
        """
        spark.sql(view_statement)

        print(f"✅ Successfully created or updated Gold View: {gold_view_name}")

    def inspection_summary_by_borough_population(self):
        """Create or replace the Gold view combining borough population with inspection metrics.

        Left-joins the Silver borough population table to the Gold
        ``inspection_summary_view`` on borough and sums population per
        borough/year, so boroughs without inspection metrics still appear.
        The view reads ``inspection_summary_view``, so that view must exist
        before this one is queried. Uses the notebook-global ``spark`` session.
        """
        print("🔹 Starting gold layer view creation for NYC summarized inspection data by Borough Population...")

        gold_view_name = f"{self.catalog_name}.{self.db_name}.inspection_summary_by_borough_population_view"

        # The population table name comes from the configured attribute, the same
        # way the other Silver source tables are referenced, rather than being
        # repeated as a literal inside the SQL.
        spark.sql(f"""
            CREATE OR REPLACE VIEW {gold_view_name} AS
            SELECT 
                p.borough,
                p.year,
                ir.total_restaurants,
                ir.total_inspections,
                ir.avg_inspection_score,
                ir.critical_violations,
                ir.total_violations,
                ir.pct_critical_violations,
                sum(p.population) as total_borough_population
            FROM {self.catalog_name}.{self.silver_db_name}.{self.dim_borough_population} p
            LEFT JOIN {self.catalog_name}.{self.db_name}.inspection_summary_view ir
                ON p.borough = ir.borough
            GROUP BY 1,2,3,4,5,6,7,8
            ORDER BY 1,2
        """)

        print(f"✅ Successfully created or updated Gold View: {gold_view_name}")

    def violation_summary_by_cuisine(self):
        """Create or replace the Gold view of violation counts per cuisine type.

        Groups the Gold ``inspection_detail`` table by cuisine type, violation
        code and violation description, exposing the number of rows with a
        non-null ``restaurant_id`` and how many of them are flagged 'CRITICAL'.
        Uses the notebook-global ``spark`` session.
        """
        print("🔹 Starting gold layer view creation for NYC Violation data by Cuisine types...")

        gold_db = f"{self.catalog_name}.{self.db_name}"
        gold_view_name = f"{gold_db}.violation_summary_by_cuisine_view"

        view_statement = f"""
            CREATE OR REPLACE VIEW {gold_view_name} AS
            SELECT 
                cuisine_type,
                violation_code,
                violation_description,
                COUNT(restaurant_id) AS total_violations,
                SUM(CASE WHEN critical_flag = 'CRITICAL' THEN 1 ELSE 0 END) AS critical_violations   
            FROM {gold_db}.inspection_detail
            GROUP BY 1,2,3
        """
        spark.sql(view_statement)

        print(f"✅ Successfully created or updated Gold View: {gold_view_name}")

    def best_restaurant_locations(self):
        """Create or replace the Gold view ranking boroughs by restaurant density.

        Combines, per borough, the population summed for the most recent
        ``year`` in the Silver population table with the distinct restaurant
        count and average score from the Gold ``inspection_detail`` table, and
        derives restaurants per 1,000 people. Rows are ordered by that ratio
        ascending, so the boroughs with the fewest restaurants per capita come
        first. Uses the notebook-global ``spark`` session.
        """
        print("🔹 Starting gold layer view creation for NYC best restaurant locations...")

        gold_view_name = f"{self.catalog_name}.{self.db_name}.best_restaurant_locations_view"
        # Built from the configured attribute, like the other Silver source
        # tables, instead of repeating the table name as a literal twice.
        population_table = f"{self.catalog_name}.{self.silver_db_name}.{self.dim_borough_population}"

        spark.sql(f"""
            CREATE OR REPLACE VIEW {gold_view_name} AS
                with restaurant_distribution as (
                    select 
                        borough,
                        count(distinct restaurant_id) as total_restaurants,
                        avg(score) as avg_inspection_score
                    from {self.catalog_name}.{self.db_name}.inspection_detail
                    group by borough
                    )
                , population_data as (
                    select 
                        borough,
                        sum(population) as population
                    from {population_table}
                    where year = (select max(year) from {population_table})
                    group by borough
                    )

                SELECT 
                    p.borough,
                    p.population,
                    rd.total_restaurants,
                    (rd.total_restaurants * 1.0 / NULLIF(p.population, 0)) * 1000 AS restaurants_per_1000_people,
                    rd.avg_inspection_score
                FROM population_data p
                LEFT JOIN restaurant_distribution rd 
                    ON p.borough = rd.borough
                ORDER BY restaurants_per_1000_people ASC
            """)

        print(f"✅ Successfully created or updated Gold View: {gold_view_name}")

    
    def gold_layer_execution(self):
        """Run the whole Gold layer: create the database, load the table, build the views.

        ``Setup`` is not defined in this cell; it is presumably supplied by the
        ``%run ./02-setup`` notebook (verify). The detail table is loaded before
        the views because every view selects from it, directly or through
        ``inspection_summary_view``.
        """
        # Create the same database the table and views are written to, rather
        # than repeating its name as a separate literal.
        setup = Setup(self.db_name)
        setup.create_db()

        self.inspection_detail()
        self.inspection_summary()
        self.inspection_summary_by_borough_population()
        self.violation_summary_by_cuisine()
        self.best_restaurant_locations()

        print("🎯 Data processing for Gold completed successfully.\n")

In [0]:
# Notebook entry point: build the Gold loader (reads Config) and run the full
# Gold pipeline — database setup, detail table load, then the summary views.
gold = Gold()
gold.gold_layer_execution()