In [0]:
%run ./01-config

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, current_timestamp
import requests
import uuid

class Bronze:
    """Ingest NYC open-data API feeds into Delta tables in the Bronze layer.

    Each ``load_*`` method downloads a JSON payload over HTTP, converts it to
    a Spark DataFrame, stamps it with audit columns (``process_id``,
    ``source_file``, ``ingestion_timestamp``) and writes it to
    ``<catalog>.bronze_db.<table>``. If the target table already exists, only
    rows whose key columns are absent from the table are appended.

    NOTE(review): ``Config`` is not defined in this cell; it is presumably
    brought into scope by the ``%run ./01-config`` notebook cell — confirm.
    """

    # (connect, read) timeouts in seconds for API calls. Without a timeout,
    # ``requests`` waits forever on a stalled connection and hangs the job.
    REQUEST_TIMEOUT = (10, 300)

    def __init__(self):
        """Create/attach the Spark session and copy settings from ``Config``."""
        print("Initializing Spark Session...")
        self.spark = SparkSession.builder.appName("BronzeIngestion").getOrCreate()
        app_name = self.spark.conf.get("spark.app.name", "BronzeIngestion")
        print(f"🔹 Spark App Name: {app_name}")

        print("Loading configuration settings...")
        conf = Config()
        self.nyc_population_by_community_url = conf.nyc_population_by_community_url
        self.nyc_restaurant_inspection_url = conf.nyc_restaurant_inspection_url
        self.nyc_restaurant_inspection_bronze_path = conf.storage_account + "bronze_db/nyc_restaurant_inspection_raw"
        self.nyc_population_bronze_path = conf.storage_account + "bronze_db/nyc_population_by_community_raw"
        self.catalog_name = conf.catalog_name
        self.db_name = "bronze_db"
        print("✅ Configuration Loaded:")
        print(f"   - NYC Restaurant Inspection URL: {self.nyc_restaurant_inspection_url}")
        print(f"   - NYC Population by Community URL: {self.nyc_population_by_community_url}")
        print(f"   - Storage Path for Restaurant Inspection: {self.nyc_restaurant_inspection_bronze_path}")
        print(f"   - Storage Path for Population Data: {self.nyc_population_bronze_path}")
        print(f"   - Catalog Name: {self.catalog_name}")
        print(f"   - Database Name: {self.db_name}")
        print("🚀 Bronze Ingestion Initialized Successfully! 🎯\n")

    def table_exists(self, table_name):
        """Return True if ``spark.table(table_name)`` succeeds, else False.

        Any ``Exception`` raised while resolving the table is treated as
        "table does not exist". ``KeyboardInterrupt`` and ``SystemExit`` are
        deliberately not swallowed, so the job can still be cancelled.
        """
        try:
            self.spark.table(table_name)
        except Exception:
            return False
        return True

    def _add_audit_columns(self, df, process_id, source_url):
        """Return ``df`` with the process id, source URL and ingestion time appended."""
        return (
            df.withColumn("process_id", lit(process_id))
            .withColumn("source_file", lit(source_url))
            .withColumn("ingestion_timestamp", current_timestamp())
        )

    @staticmethod
    def _save_delta(df, bronze_table, mode, merge_schema):
        """Write ``df`` to ``bronze_table`` as Delta using the given save mode."""
        writer = df.write.format("delta").mode(mode)
        if merge_schema:
            writer = writer.option("mergeSchema", "true")
        writer.saveAsTable(bronze_table)

    def _write_to_bronze(self, df, bronze_table, join_keys, merge_schema=False):
        """Create ``bronze_table`` from ``df`` or append only the rows not yet present.

        Args:
            df: Freshly fetched DataFrame, already carrying the audit columns.
            bronze_table: Fully qualified target table name.
            join_keys: Column name or list of names used to detect rows that
                already exist in the table (left-anti join).
            merge_schema: When True, pass ``mergeSchema=true`` to the writer.
        """
        if self.table_exists(bronze_table):
            print(f"📌 Table {bronze_table} exists. Performing incremental load...")
            existing_df = self.spark.table(bronze_table)

            print(f"🔍 Existing record count: {existing_df.count()}")

            # The left-anti join keeps only incoming rows whose key is absent
            # from the table, so its count IS the number of new records. It
            # must not be reduced by the existing table's row count.
            new_df = df.join(existing_df, join_keys, "left_anti")

            new_records_count = new_df.count()
            print(f"🆕 New records to insert: {new_records_count}")

            if new_records_count > 0:
                self._save_delta(new_df, bronze_table, "append", merge_schema)
                print(f"✅ {new_records_count} new records inserted into {bronze_table}.")
            else:
                print("✅ No new records to insert. Data is already up-to-date.")

        else:
            print(f"🛠️ Table {bronze_table} does not exist. Performing first-time load...")
            self._save_delta(df, bronze_table, "overwrite", merge_schema)
            print(f"✅ Table {bronze_table} created and full dataset loaded.")

        print(f"🎯 Data processing for {bronze_table} completed successfully.\n")

    def load_nyc_restaurant_inspections(self):
        """Load NYC restaurant inspection data into the Bronze layer with incremental loading.

        The API is queried twice: once with ``$select=count(*)`` to learn the
        total row count, then with ``$limit=<count>`` to fetch every row in a
        single response. On incremental runs, rows whose ``inspection_date``
        already exists in the table are skipped.

        Raises:
            Exception: If either request returns a non-200 status or the
                data response is empty.
        """
        process_id = str(uuid.uuid4())
        print("🔹 Starting data ingestion for NYC restaurant inspections...")

        print("📡 Fetching data from API...")
        count_responses = requests.get(
            f"{self.nyc_restaurant_inspection_url}?$select=count(*)",
            timeout=self.REQUEST_TIMEOUT,
        )
        if count_responses.status_code != 200:
            raise Exception("Error fetching Row Count")
        total_rows = int(count_responses.json()[0]["count"])

        response = requests.get(
            f"{self.nyc_restaurant_inspection_url}?$limit={total_rows}",
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            raise Exception(f"❌ API Error: {response.status_code}")

        data = response.json()

        if not data:
            raise Exception("⚠️ No Data Returned from API")

        print(f"✅ Successfully fetched {len(data)} records from API.")

        df = self._add_audit_columns(
            self.spark.createDataFrame(data),
            process_id,
            self.nyc_restaurant_inspection_url,
        )

        bronze_table = f"{self.catalog_name}.{self.db_name}.nyc_restaurant_inspection_raw"
        self._write_to_bronze(df, bronze_table, "inspection_date")

    def load_nyc_population_by_community(self):
        """Load NYC population data into the Bronze layer with incremental loading.

        On incremental runs, rows whose (``cd_name``, ``borough``) pair
        already exists in the table are skipped. Writes use
        ``mergeSchema=true``.

        Raises:
            Exception: If the request returns a non-200 status or the
                response is empty.
        """
        process_id = str(uuid.uuid4())
        print("🔹 Starting data ingestion for NYC population by community...")

        print("📡 Fetching data from API...")
        response = requests.get(
            self.nyc_population_by_community_url,
            timeout=self.REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            print(f"❌ API Error: {response.status_code}")
            raise Exception(f"API Error: {response.status_code}")

        data = response.json()
        if not data:
            print("⚠️ No Data Returned from API")
            raise Exception("No Data Returned from API")

        print(f"✅ Successfully fetched {len(data)} records from API.")

        df = self._add_audit_columns(
            self.spark.createDataFrame(data),
            process_id,
            self.nyc_population_by_community_url,
        )

        bronze_table = f"{self.catalog_name}.{self.db_name}.nyc_population_by_community_raw"
        self._write_to_bronze(df, bronze_table, ["cd_name", "borough"], merge_schema=True)
