**Import Libraries**

In [0]:
import pandas as pd
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, lit, to_date, log10, round
from datetime import datetime, timedelta


**Logging**

In [0]:
# Obtain the Spark session used by every cell below (getOrCreate reuses an
# already-running session when one exists).
spark = (
    SparkSession.builder
    .appName("Neo Approaches")
    .getOrCreate()
)

def log_event(level, message):
    """Append a single log record to the ``neo_logs`` Delta table.

    Args:
        level: Severity label stored in the ``level`` column; callers in this
            notebook pass "INFO", "SUCCESS" or "ERROR".
        message: Free-form text stored in the ``message`` column.
    """
    # One-row DataFrame: (current local time, level, message)
    record = (datetime.now(), level, message)
    column_names = ["timestamp", "level", "message"]
    entry_df = spark.createDataFrame([record], column_names)

    # Append so earlier log rows are preserved
    appender = entry_df.write.format("delta").mode("append")
    appender.saveAsTable("neo_logs")

**Silver layer - clean, joined (transform)**

In [0]:
def _shift_years(day, years):
    """Return ``day`` moved by ``years`` whole calendar years.

    When the shifted date does not exist (e.g. Feb 29 moved into a non-leap
    year), Feb 28 of the target year is used instead.
    """
    try:
        return day.replace(year=day.year + years)
    except ValueError:
        return day.replace(year=day.year + years, day=28)


def _add_column_and_save(df, column, expression, added_msg=None, exists_msg=None):
    """Add ``column`` to ``df`` when it is missing, persist the result and log the outcome.

    Args:
        df: DataFrame to enrich.
        column: Name of the column to add.
        expression: Spark Column expression that computes the new column.
        added_msg: SUCCESS log text; defaults to ``"Added <column> column."``.
        exists_msg: INFO log text used when the column is already present;
            defaults to ``"<column> column already exists!"``.

    Returns:
        ``df`` with the column added, or ``df`` unchanged when the column
        was already present (nothing is written in that case).

    Raises:
        Exception: Any failure is logged at ERROR level and re-raised.
    """
    if added_msg is None:
        added_msg = f"Added {column} column."
    if exists_msg is None:
        exists_msg = f"{column} column already exists!"

    try:
        if column in df.columns:
            log_event("INFO", exists_msg)
            return df

        df = df.withColumn(column, expression)
        # Overwrite the table; mergeSchema lets the newly added column through.
        df.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable("neo_approaches")
        log_event("SUCCESS", added_msg)
        return df
    except Exception as e:
        log_event("ERROR", f"Error: adding {column} column: {str(e)}")
        raise


def transform_silver():
    """Join the source tables and write the enriched ``neo_approaches`` table.

    Reads ``approaches``, ``neos`` and ``neo_orbits``, inner-joins them, adds
    ``size_avg`` and then adds each derived column in turn. The table is
    overwritten after every column that gets added, and each step writes a
    record through ``log_event``.

    NOTE(review): every overwrite re-executes the full join; writing once
    after all columns are added would produce the same final table with far
    less work, at the cost of changing the intermediate table states.

    Raises:
        Exception: Re-raised from whichever step fails, after it is logged.
    """
    df_neos = spark.table("neos")
    df_approaches = spark.table("approaches")
    df_orbit = spark.table("neo_orbits")

    df_silver = (
        df_approaches
        .join(df_neos, df_approaches.neo_id == df_neos.id, "inner")
        .withColumn("close_approach_date", to_date("close_approach_date"))
        .join(df_orbit, df_neos.id == df_orbit.orbit_id, "inner")
    )

    # Midpoint of the estimated diameter range
    df_silver = df_silver.withColumn(
        "size_avg",
        (df_silver["estimated_diameter_min"] + df_silver["estimated_diameter_max"]) / 2,
    )

    # AC 1: size_category, bucketed on size_avg
    # NOTE(review): a NULL size_avg matches none of the `when` branches and so
    # lands in the ">500m" bucket; the same applies to the other bucketed
    # columns below. Confirm that is intended.
    size_category = (
        when(col("size_avg") < 10, "<10m")
        .when(col("size_avg") < 50, "10-50m")
        .when(col("size_avg") < 100, "50-100m")
        .when(col("size_avg") < 500, "100m-500m")
        .otherwise(">500m")
    )
    df_silver = _add_column_and_save(df_silver, "size_category", size_category)

    # AC 1: size_category_label (Small / Medium / Large), based on estimated_diameter_max
    size_category_label = (
        when(col("estimated_diameter_max") < 150, "Small")
        .when(col("estimated_diameter_max") < 500, "Medium")
        .otherwise("Large")
    )
    df_silver = _add_column_and_save(df_silver, "size_category_label", size_category_label)

    # AC 2: in_50yr_window is True when the approach date lies within 25
    # calendar years either side of today. Whole calendar years are used
    # rather than 25*365 days, which would fall short by the leap days.
    today = datetime.today().date()
    past_25 = _shift_years(today, -25)
    future_25 = _shift_years(today, 25)
    in_50yr_window = (
        when(
            (col("close_approach_date") >= lit(past_25))
            & (col("close_approach_date") <= lit(future_25)),
            True,
        )
        .otherwise(False)
    )
    df_silver = _add_column_and_save(df_silver, "in_50yr_window", in_50yr_window)

    # AC 3: New object category based on scientific criteria (assumption)
    #  NASA considers an object potentially hazardous if:
    #  Its minimum orbit intersection distance (MOID) with Earth is less than 0.05 AU (~7.5 million km)
    #  AND its absolute magnitude (H) is less than 22, implying a diameter >= ~140 meters
    #  Here miss_distance_km is compared against the 7.5 million km threshold.
    hazard_category = (
        when(
            (col("is_potentially_hazardous") == True)
            & (col("miss_distance_km") < 7500000)
            & (col("absolute_magnitude_h") < 22),
            "High Risk",
        )
        .when(col("is_potentially_hazardous") == True, "Moderate Risk")
        .otherwise("Low Risk")
    )
    df_silver = _add_column_and_save(df_silver, "hazard_category", hazard_category)

    # AC 4: log-scaled columns to support comparative visualizations of size,
    # orbital period and approach distance. `round` here is the Spark SQL
    # function imported at the top of the notebook, not the Python builtin.
    log_diameter_max = round(log10(col("estimated_diameter_max") + 1), 2)
    df_silver = _add_column_and_save(df_silver, "log_diameter_max", log_diameter_max)

    log_miss_distance = round(log10(col("miss_distance_km") + 1), 2)
    df_silver = _add_column_and_save(df_silver, "log_miss_distance", log_miss_distance)

    # AC 5 / US-2-AC 3: distance categories bucketed on miss_distance_km
    distance_category = (
        when(col("miss_distance_km") < 100000, "<100k km")
        .when(col("miss_distance_km") < 500000, "100k-500k km")
        .when(col("miss_distance_km") < 1000000, "500k-1M km")
        .when(col("miss_distance_km") < 5000000, "1M-5M km")
        .otherwise(">5M km")
    )
    # The log wording for this column differs from the default template.
    df_silver = _add_column_and_save(
        df_silver,
        "distance_category",
        distance_category,
        added_msg="Added distance category column.",
        exists_msg="distance_category column already exist!",
    )

    # US-2-AC 4: hazard_level derived directly from the hazard flag
    hazard_level = when(col("is_potentially_hazardous"), "Hazardous").otherwise("Non-Hazardous")
    # The "already exists" wording for this column differs from the default template.
    df_silver = _add_column_and_save(
        df_silver,
        "hazard_level",
        hazard_level,
        exists_msg="hazard_level column already exist!",
    )



**Call function to build and load the 'neo_approaches' table**

In [0]:
try:
    # Bracket the run with start / finish records in the log table
    log_event("INFO", "Neo transform started")
    transform_silver()
    log_event("SUCCESS", "Neo transform completed")
except Exception as err:
    # Record the failure, then re-raise so the cell itself still fails
    log_event("ERROR", "Neo transform failed: {}".format(str(err)))
    raise err