Gold Notebook - Enrich silver data with country information and write to gold layer

In [0]:
# Resolve the silver input path and the gold output location from upstream task values

# The silver notebook publishes its output path as a string. The empty-string default
# keeps the lookup from raising on a missing key, so the check below can fail with a
# clear message instead of an opaque error when Spark later tries to load "".
silver_file_path = dbutils.jobs.taskValues.get(taskKey="silver", key="silver_output", default="")
if not silver_file_path:
    raise ValueError("Task value 'silver_output' from task 'silver' is missing or empty")

# No default is given here; per the Databricks taskValues documentation a missing
# key then makes the lookup itself raise.
bronze_output = dbutils.jobs.taskValues.get(taskKey="bronze", key="bronze_output")

# Without a gold location the final write would target the bare relative path
# "EarthquakeData", so stop here rather than write to an unintended place.
gold_adls = bronze_output.get("gold_adls", "")
if not gold_adls:
    raise ValueError("'gold_adls' is missing or empty in task value 'bronze_output'")


In [0]:
# Import geocoding and transformation libraries

from pyspark.sql.functions import when, col, udf
import reverse_geocoder as rg
from pyspark.sql.types import StringType
from datetime import date, timedelta



In [0]:
# Load the silver Delta table and keep only rows whose event_date is later than
# the date two days before today

start_date = date.today() - timedelta(days=2)
df = (
    spark.read
    .format("delta")
    .load(silver_file_path)
    .where(col("event_date") > start_date)
)

In [0]:
# UDF to convert lat/long into country code using reverse_geocoder

def get_country_code(lat,long):
    """Look up the country code for a latitude/longitude pair.

    Args:
        lat: Latitude; any value accepted by ``float()``.
        long: Longitude; any value accepted by ``float()``.

    Returns:
        The ``'cc'`` field of the first ``reverse_geocoder`` search result,
        or ``None`` when either coordinate is ``None`` or the lookup fails
        for any reason (the failure is printed, never raised).
    """
    # Null coordinates cannot be geocoded; skip the lookup entirely.
    if lat is None or long is None:
        return None
    try:
        coordinates = (float(lat), float(long))
        # mode=1 selects reverse_geocoder's single-process search. The default
        # (mode=2) goes through multiprocessing, which is wasteful for a single
        # coordinate and is invoked once per row when this runs as a UDF.
        # No per-row success logging: it would emit one line for every row.
        return rg.search(coordinates, mode=1)[0].get('cc')
    except Exception as e:
        # Best-effort enrichment: report the bad input and leave the value null.
        print(f"Error processing coordinates: {lat}, {long} -> {str(e)}")
        return None



In [0]:
# Wrap get_country_code as a Spark UDF whose result column is a string

get_country_code_udf = udf(f=get_country_code, returnType=StringType())


In [0]:
# Add a "Country" column computed by the UDF from each row's latitude and longitude

country_code_column = get_country_code_udf(col("latitude"), col("longitude"))
df = df.withColumn("Country", country_code_column)

In [0]:
# Append the enriched rows, in Delta format, to the "EarthquakeData" location under gold_adls
# NOTE(review): the read above keeps a rolling 2-day window and this write appends, so
# overlapping runs may append the same rows again — confirm whether deduplication is needed.

df.write.save(
    path=f"{gold_adls}EarthquakeData",
    format="delta",
    mode="append",
)