In [0]:
import requests
import json
from datetime import date, timedelta
from pyspark.sql.functions import to_timestamp

# Boundaries of the historic backfill. Every consecutive pair of entries is one
# query window: the first is sent as `starttime`, the second as `endtime`.
dates = [
    "2025-01-01", "2025-02-01", "2025-03-01", "2025-04-01", "2025-04-15"
]

# requests has no default timeout; without one a stalled connection would
# block this cell forever. A timeout surfaces as a RequestException below.
REQUEST_TIMEOUT_SECONDS = 60

# Walk the windows pairwise so the loop always matches the length of `dates`.
for start_date, end_date in zip(dates, dates[1:]):
    url = f"https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime={start_date}&endtime={end_date}"

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()

        # Only the "features" list of the response is persisted; a response
        # without that key is treated the same as an empty result.
        data = response.json().get("features", [])

        if not data:
            print("No data received")
        else:
            file_path = f"abfss://bronze@abychen.dfs.core.windows.net/historic_earthquake_data_{start_date}_to_{end_date}.json"
            json_data = json.dumps(data, indent=4)

            # One file per window; overwrite=True makes re-running this cell safe.
            dbutils.fs.put(file_path, json_data, overwrite=True)
            print(f"Data saved to {file_path}")
    except requests.exceptions.RequestException as e:
        # Best effort: report the failed window and carry on with the next one.
        # NOTE: a failed or empty window leaves no bronze file for that range.
        print(f"Request failed: {e}")



In [0]:
from pyspark.sql.functions import when, col, to_timestamp, to_date, date_format, isnull
from pyspark.sql.types import TimestampType
# Target Delta table for the flattened (silver) earthquake events.
silver_file_path = "abfss://silver@abychen.dfs.core.windows.net/earthquake_events_silver"

# One bronze file exists per consecutive pair in `dates` (defined in the
# ingestion cell); iterate the pairs directly so the loop cannot drift from
# the length of that list.
for start_date, end_date in zip(dates, dates[1:]):
    file_path = f"abfss://bronze@abychen.dfs.core.windows.net/historic_earthquake_data_{start_date}_to_{end_date}.json"
    # The bronze file is a single indented JSON array, so it must be read in
    # multiline mode rather than as JSON Lines.
    df = spark.read.option("multiline", "true").json(file_path)

    # Flatten the nested feature records into one row of scalar columns.
    df2 = df.select(
        df.id,
        df.geometry.coordinates[0].alias("longitude"),
        df.geometry.coordinates[1].alias("latitude"),
        df.geometry.coordinates[2].alias("elevation"),
        df.properties.title.alias("title"),
        df.properties.mag.alias("magnitude"),
        df.properties.place.alias("place_description"),
        df.properties.sig.alias("sig"),
        df.properties.magType.alias("magType"),
        df.properties.time.alias("time"),
        df.properties.updated.alias("updated")
    )

    # Missing longitude / latitude / time are replaced with 0.
    # NOTE(review): this places events without coordinates at (0, 0) and
    # events without a time at the epoch - confirm that is intended.
    df2 = (
        df2.withColumn("longitude", when(isnull(col("longitude")), 0).otherwise(col("longitude")))
           .withColumn("latitude", when(isnull(col("latitude")), 0).otherwise(col("latitude")))
           .withColumn("time", when(isnull(col("time")), 0).otherwise(col("time")))
    )

    # `time` and `updated` are divided by 1000 before the cast because a
    # numeric-to-timestamp cast interprets the value as seconds
    # (presumably the source values are epoch milliseconds - see USGS format).
    df2 = (
        df2.withColumn("time", (col("time") / 1000).cast(TimestampType()))
           .withColumn("updated", (col("updated") / 1000).cast(TimestampType()))
    )

    # Both columns are already timestamps here, so they can be split into
    # date and time-of-day parts without another to_timestamp() round trip.
    df2 = (
        df2.withColumn("event_date", to_date(col("time")))
           .withColumn("event_time", date_format(col("time"), "HH:mm:ss:SSS"))
           .withColumn("updated_date", to_date(col("updated")))
           .withColumn("updated_time", date_format(col("updated"), "HH:mm:ss:SSS"))
    )

    df2 = df2.drop("time", "updated")

    # NOTE: append mode means re-running this cell adds the same rows again.
    df2.write.mode("append").format("delta").save(silver_file_path)
    

In [0]:
from pyspark.sql.functions import when, col, udf
import reverse_geocoder as rg
from pyspark.sql.types import StringType

# Location of the silver Delta table read by this cell (same path the silver
# transformation cell writes to).
silver_file_path = "abfss://silver@abychen.dfs.core.windows.net/earthquake_events_silver"


def get_country_code(lat,long):
    """Look up the country code for a latitude/longitude pair.

    Args:
        lat: Latitude; anything ``float()`` accepts, or ``None``.
        long: Longitude; anything ``float()`` accepts, or ``None``.

    Returns:
        The ``'cc'`` field of the first ``reverse_geocoder`` match, or
        ``None`` when a coordinate is missing or the lookup fails.
    """
    # Missing coordinates cannot be geocoded; return early instead of relying
    # on float(None) raising and being logged as an error.
    if lat is None or long is None:
        return None

    try:
        coordinates = (float(lat), float(long))
        # No per-row success logging: this runs inside a Spark UDF, where a
        # print for every event only floods the executor logs.
        return rg.search(coordinates)[0].get('cc')
    except Exception as e:
        # Best effort: a single bad row must not fail the whole Spark job.
        print(f"Error processing coordinates: {lat}, {long} -> {str(e)}")
        return None

# Wrap the Python lookup so Spark can apply it per row and gets a string column.
get_country_code_udf = udf(get_country_code, StringType())

# Load the complete silver table and add the country code of each event.
df3 = spark.read.format("delta").load(silver_file_path)

df3 = df3.withColumn("Country",get_country_code_udf(col("latitude"),col("longitude")))

gold_adls = "abfss://gold@abychen.dfs.core.windows.net/"
# df3 holds every row of the silver table, so appending it would insert all
# events again each time this cell runs. Replace the gold table contents with
# the current snapshot instead, which keeps the write idempotent.
df3.write.format("delta").mode("overwrite").save(f"{gold_adls}EarthquakeData")

In [0]:
# Preview the ten events with the latest event_date.
latest_events = df3.orderBy(df3["event_date"].desc()).limit(10)
display(latest_events)

In [0]:
gold_adls = "abfss://gold@abychen.dfs.core.windows.net/"
# df3 is the full enriched snapshot loaded from the silver table, so appending
# it would insert every event into gold again on each run of this cell.
# Overwriting makes the write idempotent: gold ends up with exactly one copy
# of the snapshot however many times the notebook is executed.
df3.write.format("delta").mode("overwrite").save(f"{gold_adls}EarthquakeData")