In [0]:
from datetime import date, timedelta 

# The notebook processes the file for the previous calendar day.
start_date = date.today() - timedelta(days=1)

# Root URIs (abfss scheme) of the bronze (raw) and silver (cleaned) storage containers.
bronze_adls = "abfss://bronze@earthquendtoend.dfs.core.windows.net/"
silver_adls = "abfss://silver@earthquendtoend.dfs.core.windows.net/"

In [0]:
from pyspark.sql.functions import col, isnull, when
from pyspark.sql.types import TimestampType
from datetime import date, timedelta

In [0]:
# Load the previous day's raw earthquake file from the bronze container.
# The "multiline" option lets Spark parse a JSON document that spans several
# lines instead of expecting one JSON record per line.
bronze_input_path = f"{bronze_adls}{start_date}_earthquake_data.json"
df = spark.read.option("multiline", "true").json(bronze_input_path)

In [0]:
# Preview the first 10 raw rows (nested geometry / properties structs).
df.show(n=10)

+--------------------+-----------+--------------------+-------+
|            geometry|         id|          properties|   type|
+--------------------+-----------+--------------------+-------+
|{[-122.7779998779...| nc75112231|{NULL, NULL, 7511...|Feature|
|{[-116.447, 33.40...| ci40832719|{NULL, NULL, 4083...|Feature|
|{[-104.41, 31.672...| tx2025adqo|{NULL, NULL, 2025...|Feature|
|{[-122.7965011596...| nc75112226|{NULL, NULL, 7511...|Feature|
|{[-115.641, 32.77...| ci40832703|{NULL, NULL, 4083...|Feature|
|{[-119.0397, 39.1...| nn00891271|{NULL, NULL, 0089...|Feature|
|{[-113.9913333333...| mb90070088|{NULL, NULL, 9007...|Feature|
|{[-117.2013333, 3...| ci40832687|{NULL, NULL, 4083...|Feature|
|{[-149.6224, 61.5...|ak0253otega|{NULL, NULL, 0253...|Feature|
|{[-119.0681, 39.1...| nn00891268|{NULL, NULL, 0089...|Feature|
+--------------------+-----------+--------------------+-------+
only showing top 10 rows



In [0]:
# Inspect one complete Row to see the nested field names of the raw record.
df.head(n=1)

[Row(geometry=Row(coordinates=[-122.77799987793, 38.7946662902832, 2.32999992370605], type='Point'), id='nc75112231', properties=Row(alert=None, cdi=None, code='75112231', detail='https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=nc75112231&format=geojson', dmin=0.007892, felt=None, gap=94.0, ids=',nc75112231,', mag=0.71, magType='md', mmi=None, net='nc', nst=8, place='3 km NNW of The Geysers, CA', rms=0.02, sig=8, sources=',nc,', status='automatic', time=1735861788680, title='M 0.7 - 3 km NNW of The Geysers, CA', tsunami=0, type='earthquake', types=',nearby-cities,origin,phase-data,', tz=None, updated=1735861884219, url='https://earthquake.usgs.gov/earthquakes/eventpage/nc75112231'), type='Feature')]

In [0]:
# Exploratory projection: pull a few nested fields up to top-level columns.
preview_columns = ['geometry.coordinates', 'id', 'properties.mag', 'type']
df_1 = df.select(*preview_columns)

In [0]:
# Preview the first 10 rows of the exploratory projection.
df_1.show(n=10)

+--------------------+-----------+-----+-------+
|         coordinates|         id|  mag|   type|
+--------------------+-----------+-----+-------+
|[-122.77799987793...| nc75112231| 0.71|Feature|
|[-116.447, 33.407...| ci40832719| 1.19|Feature|
|[-104.41, 31.672,...| tx2025adqo|  1.3|Feature|
|[-122.79650115966...| nc75112226| 1.04|Feature|
|[-115.641, 32.775...| ci40832703|  1.3|Feature|
|[-119.0397, 39.10...| nn00891271|  1.9|Feature|
|[-113.99133333333...| mb90070088|-0.12|Feature|
|[-117.2013333, 33...| ci40832687| 1.15|Feature|
|[-149.6224, 61.55...|ak0253otega|  1.4|Feature|
|[-119.0681, 39.15...| nn00891268|  1.5|Feature|
+--------------------+-----------+-----+-------+
only showing top 10 rows



In [0]:
# Flatten each nested feature into one row of scalar columns.
#
# NOTE: this reassigns `df`, so the result no longer has the `geometry` /
# `properties` structs; re-running this cell without re-running the read cell
# will fail to resolve those fields.
df = df.select(
    "id",
    # coordinates is an array; items 0 and 1 are exposed as longitude and latitude.
    col("geometry.coordinates").getItem(0).alias("longitude"),
    col("geometry.coordinates").getItem(1).alias("latitude"),
    # NOTE(review): the USGS GeoJSON format describes the third coordinate as
    # depth - confirm that "elevation" is the intended column name.
    col("geometry.coordinates").getItem(2).alias("elevation"),
    col("properties.title").alias("title"),
    col("properties.place").alias("place_description"),
    col("properties.mag").alias("magnitude"),
    # The source field is spelled `magType`. Use the exact case so the lookup
    # does not depend on spark.sql.caseSensitive being false; the output
    # column name stays "magtype".
    col("properties.magType").alias("magtype"),
    col("properties.sig").alias("sig"),
    col("properties.time").alias("time"),
    col("properties.updated").alias("updated"),
)

In [0]:
# Preview the first 10 flattened rows.
df.show(n=10)

+-----------+-----------------+----------------+----------------+--------------------+--------------------+---------+-------+---+-------------+-------------+
|         id|        longitude|        latitude|       elevation|               title|   place_description|magnitude|magtype|sig|         time|      updated|
+-----------+-----------------+----------------+----------------+--------------------+--------------------+---------+-------+---+-------------+-------------+
| nc75112231| -122.77799987793|38.7946662902832|2.32999992370605|M 0.7 - 3 km NNW ...|3 km NNW of The G...|     0.71|     md|  8|1735861788680|1735861884219|
| ci40832719|         -116.447|         33.4075|            13.5|M 1.2 - 18 km NNW...|18 km NNW of Borr...|     1.19|     ml| 22|1735861781400|1735862422970|
| tx2025adqo|          -104.41|          31.672|          6.4331|M 1.3 - 55 km S o...|55 km S of Whites...|      1.3|     ml| 26|1735861614507|1735919031283|
| nc75112226|-122.796501159668|38.8325004577637|    

In [0]:
# Replace missing longitude, latitude and time values with 0; non-null values
# pass through unchanged.
# NOTE(review): 0 is also a legitimate coordinate value, and a time of 0 turns
# into the epoch start once the column is cast to a timestamp below - confirm
# that this sentinel is intended.
for column_name in ("longitude", "latitude", "time"):
    current_value = col(column_name)
    df = df.withColumn(
        column_name,
        when(current_value.isNull(), 0).otherwise(current_value),
    )

In [0]:
# `time` and `updated` arrive as epoch milliseconds; divide by 1000 to get
# seconds, then cast the result to a Spark timestamp.
for epoch_ms_column in ("time", "updated"):
    epoch_seconds = col(epoch_ms_column) / 1000
    df = df.withColumn(epoch_ms_column, epoch_seconds.cast(TimestampType()))

In [0]:
# Preview the first 10 rows after null handling and timestamp conversion.
df.show(n=10)

+-----------+-----------------+----------------+----------------+--------------------+--------------------+---------+-------+---+--------------------+--------------------+
|         id|        longitude|        latitude|       elevation|               title|   place_description|magnitude|magtype|sig|                time|             updated|
+-----------+-----------------+----------------+----------------+--------------------+--------------------+---------+-------+---+--------------------+--------------------+
| nc75112231| -122.77799987793|38.7946662902832|2.32999992370605|M 0.7 - 3 km NNW ...|3 km NNW of The G...|     0.71|     md|  8|2025-01-02 23:49:...|2025-01-02 23:51:...|
| ci40832719|         -116.447|         33.4075|            13.5|M 1.2 - 18 km NNW...|18 km NNW of Borr...|     1.19|     ml| 22|2025-01-02 23:49:...|2025-01-03 00:00:...|
| tx2025adqo|          -104.41|          31.672|          6.4331|M 1.3 - 55 km S o...|55 km S of Whites...|      1.3|     ml| 26|2025-01-02 

In [0]:
# Destination folder in the silver container for the cleaned DataFrame.
# NOTE: the variable name is misspelled ("silvel"); it is kept as-is because
# the write cell below refers to it by this name.
silvel_output_path = silver_adls + "earthquake_events_silver/"

In [0]:
# Append this run's rows to the existing Parquet dataset, so each day's data is
# added alongside what is already stored.
# NOTE(review): append mode adds rows on every execution and no de-duplication
# is visible here, so running the notebook twice for the same start_date would
# store the same events twice.
df.write.parquet(silvel_output_path, mode="append")