In [0]:
from pyspark.sql.functions import (
    col, to_date, hour, dayofweek, month, year, current_timestamp,
    when, concat_ws, split, round, trim
)
from pyspark.sql.utils import AnalysisException

# 1. Load the BRONZE Delta table.
df_bronze = spark.read.load("/mnt/data/earthquakes/bronze/", format="delta")

# 2-3. Keep one row per event id, drop rows without a magnitude, and round
#      the depth to 5 decimal places.
# `round` is pyspark.sql.functions.round (imported at the top of this cell),
# not the Python builtin.
# NOTE(review): dropDuplicates keeps an arbitrary row per id — confirm that is
# acceptable if BRONZE can hold several versions of the same event.
# NOTE(review): the SILVER schema printed at the end of this notebook lists
# `depth`, not `depth_km` — confirm BRONZE really exposes `depth_km`.
df_new = (
    df_bronze
    .drop_duplicates(subset=["id"])
    .where(col("mag").isNotNull())
    .withColumn("depth_km", round(col("depth_km"), 5))
)

# 4. Incremental load: drop events whose id is already stored in SILVER.
# Only the read is guarded, and only against AnalysisException (what Spark
# raises when the path cannot be resolved as a table). A bare `except:` around
# the read *and* the join would report any failure — including a broken join —
# as "silver does not exist" and the job would then append every BRONZE row
# again, duplicating the table.
try:
    df_existing = spark.read.format("delta").load("/mnt/data/earthquakes/silver/")
except AnalysisException:
    # First run: there is nothing to compare against, so keep all rows.
    print("Zona silver nu există încă — scriem toate datele.")
else:
    # left_anti keeps only the rows of df_new with no matching id in SILVER.
    df_new = df_new.join(df_existing.select("id"), on="id", how="left_anti")

# 5. Feature engineering

# Temporal features, all derived from the event timestamp column `time`.
#   weekday    : Spark's dayofweek numbering (1 = Sunday ... 7 = Saturday)
#   is_night   : 1 when hour is before 06 or from 22 onwards, otherwise 0
#   day_period : morning 06-11, afternoon 12-17, evening 18-21, night otherwise
# A null `time` yields a null hour, which ends up as is_night = 0 and
# day_period = "night" through the `otherwise` branches.
# NOTE(review): hour() is presumably evaluated in the Spark session time zone —
# confirm the session runs in UTC.
df_new = (
    df_new
    .withColumn("date", to_date(col("time")))
    .withColumn("hour", hour(col("time")))
    .withColumn("weekday", dayofweek(col("time")))
    .withColumn("month", month(col("time")))
    .withColumn("year", year(col("time")))
    .withColumn(
        "is_night",
        # "not between 6 and 21" is the same set of hours as "< 6 or >= 22".
        when(~col("hour").between(6, 21), 1).otherwise(0),
    )
    .withColumn(
        "day_period",
        when(col("hour").between(6, 11), "morning")
        .when(col("hour").between(12, 17), "afternoon")
        .when(col("hour").between(18, 21), "evening")
        .otherwise("night"),
    )
)

# Spatial features.
#   location_key_latlong : "<latitude> / <longitude>" built from the string
#                          form of both coordinates
#   region_country       : trimmed text after the first comma of `place`,
#                          or "Unknown" when `place` is null or has no comma
# NOTE(review): index 1 is the *second* comma-separated piece, not the last;
# a place with two or more commas would yield the middle part — confirm that
# is the intended value.
df_new = (
    df_new
    .withColumn(
        "location_key_latlong",
        concat_ws(
            " / ",
            col("latitude").cast("string"),
            col("longitude").cast("string"),
        ),
    )
    .withColumn(
        "region_country",
        when(
            col("place").isNotNull() & col("place").contains(","),
            trim(split(col("place"), ",")[1]),
        ).otherwise("Unknown"),
    )
)

# Seismic features.
#   depth_category : "shallow" up to 70 km, "intermediate" above 70 and up to
#                    300 km, "deep" above 300 km; null when depth_km is null
#   has_tsunami    : 1 when tsunami == 1, otherwise 0
#   has_felt       : 1 when felt >= 1, otherwise 0 (a null `felt` gives 0)
#
# depth_km is a fractional value, so the thresholds must be contiguous. The
# branches are evaluated in order, which makes `<= 300` mean "70 < depth <= 300".
# Using between(71, 300) for the second branch left depths in (70, 71)
# unmatched, so e.g. 70.5 km was labelled "deep". The last branch is an
# explicit `> 300` rather than `otherwise`, so an unknown (null) depth stays
# null instead of being reported as "deep".
df_new = (
    df_new
    .withColumn(
        "depth_category",
        when(col("depth_km") <= 70, "shallow")
        .when(col("depth_km") <= 300, "intermediate")
        .when(col("depth_km") > 300, "deep"),
    )
    .withColumn("has_tsunami", when(col("tsunami") == 1, 1).otherwise(0))
    .withColumn("has_felt", when(col("felt") >= 1, 1).otherwise(0))
)

# Audit column: timestamp at which this batch was processed.
df_new = df_new.withColumn("processed_at", current_timestamp())

# 6. Append the new rows to the SILVER Delta table (append mode so scheduled
#    runs only add data). The MergeSchema option lets columns that are new in
#    this batch be added to the existing table schema.
(
    df_new.write
    .format("delta")
    .mode("append")
    .option("MergeSchema", "true")
    .save("/mnt/data/earthquakes/silver/")
)


In [0]:
# Load the SILVER Delta table back for a visual check.
df_silver = spark.read.load("/mnt/data/earthquakes/silver/", format="delta")

# Show the rows in the notebook's table view, then print the column types.
display(df_silver)
df_silver.printSchema()


id,place,time,mag,type,status,tsunami,felt,latitude,longitude,depth,ingest_time,date,hour,weekday,month,year,is_night,day_period,location_key,region_country,depth_category,has_tsunami,has_felt,processed_at,location_key_latlong
tx2025jrki,"58 km S of Whites City, New Mexico",2025-05-18T14:20:58.34Z,1.7,earthquake,automatic,0,,31.651,-104.458,8.8857,2025-05-21T06:14:18.474553Z,2025-05-18,14,1,5,2025,0,afternoon,31.651 / -104.458,New Mexico,shallow,0,0,2025-05-21T09:48:38.687Z,
pr2025138000,"16 km ENE of Santa Cruz de El Seibo, Dominican Republic",2025-05-18T12:07:49.39Z,4.05,earthquake,reviewed,0,,18.8161,-68.8965,164.0,2025-05-21T06:14:18.474611Z,2025-05-18,12,1,5,2025,0,afternoon,18.8161 / -68.8965,Dominican Republic,intermediate,0,0,2025-05-21T09:48:38.687Z,
nc75184082,"10 km NW of The Geysers, CA",2025-05-18T17:45:41.17Z,0.76,earthquake,automatic,0,,38.8411674499512,-122.838996887207,1.72,2025-05-21T06:14:18.474489Z,2025-05-18,17,1,5,2025,0,afternoon,38.8411674499512 / -122.838996887207,CA,shallow,0,0,2025-05-21T09:48:38.687Z,
ci41156152,"8 km NE of Coachella, CA",2025-05-19T09:44:22.37Z,1.01,earthquake,automatic,0,,33.731,-116.113,3.35,2025-05-21T06:14:18.474183Z,2025-05-19,9,2,5,2025,0,morning,33.731 / -116.113,CA,shallow,0,0,2025-05-21T09:48:38.687Z,
us7000q03c,"39 km ESE of Tomioka, Japan",2025-05-19T03:14:40.682Z,4.6,earthquake,reviewed,0,,37.2086,141.4316,10.0,2025-05-21T06:14:18.474305Z,2025-05-19,3,2,5,2025,1,night,37.2086 / 141.4316,Japan,shallow,0,0,2025-05-21T09:48:38.687Z,
nc75184167,"9 km NW of The Geysers, CA",2025-05-18T23:55:00.15Z,0.67,earthquake,automatic,0,,38.8188323974609,-122.84400177002,1.58,2025-05-21T06:14:18.474377Z,2025-05-18,23,1,5,2025,1,night,38.8188323974609 / -122.84400177002,CA,shallow,0,0,2025-05-21T09:48:38.687Z,
nc75184112,"1 km ENE of The Geysers, CA",2025-05-18T18:27:22.06Z,0.72,earthquake,automatic,0,,38.781665802002,-122.74333190918,0.76,2025-05-21T06:14:18.474466Z,2025-05-18,18,1,5,2025,0,evening,38.781665802002 / -122.74333190918,CA,shallow,0,0,2025-05-21T09:48:38.687Z,
ak0256ci0ag7,"88 km NW of Yakutat, Alaska",2025-05-18T16:53:47.594Z,1.5,earthquake,automatic,0,,60.0964,-140.8778,17.9,2025-05-21T06:14:18.474499Z,2025-05-18,16,1,5,2025,0,afternoon,60.0964 / -140.8778,Alaska,shallow,0,0,2025-05-21T09:48:38.687Z,
tx2025jsfi,"12 km NNE of Balmorhea, Texas",2025-05-19T00:58:22.104Z,1.9,earthquake,automatic,0,,31.094,-103.705,3.3526,2025-05-21T06:14:18.474347Z,2025-05-19,0,2,5,2025,1,night,31.094 / -103.705,Texas,shallow,0,0,2025-05-21T09:48:38.687Z,
nc75184262,"4 km NNW of The Geysers, CA",2025-05-19T06:18:16.1Z,0.99,earthquake,automatic,0,,38.8033332824707,-122.781997680664,-0.08,2025-05-21T06:14:18.474249Z,2025-05-19,6,2,5,2025,0,morning,38.8033332824707 / -122.781997680664,CA,shallow,0,0,2025-05-21T09:48:38.687Z,


root
 |-- id: string (nullable = true)
 |-- place: string (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- mag: double (nullable = true)
 |-- type: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tsunami: long (nullable = true)
 |-- felt: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- depth: double (nullable = true)
 |-- ingest_time: timestamp (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- is_night: integer (nullable = true)
 |-- day_period: string (nullable = true)
 |-- location_key: string (nullable = true)
 |-- region_country: string (nullable = true)
 |-- depth_category: string (nullable = true)
 |-- has_tsunami: integer (nullable = true)
 |-- has_felt: integer (nullable = true)
 |-- processed_at: timestamp (nullable = true)
 |