In [0]:
# Echo the `spark` session object.
# NOTE(review): this runs before the cell that builds the session, so it presumably
# relies on a runtime-provided `spark` (e.g. Databricks) — TODO confirm.
spark

Khởi tạo Spark session với Delta Lake

In [0]:
from pyspark.sql import SparkSession
from delta import *

# Describe a Spark session named "WeatherPreprocessing" with the Delta Lake SQL
# extension and the Delta catalog registered as the session catalog.
builder = (
    SparkSession.builder
    .appName("WeatherPreprocessing")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the delta helper finish configuring the builder, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


1. Đọc dữ liệu từ Delta Lake ở tầng Silver

In [0]:
# Location of the merged weather Delta table in the silver layer.
silver_path = "dbfs:/FileStore/User/honghai/delta/silver/merged_weather/"

# Load the silver table into a DataFrame, reading it as Delta format.
df = spark.read.load(silver_path, format="delta")



In [0]:
# Preview the first 10 rows of the silver data.
df.show(n=10)

+-----+-----+----+-----------+---------+----+---------+----+-----+--------+-------------+--------+--------------------+
| time|month|year|temperature|feelslike|wind|direction|gust|cloud|humidity|precipitation|pressure|             weather|
+-----+-----+----+-----------+---------+----+---------+----+-----+--------+-------------+--------+--------------------+
|00:00|    1|2017|       24.0|     28.0| 8.0|      ENE|12.0|  4.0|    86.0|          0.0|  1012.0|               Clear|
|03:00|    1|2017|       23.0|     27.0| 8.0|       NE|10.0|  4.0|    88.0|          0.0|  1011.0|               Clear|
|06:00|    1|2017|       23.0|     26.0| 8.0|      NNE|11.0|  7.0|    85.0|          0.0|  1012.0|               Sunny|
|09:00|    1|2017|       28.0|     33.0|11.0|      NNE|13.0|  6.0|    64.0|          0.0|  1012.0|               Sunny|
|12:00|    1|2017|       31.0|     35.0|10.0|      ENE|12.0| 62.0|    53.0|          0.0|  1010.0|       Partly cloudy|
|15:00|    1|2017|       31.0|     36.0|

2. Tiền xử lý dữ liệu (Rain, after_3_hour, label)

Gán nhãn mưa (Rain)

In [0]:
from pyspark.sql.functions import col, when

# Rule-based rain flag: "Rain" is 1 only when every condition below holds, else 0.
# NOTE(review): the thresholds are hard-coded; their origin is not shown in this notebook.
humid_enough = col("humidity") > 70
cloudy_enough = col("cloud") > 41
low_pressure = col("pressure") < 1014
temperature_in_range = (col("temperature") >= 10) & (col("temperature") <= 35)

rain_rule = humid_enough & cloudy_enough & low_pressure & temperature_in_range
df = df.withColumn("Rain", when(rain_rule, 1).otherwise(0))


3. Tạo đặc trưng của thời điểm 3 giờ sau (lead features)

In [0]:
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.window import Window
from pyspark.sql.functions import lead

# Attach a temporary row id that the window below is ordered by.
# NOTE(review): monotonically_increasing_id() yields unique, increasing ids, but this
# only reproduces chronological order if the silver table is read back in that
# order — TODO confirm, since no full timestamp column is available to sort on.
df = df.withColumn("index", monotonically_increasing_id())

# Order the window by the temporary id instead of `time`.
# NOTE: there is no partitionBy, so Spark evaluates this window in a single partition.
windowSpec = Window.orderBy("index")

# lead(..., 1) takes the value from the NEXT row. This is treated as "3 hours later",
# assuming consecutive rows are 3 hours apart as in the previewed data.
# The source column is referenced by its exact name "Rain" so the lookup does not
# depend on Spark's case-insensitive column resolution.
df = df.withColumn("rain_after_3_hour", lead("Rain", 1).over(windowSpec))

# Columns that get a next-row ("after 3 hours") copy.
cols = ["temperature", "feelslike", "wind", "direction",
        "gust", "cloud", "humidity", "precipitation",
        "pressure", "weather"]

for c in cols:
    df = df.withColumn(f"{c}_after_3_hour", lead(c, 1).over(windowSpec))

Gán label

In [0]:
# The prediction target is the weather description taken from the following row.
df = df.withColumn("label", df["weather_after_3_hour"])


In [0]:
# Discard rows whose label is null (the last row has no following row to lead from).
df = df.na.drop(subset=["label"])


In [0]:
# (Optional) Drop the temporary "index" column now that the shifted columns exist.
df = df.drop("index")

In [0]:
# Render the DataFrame with the notebook's rich table display.
# NOTE(review): the positional 5 does not appear to cap the preview (the recorded
# output lists 10 rows); if a 5-row preview is intended, limit the frame first — confirm.
df.display(5)

time,month,year,temperature,feelslike,wind,direction,gust,cloud,humidity,precipitation,pressure,weather,Rain,rain_after_3_hour,temperature_after_3_hour,feelslike_after_3_hour,wind_after_3_hour,direction_after_3_hour,gust_after_3_hour,cloud_after_3_hour,humidity_after_3_hour,precipitation_after_3_hour,pressure_after_3_hour,weather_after_3_hour,label
00:00,1,2017,24.0,28.0,8.0,ENE,12.0,4.0,86.0,0.0,1012.0,Clear,0,0,23.0,27.0,8.0,NE,10.0,4.0,88.0,0.0,1011.0,Clear,Clear
03:00,1,2017,23.0,27.0,8.0,NE,10.0,4.0,88.0,0.0,1011.0,Clear,0,0,23.0,26.0,8.0,NNE,11.0,7.0,85.0,0.0,1012.0,Sunny,Sunny
06:00,1,2017,23.0,26.0,8.0,NNE,11.0,7.0,85.0,0.0,1012.0,Sunny,0,0,28.0,33.0,11.0,NNE,13.0,6.0,64.0,0.0,1012.0,Sunny,Sunny
09:00,1,2017,28.0,33.0,11.0,NNE,13.0,6.0,64.0,0.0,1012.0,Sunny,0,0,31.0,35.0,10.0,ENE,12.0,62.0,53.0,0.0,1010.0,Partly cloudy,Partly cloudy
12:00,1,2017,31.0,35.0,10.0,ENE,12.0,62.0,53.0,0.0,1010.0,Partly cloudy,0,0,31.0,36.0,1.0,NNE,1.0,79.0,57.0,0.0,1009.0,Cloudy,Cloudy
15:00,1,2017,31.0,36.0,1.0,NNE,1.0,79.0,57.0,0.0,1009.0,Cloudy,0,0,26.0,29.0,5.0,ESE,10.0,40.0,77.0,0.0,1010.0,Partly cloudy,Partly cloudy
18:00,1,2017,26.0,29.0,5.0,ESE,10.0,40.0,77.0,0.0,1010.0,Partly cloudy,0,0,25.0,28.0,8.0,NE,15.0,29.0,79.0,0.2,1011.0,Patchy rain possible,Patchy rain possible
21:00,1,2017,25.0,28.0,8.0,NE,15.0,29.0,79.0,0.2,1011.0,Patchy rain possible,0,0,24.0,26.0,10.0,NNE,17.0,12.0,84.0,0.0,1011.0,Clear,Clear
00:00,1,2017,24.0,26.0,10.0,NNE,17.0,12.0,84.0,0.0,1011.0,Clear,0,0,23.0,26.0,8.0,NE,14.0,17.0,85.0,0.0,1010.0,Clear,Clear
03:00,1,2017,23.0,26.0,8.0,NE,14.0,17.0,85.0,0.0,1010.0,Clear,0,0,24.0,26.0,10.0,NE,14.0,16.0,80.0,0.0,1011.0,Clear,Clear


4. Ghi dữ liệu ra Delta Lake tầng GOLD

In [0]:
# Persist the engineered features as a Delta table in the gold layer,
# replacing any existing data and schema at that location.
gold_path = "dbfs:/FileStore/User/honghai/delta/gold/weather_features"
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(gold_path)
)

print("✅ Đặc trưng thời tiết đã được lưu vào GOLD layer thành công.")


✅ Đặc trưng thời tiết đã được lưu vào GOLD layer thành công.


Đọc lại để kiểm tra

In [0]:
# Read back the Delta table that was just written to the gold layer, so this check
# verifies the actual output rather than an unrelated hard-coded location.
df_loaded = spark.read.format("delta").load(gold_path)

# Spot-check a few original, shifted and derived columns.
df_loaded.select("time", "temperature", "temperature_after_3_hour", "Rain", "label").show(10)


+-----+-----------+------------------------+----+--------------------+
| time|temperature|temperature_after_3_hour|Rain|               label|
+-----+-----------+------------------------+----+--------------------+
|00:00|       24.0|                    23.0|   0|               Clear|
|03:00|       23.0|                    23.0|   0|               Sunny|
|06:00|       23.0|                    28.0|   0|               Sunny|
|09:00|       28.0|                    31.0|   0|       Partly cloudy|
|12:00|       31.0|                    31.0|   0|              Cloudy|
|15:00|       31.0|                    26.0|   0|       Partly cloudy|
|18:00|       26.0|                    25.0|   0|Patchy rain possible|
|21:00|       25.0|                    24.0|   0|               Clear|
|00:00|       24.0|                    23.0|   0|               Clear|
|03:00|       23.0|                    24.0|   0|               Clear|
+-----+-----------+------------------------+----+--------------------+
only s