In [0]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from pyspark.sql.functions import lit

# Build the SparkSession with every setting the notebook needs: local master,
# the Delta SQL extension and catalog, and driver/executor memory sizes.
builder = (
    SparkSession.builder
    .appName("WeatherPrediction")
    .master("local[*]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.driver.memory", "12g")
    .config("spark.executor.memory", "12g")
)

# Apply the Delta configuration helper to the builder, then create the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [0]:
# Restrict Spark's log output to ERROR level.
spark.sparkContext.setLogLevel(logLevel="ERROR")

In [0]:
# Location of the weather feature Delta table (a "dbfs:" path).
delta_table_path = "dbfs:/minhhieu/delta/gold/weather_features"

from pyspark.sql.functions import col

# Read the data back from the Delta table.
df = spark.read.load(delta_table_path, format="delta")

# Keep every row except one.
# NOTE(review): limit() is applied without an explicit ordering, so this code
# does not pin down which row is left out — confirm the intended row is dropped.
num_rows = df.count()
df = df.limit(num_rows - 1)

df.show()

+-----+-----+----+-----------+---------+----+---------+----+-----+--------+-------------+--------+--------------------+----+------------------------+----------------------+-----------------+----------------------+-----------------+------------------+---------------------+--------------------------+---------------------+--------------------+--------------------+-----------------+
| time|month|year|temperature|feelslike|wind|direction|gust|cloud|humidity|precipitation|pressure|             weather|Rain|temperature_after_3_hour|feelslike_after_3_hour|wind_after_3_hour|direction_after_3_hour|gust_after_3_hour|cloud_after_3_hour|humidity_after_3_hour|precipitation_after_3_hour|pressure_after_3_hour|weather_after_3_hour|               label|rain_after_3_hour|
+-----+-----+----+-----------+---------+----+---------+----+-----+--------+-------------+--------+--------------------+----+------------------------+----------------------+-----------------+----------------------+-----------------+-----

In [0]:
# Measure the DataFrame: number of columns and number of rows.
num_columns = len(df.columns)
num_rows = df.count()

# Report the column count first, then the row count.
print(f"S·ªë l∆∞·ª£ng c·ªôt: {num_columns}")
print(f"S·ªë l∆∞·ª£ng d√≤ng: {num_rows}")


S·ªë l∆∞·ª£ng c·ªôt: 26
S·ªë l∆∞·ª£ng d√≤ng: 8510


In [0]:
from pyspark.ml.feature import StringIndexer

# Source string column -> name of the numeric index column produced for it.
# NOTE(review): each column gets its own StringIndexer, so a category is not
# guaranteed to receive the same index in "weather" and "weather_after_3_hour"
# (likewise for the two direction columns) — confirm this is acceptable.
indexed_columns = {
    "direction": "direction_index",
    "weather": "weather_index",
    "direction_after_3_hour": "direction_after_index",
    "weather_after_3_hour": "weather_after_index",
}

# One indexer per categorical column.
indexers = [
    StringIndexer(inputCol=source_col, outputCol=index_col)
    for source_col, index_col in indexed_columns.items()
]

# Fit and apply all indexers together through a single pipeline.
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=indexers)
indexer_model = pipeline.fit(df)
df = indexer_model.transform(df)

# Read the label -> index mapping straight from each fitted indexer: the
# position of a label in `labels` is the index assigned to it. This avoids
# running a separate distinct()/collect() job per column.
label_mappings = {
    stage.getInputCol(): {label: index for index, label in enumerate(stage.labels)}
    for stage in indexer_model.stages
}

# Keep the mappings under the names used by the rest of the notebook.
direction_dict = label_mappings["direction"]
weather_dict = label_mappings["weather"]
direction_after_dict = label_mappings["direction_after_3_hour"]
weather_after_dict = label_mappings["weather_after_3_hour"]

# Print the mappings.
print("Mapping Direction:", direction_dict)
print("Mapping Weather:", weather_dict)
print("Mapping Direction After 3 Hours:", direction_after_dict)
print("Mapping Weather After 3 Hours:", weather_after_dict)


Mapping Direction: {'ENE': 8, 'NE': 10, 'NNE': 9, 'ESE': 3, 'E': 5, 'SE': 0, 'SSE': 4, 'NNW': 14, 'WSW': 1, 'S': 11, 'WNW': 12, 'NW': 15, 'W': 7, 'SW': 2, 'SSW': 6, 'N': 13}
Mapping Weather: {'Clear': 1, 'Sunny': 2, 'Partly cloudy': 0, 'Cloudy': 5, 'Patchy rain possible': 3, 'Light rain shower': 6, 'Overcast': 7, 'Moderate or heavy rain shower': 4, 'Patchy light drizzle': 12, 'Torrential rain shower': 9, 'Light drizzle': 14, 'Patchy light rain': 10, 'Thundery outbreaks possible': 8, 'Light rain': 13, 'Patchy light rain with thunder': 11, 'Moderate rain': 17, 'Moderate rain at times': 16, 'Mist': 15, 'Heavy rain at times': 19, 'Heavy rain': 18}
Mapping Direction After 3 Hours: {'NE': 10, 'NNE': 9, 'ENE': 8, 'ESE': 3, 'E': 5, 'SE': 0, 'SSE': 4, 'NNW': 14, 'WSW': 1, 'S': 11, 'WNW': 12, 'NW': 15, 'W': 7, 'SW': 2, 'SSW': 6, 'N': 13}
Mapping Weather After 3 Hours: {'Clear': 1, 'Sunny': 2, 'Partly cloudy': 0, 'Cloudy': 5, 'Patchy rain possible': 3, 'Light rain shower': 6, 'Overcast': 7, 'Mode

In [0]:
# Replace the string columns with their encoded versions: drop the originals,
# then give each index column the original column's name.
encoded_to_original = {
    "direction_index": "direction",
    "weather_index": "weather",
    "direction_after_index": "direction_after_3_hour",
    "weather_after_index": "weather_after_3_hour",
}

df = df.drop(*encoded_to_original.values())
for encoded_name, original_name in encoded_to_original.items():
    df = df.withColumnRenamed(encoded_name, original_name)


In [0]:
from pyspark.sql.functions import regexp_replace

# Fix the 'time' column by removing the ":00" text from its values.
# NOTE(review): the pattern is not anchored, so it is not limited to a trailing ":00".
df = df.withColumn("time", regexp_replace(df["time"], ":00", ""))


In [0]:
from pyspark.sql.functions import col
from pyspark.sql import DataFrame

def cast_columns_to_float(df: DataFrame, columns: list) -> DataFrame:
    """Return `df` with each column named in `columns` cast to float.

    The columns are replaced one at a time, in the order given, using
    `withColumn` with the same column name.

    Args:
        df: DataFrame holding the columns to convert.
        columns: Names of the columns to cast.

    Returns:
        The DataFrame obtained after all casts have been applied.
    """
    result = df
    for name in columns:
        result = result.withColumn(name, col(name).cast("float"))
    return result


In [0]:
from pyspark.sql.functions import sin, cos, col
import math

# Turn the cyclical columns into sine/cosine pairs: 'time' (hour of the day)
# uses a period of 24 and 'month' (month of the year) a period of 12.
for cyclic_col, period in (("time", 24), ("month", 12)):
    angle = 2 * math.pi * col(cyclic_col) / period
    df = df.withColumn(f"{cyclic_col}_sin", sin(angle))
    df = df.withColumn(f"{cyclic_col}_cos", cos(angle))


In [0]:
# Feature columns: the current observations followed by the sine/cosine
# encodings of the time of day and of the month of the year.
input_cols = [
    'time', 'month', 'temperature', 'feelslike', 'wind',
    'direction', 'gust', 'cloud', 'humidity', 'precipitation',
    'pressure',
] + [
    f"{unit}_{func}"
    for unit in ('time', 'month')
    for func in ('sin', 'cos')
]

# Target columns: the "<name>_after_3_hour" counterpart of each quantity below.
output_cols = [
    f"{name}_after_3_hour"
    for name in (
        'temperature', 'feelslike', 'wind', 'direction', 'gust',
        'cloud', 'humidity', 'precipitation', 'pressure',
    )
]


In [0]:
# Cast every feature column and every target column to float.
df = cast_columns_to_float(df=df, columns=input_cols + output_cols)

In [0]:
# Print the DataFrame schema to inspect the column names and types.
df.printSchema()


root
 |-- time: float (nullable = true)
 |-- month: float (nullable = true)
 |-- year: string (nullable = true)
 |-- temperature: float (nullable = true)
 |-- feelslike: float (nullable = true)
 |-- wind: float (nullable = true)
 |-- gust: float (nullable = true)
 |-- cloud: float (nullable = true)
 |-- humidity: float (nullable = true)
 |-- precipitation: float (nullable = true)
 |-- pressure: float (nullable = true)
 |-- Rain: integer (nullable = true)
 |-- temperature_after_3_hour: float (nullable = true)
 |-- feelslike_after_3_hour: float (nullable = true)
 |-- wind_after_3_hour: float (nullable = true)
 |-- gust_after_3_hour: float (nullable = true)
 |-- cloud_after_3_hour: float (nullable = true)
 |-- humidity_after_3_hour: float (nullable = true)
 |-- precipitation_after_3_hour: float (nullable = true)
 |-- pressure_after_3_hour: float (nullable = true)
 |-- label: string (nullable = true)
 |-- rain_after_3_hour: integer (nullable = true)
 |-- direction: float (nullable = false)

In [0]:
from pyspark.ml.feature import VectorAssembler

# Combine the columns listed in input_cols into one vector column, "features".
assembler = VectorAssembler().setInputCols(input_cols).setOutputCol("features")
df = assembler.transform(df)


In [0]:
# Th∆∞ vi·ªán c·∫ßn thi·∫øt
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

# Train, evaluate and save one RandomForestRegressor per column in output_cols.
# 'weather_after_3_hour' is not part of output_cols because it is a
# classification target.
# NOTE(review): 'direction_after_3_hour' holds StringIndexer category codes, so
# regressing it and scoring it with RMSE treats those codes as ordered numbers
# — confirm this is intended.
for target in output_cols:
    # Drop rows with a null in any feature column or in the current target.
    current_df = df.dropna(subset=input_cols + [target])

    # Number the rows so the data can be split by position (row order stands in
    # for time). The numbered frame is cached so that the count and both split
    # filters below reuse one materialized numbering instead of recomputing the
    # monotonically_increasing_id()-based ordering for every action.
    window = Window.orderBy(monotonically_increasing_id())
    indexed_df = current_df.withColumn("row_num", row_number().over(window)).cache()

    try:
        # Single count: used both for the empty check and for the split point.
        total_rows = indexed_df.count()

        # Nothing left after filtering -> skip this target.
        if total_rows == 0:
            print(f"‚ö†Ô∏è B·ªè qua {target} v√¨ kh√¥ng c√≤n d√≤ng sau khi dropna.")
            continue

        print(f"üéØ ƒêang hu·∫•n luy·ªán m√¥ h√¨nh cho: {target}")

        # First 80% of the rows for training, remaining 20% for testing.
        split_point = int(total_rows * 0.8)
        train_data = indexed_df.filter(col("row_num") <= split_point).drop("row_num")
        test_data = indexed_df.filter(col("row_num") > split_point).drop("row_num")

        # Random forest regressor; the seed is fixed so that repeated runs
        # build the same forest (42 matches the seed used for randomSplit
        # elsewhere in this notebook).
        model = RandomForestRegressor(
            featuresCol="features",  # input feature vector column
            labelCol=target,         # column to predict
            numTrees=100,            # number of trees in the forest
            seed=42
        )
        model_fitted = model.fit(train_data)

        # Predict on the test rows and score the predictions with RMSE.
        predictions = model_fitted.transform(test_data)
        evaluator = RegressionEvaluator(
            labelCol=target,
            predictionCol="prediction",
            metricName="rmse"       # Root Mean Squared Error
        )
        rmse = evaluator.evaluate(predictions)

        print(f"‚úÖ RMSE cho {target}: {rmse:.3f}")

        # Save the fitted model under the DBFS models directory, replacing any
        # model already stored at that path.
        model_fitted.write().overwrite().save(f"dbfs:/minhhieu/delta/models/{target}_model")
    finally:
        # Release the cached numbering before moving on to the next target.
        indexed_df.unpersist()


üéØ ƒêang hu·∫•n luy·ªán m√¥ h√¨nh cho: temperature_after_3_hour
‚úÖ RMSE cho temperature_after_3_hour: 1.877
üéØ ƒêang hu·∫•n luy·ªán m√¥ h√¨nh cho: feelslike_after_3_hour
‚úÖ RMSE cho feelslike_after_3_hour: 1.613
üéØ ƒêang hu·∫•n luy·ªán m√¥ h√¨nh cho: wind_after_3_hour
‚úÖ RMSE cho wind_after_3_hour: 2.428
üéØ ƒêang hu·∫•n luy·ªán m√¥ h√¨nh cho: direction_after_3_hour
‚úÖ RMSE cho direction_after_3_hour: 3.391
üéØ ƒêang hu·∫•n luy·ªán m√¥ h√¨nh cho: gust_after_3_hour
‚úÖ RMSE cho gust_after_3_hour: 4.306
üéØ ƒêang hu·∫•n luy·ªán m√¥ h√¨nh cho: cloud_after_3_hour
‚úÖ RMSE cho cloud_after_3_hour: 22.324
üéØ ƒêang hu·∫•n luy·ªán m√¥ h√¨nh cho: humidity_after_3_hour
‚úÖ RMSE cho humidity_after_3_hour: 4.184
üéØ ƒêang hu·∫•n luy·ªán m√¥ h√¨nh cho: precipitation_after_3_hour
‚úÖ RMSE cho precipitation_after_3_hour: 1.972
üéØ ƒêang hu·∫•n luy·ªán m√¥ h√¨nh cho: pressure_after_3_hour
‚úÖ RMSE cho pressure_after_3_hour: 0.904


In [0]:
# Th∆∞ vi·ªán c·∫ßn thi·∫øt t·ª´ PySpark ML
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# Split the data into training and test sets (80% train, 20% test).
# NOTE(review): this is a random split, whereas the regression models in this
# notebook are split by row order — confirm a random split is intended here.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Random forest classifier for the weather category three hours ahead.
# The seed is fixed (same value as the split seed) so that repeated runs build
# the same forest and report the same metrics.
clf = RandomForestClassifier(
    featuresCol="features",                    # feature vector column
    labelCol="weather_after_3_hour",           # class label column
    numTrees=200,                              # number of trees in the forest
    maxDepth=10,                               # maximum depth of each tree
    seed=42
)

# Train on the training set.
model = clf.fit(train_data)

# Predict on the test set.
predictions = model.transform(test_data)

# Evaluators for accuracy and F1-score.
acc_evaluator = MulticlassClassificationEvaluator(
    labelCol="weather_after_3_hour",
    predictionCol="prediction",
    metricName="accuracy"
)

f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="weather_after_3_hour",
    predictionCol="prediction",
    metricName="f1"
)

# Compute accuracy and F1-score on the test predictions.
accuracy = acc_evaluator.evaluate(predictions)
f1_score = f1_evaluator.evaluate(predictions)

# Print the results.
print(f"‚úÖ Accuracy for weather_after_3_hour: {accuracy:.3f}")
print(f"‚úÖ F1-score for weather_after_3_hour: {f1_score:.3f}")

# Save the trained model to DBFS, replacing any model already stored there.
model.write().overwrite().save("dbfs:/minhhieu/delta/models/weather_classifier_model")


‚úÖ Accuracy for weather_after_3_hour: 0.583
‚úÖ F1-score for weather_after_3_hour: 0.535


In [0]:
# Show the distinct values present in 'weather_after_3_hour'.
distinct_weather_after = df.select("weather_after_3_hour").distinct()
distinct_weather_after.show()


+--------------------+
|weather_after_3_hour|
+--------------------+
|                 1.0|
|                 2.0|
|                 0.0|
|                 5.0|
|                 3.0|
|                 6.0|
|                 7.0|
|                 4.0|
|                12.0|
|                 9.0|
|                14.0|
|                10.0|
|                 8.0|
|                13.0|
|                11.0|
|                17.0|
|                16.0|
|                15.0|
|                19.0|
|                18.0|
+--------------------+



In [0]:
from pyspark.sql.functions import when

# Binary label: weather categories related to rain -> 1, everything else -> 0.
# The rain categories are listed by name and translated to indices through
# weather_after_dict (the mapping for 'weather_after_3_hour'). StringIndexer
# indices depend on the data it was fitted on, so hard-coded numbers would
# silently go stale if the data or its category frequencies change.
rain_weather_names = [
    "Patchy rain possible",
    "Moderate or heavy rain shower",
    "Light rain shower",
    "Torrential rain shower",
    "Patchy light rain",
    "Patchy light rain with thunder",
    "Patchy light drizzle",
    "Light rain",
    "Light drizzle",
    "Moderate rain at times",
    "Moderate rain",
    "Heavy rain",
    "Heavy rain at times",
]

# Categories absent from the current data have no index and are skipped.
rain_labels = [
    weather_after_dict[name]
    for name in rain_weather_names
    if name in weather_after_dict
]

df = df.withColumn("label_after_3_hour", when(col("weather_after_3_hour").isin(rain_labels), 1).otherwise(0))

In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# Train-test split (80% train, 20% test).
# NOTE(review): this is a random split, whereas the regression models in this
# notebook are split by row order — confirm a random split is intended here.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Random forest classifier for the binary label 'label_after_3_hour'.
# The seed is fixed (same value as the split seed) so that repeated runs build
# the same forest and report the same metrics.
clf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label_after_3_hour",
    numTrees=200,
    maxDepth=10,
    seed=42
)

model = clf.fit(train_data)

# Predict on the test set.
predictions = model.transform(test_data)

# Evaluate accuracy and F1-score on the test predictions.
acc_evaluator = MulticlassClassificationEvaluator(labelCol="label_after_3_hour", predictionCol="prediction", metricName="accuracy")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="label_after_3_hour", predictionCol="prediction", metricName="f1")

accuracy = acc_evaluator.evaluate(predictions)
f1_score = f1_evaluator.evaluate(predictions)

# Report the metrics under the label this model actually predicts.
print(f"‚úÖ Accuracy for label_after_3_hour: {accuracy:.3f}")
print(f"‚úÖ F1-score for label_after_3_hour: {f1_score:.3f}")

# Save the trained model to DBFS, replacing any model already stored there.
model.write().overwrite().save("dbfs:/minhhieu/delta/models/weather_classifier_model_2")


‚úÖ Accuracy for weather_after_3_hour: 0.876
‚úÖ F1-score for weather_after_3_hour: 0.870
