In [1]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from modules.metrics import rmse
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.regularizers import l2

In [2]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front, then report which GPUs were found.
gpus = tf.config.list_physical_devices('GPU')
try:
    # Memory growth has to be configured identically on every GPU.
    for device in gpus:
        tf.config.experimental.set_memory_growth(device, True)
    if gpus:
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as err:
    # Raised when memory growth is set after the GPUs were already initialised.
    print(err)
print(gpus)

1 Physical GPUs, 1 Logical GPUs
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


- 輸入：t-6、t 兩個時間步（每個時間步 7 個特徵）
- 輸出：t+6、t+12、t+18、…、t+60，共 10 個時間步（每個時間步 5 個變數）

In [3]:
def fit_scaler_on_training_data(column_names, csv_path, test_month_prefixes=('202207', '202208', '202209', '202210', '202211', '202212'),
                                train_cases_to_exclude={'20210116', '20210530', '20210825', '20210722', '20220904'}):
    '''
    Fit a StandardScaler on the training portion of the CSV files in a folder.

    Every regular file in ``csv_path`` whose name does not start with one of the
    skipped prefixes is read, the value -99900 is replaced with 0, and the
    requested columns are stacked row-wise. The scaler is fitted on that stack.

    Args:
        column_names: list of column names to extract; the fitted scaler keeps
            this column order.
        csv_path: directory that contains the CSV files.
        test_month_prefixes: file-name prefix (str) or iterable of prefixes for
            the held-out months; matching files are skipped.
        train_cases_to_exclude: iterable of file-name prefixes for individual
            cases that are skipped as well. It is only read, never mutated.

    Returns:
        A sklearn StandardScaler fitted on the collected rows.

    Raises:
        ValueError: if no file contributed data (nothing left after filtering).
    '''
    # str.startswith only accepts a str or a tuple, so normalise both arguments
    # into one tuple of prefixes (a lone string stays a single prefix).
    if isinstance(test_month_prefixes, str):
        month_prefixes = (test_month_prefixes,)
    else:
        month_prefixes = tuple(test_month_prefixes)
    skipped_prefixes = month_prefixes + tuple(train_cases_to_exclude)

    features = []

    for file_name in sorted(os.listdir(csv_path)):
        if file_name.startswith(skipped_prefixes):
            continue  # held-out month or excluded case

        file_path = os.path.join(csv_path, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories (e.g. .ipynb_checkpoints) cannot be parsed as CSV.
            continue

        df = pd.read_csv(file_path, encoding='utf-8',
                         dtype={'FileName': str, 'DateTime': str, 'Date': str, 'Time': str})

        # -99900 is replaced with 0 before fitting, exactly as the data
        # generator does, so both see the same values.
        df = df.replace(-99900, 0)

        try:
            features.append(df[column_names].values)
        except KeyError as e:
            print(f'缺少欄位: {e}，檔案 {file_name} 跳過')
            continue

    if not features:
        raise ValueError("找不到任何符合條件的訓練資料，無法建立 scaler。")

    # Stack the rows of every file and fit one scaler on all of them.
    all_features = np.vstack(features)
    scaler = StandardScaler()
    scaler.fit(all_features)

    return scaler

In [4]:
def csv_sliding_window_generator_ffnn(csv_path, feature_columns, label_columns, feature_scaler, label_scaler, mode):
    '''
    Yield scaled sliding-window samples built from the CSV files of one split.

    Files are chosen by file-name prefix:
        'train' - names starting with 2021 or 202201..202206, minus the excluded cases
        'val'   - names starting with 202207..202209
        'test'  - names starting with 202210..202212

    For every chosen file with at least 12 rows, each window of 12 consecutive
    rows produces one sample: the first 2 rows are the input and the following
    10 rows are the target.

    Args:
        csv_path: directory that contains the CSV files.
        feature_columns: column names used as input features.
        label_columns: column names used as targets.
        feature_scaler: object with ``transform`` applied to the feature columns.
        label_scaler: object with ``transform`` applied to the label columns.
        mode: 'train', 'val' or 'test'.

    Yields:
        (x, y) float32 arrays: x has 2 * len(feature_columns) values and
        y has 10 * len(label_columns) values, both flattened row by row.

    Raises:
        ValueError: if ``mode`` is not one of the three split names. The check
            runs when the generator is first iterated.
    '''
    input_steps = 2
    output_steps = 10
    steps = input_steps + output_steps

    split_prefixes = {
        'train': ('2021', '202201', '202202', '202203', '202204', '202205', '202206'),
        'val': ('202207', '202208', '202209'),
        'test': ('202210', '202211', '202212'),
    }
    if mode not in split_prefixes:
        # A misspelled mode must not silently fall through to the test split.
        raise ValueError(f"mode must be 'train', 'val' or 'test', got {mode!r}")
    wanted_prefixes = split_prefixes[mode]

    train_cases_to_exclude = ('20210116', '20210530', '20210825', '20210722', '20220904')

    for csv_file in sorted(os.listdir(csv_path)):
        if not csv_file.startswith(wanted_prefixes):
            continue
        # The individual cases are removed from the training split only.
        if mode == 'train' and csv_file.startswith(train_cases_to_exclude):
            continue

        csv_file_path = os.path.join(csv_path, csv_file)
        if not os.path.isfile(csv_file_path):
            continue  # e.g. a sub-directory whose name happens to match a prefix

        data = pd.read_csv(csv_file_path, encoding='utf-8',
                           dtype={'FileName': str, 'Day': str, 'Time': str, 'DateTime': str})

        if len(data) < steps:
            continue  # too short for even one window

        data = data.replace(-99900, 0)

        # Scaling works row by row, so scale the whole file once and slice the
        # windows from the result instead of calling the scalers per window.
        x_all = np.asarray(feature_scaler.transform(data[feature_columns].values), dtype=np.float32)
        y_all = np.asarray(label_scaler.transform(data[label_columns].values), dtype=np.float32)

        for start in range(len(data) - (steps - 1)):
            x_window = x_all[start:start + input_steps]          # (2, n_features)
            y_window = y_all[start + input_steps:start + steps]  # (10, n_labels)
            # flatten() copies, so consumers never alias the per-file arrays.
            yield x_window.flatten(), y_window.flatten()

def create_csv_sliding_window_dataset_ffnn(csv_path, feature_columns, label_columns, feature_scaler, label_scaler, mode):
    '''
    Wrap csv_sliding_window_generator_ffnn in a tf.data.Dataset.

    Args:
        csv_path: directory that contains the CSV files.
        feature_columns: column names used as input features.
        label_columns: column names used as targets.
        feature_scaler: scaler applied to the feature columns.
        label_scaler: scaler applied to the label columns.
        mode: split name forwarded to the generator ('train', 'val' or 'test').

    Returns:
        An unbatched tf.data.Dataset of (x, y) float32 vectors, where x has
        2 * len(feature_columns) values and y has 10 * len(label_columns) values.
    '''
    # The generator flattens 2 input rows and 10 output rows per sample, so
    # derive the vector sizes from the column lists instead of hard-coding
    # 14 and 50; the signature then stays valid when the columns change.
    input_steps = 2
    output_steps = 10
    input_size = input_steps * len(feature_columns)
    target_size = output_steps * len(label_columns)

    dataset = tf.data.Dataset.from_generator(
        lambda: csv_sliding_window_generator_ffnn(csv_path, feature_columns, label_columns, feature_scaler, label_scaler, mode),
        output_signature=(
            tf.TensorSpec(shape=(input_size,), dtype=tf.float32),   # X: 2 steps x n_features
            tf.TensorSpec(shape=(target_size,), dtype=tf.float32),  # Y: 10 steps x n_labels
        )
    )
    return dataset

In [5]:
# Columns fed to the network (7 per time step) and columns it predicts (5 per time step).
feature_columns = ['Latitude', 'Longitude', 'VIL', 'MaxdBZ', 'CellVolume', 'UMotion', 'VMotion']  # input column names
label_columns = ['Latitude', 'Longitude', 'VIL', 'MaxdBZ', 'CellVolume']  # target column names

# Folder with the CSV files.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
cells_csv_path = r'H:\cell_data_processed\cells'

# Feature scaler: the two commented lines re-fit it on the training files and
# save it; the active line loads the previously saved scaler instead.
# feature_scaler = fit_scaler_on_training_data(feature_columns, cells_csv_path)
# joblib.dump(feature_scaler, 'ffnn_feature_scaler.gz')  # save the scaler
feature_scaler = joblib.load('ffnn_feature_scaler.gz')  # load the saved scaler

# Label scaler: same pattern as the feature scaler.
# label_scaler = fit_scaler_on_training_data(label_columns, cells_csv_path)
# joblib.dump(label_scaler, 'ffnn_label_scaler.gz')  # save the scaler
label_scaler = joblib.load('ffnn_label_scaler.gz')  # load the saved scaler

# Build the (unbatched) training and validation datasets from the CSV files.
train_dataset = create_csv_sliding_window_dataset_ffnn(
    cells_csv_path, feature_columns, label_columns, feature_scaler, label_scaler, mode='train')

val_dataset = create_csv_sliding_window_dataset_ffnn(
    cells_csv_path, feature_columns, label_columns, feature_scaler, label_scaler, mode='val')

In [6]:
# Show the per-element input/target specs of the training and validation datasets.
print(train_dataset.element_spec)
print(val_dataset.element_spec)

(TensorSpec(shape=(14,), dtype=tf.float32, name=None), TensorSpec(shape=(50,), dtype=tf.float32, name=None))
(TensorSpec(shape=(14,), dtype=tf.float32, name=None), TensorSpec(shape=(50,), dtype=tf.float32, name=None))


In [7]:
# Stop training once val_loss has not improved for 5 epochs and restore the
# weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5,
                               verbose=1, mode='auto', restore_best_weights=True)

# Save the model under weights/new whenever val_loss improves; the epoch number
# and the val_loss value are part of the saved name.
checkpoint = ModelCheckpoint(os.path.join(
    os.getcwd(), 'weights', 'new', 'ffnn_e{epoch:02d}v{val_loss:.4f}'),
    monitor='val_loss', save_best_only=True)

# Define the feed-forward network (FFNN).
model = tf.keras.Sequential([
    # Earlier variant, kept for reference:
    # tf.keras.layers.Dense(32, activation='sigmoid', input_shape=(12,)),  # input: 12 values (2 time steps x 6 variables)
    # tf.keras.layers.Dense(60)  # output: 60 values (12 time steps x 5 variables)

    # Current model
    # Input: 14 values (2 time steps x 7 features)
    tf.keras.layers.Input(shape=(14,), name='ffnn_input'),
    # Hidden layer 1: 64 units, sigmoid activation, followed by BatchNormalization and Dropout(0.2)
    tf.keras.layers.Dense(64, activation='sigmoid', name='ffnn_dense1'),
    tf.keras.layers.BatchNormalization(name='ffnn_bn1'),
    tf.keras.layers.Dropout(0.2, name='ffnn_dropout1'),

    # Hidden layer 2: 64 units, sigmoid activation, followed by BatchNormalization and Dropout(0.2)
    tf.keras.layers.Dense(64, activation='sigmoid', name='ffnn_dense2'),
    tf.keras.layers.BatchNormalization(name='ffnn_bn2'),
    tf.keras.layers.Dropout(0.2, name='ffnn_dropout2'),
    # Output: 50 values (10 time steps x 5 variables), linear activation
    tf.keras.layers.Dense(50, name='ffnn_output')

    # Another earlier variant, kept for reference:
    # tf.keras.layers.Dense(32, activation='sigmoid', input_shape=(4,)), # input: 4 values (2 time steps x 2 variables)
    # tf.keras.layers.Dense(10)  # output: 10 values (5 time steps x 2 variables)
], name='ffnn')

# Compile with the Adam optimizer (learning rate 0.001) and MSE loss; MSE, the
# project's rmse function and MAE are tracked as metrics.
model.compile(optimizer=tf.keras.optimizers.Adam(
    learning_rate=0.001), loss='mse', metrics=['mse', rmse, 'mae'])
model.summary()

Model: "ffnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 ffnn_dense1 (Dense)         (None, 64)                960       
                                                                 
 ffnn_bn1 (BatchNormalizatio  (None, 64)               256       
 n)                                                              
                                                                 
 ffnn_dropout1 (Dropout)     (None, 64)                0         
                                                                 
 ffnn_dense2 (Dense)         (None, 64)                4160      
                                                                 
 ffnn_bn2 (BatchNormalizatio  (None, 64)               256       
 n)                                                              
                                                                 
 ffnn_dropout2 (Dropout)     (None, 64)                0      

In [8]:
# 設定訓練參數
# Training hyper-parameters.
batch_size = 16
epochs = 50

# Batch both datasets and let tf.data prefetch upcoming batches.
# NOTE(review): this cell rebinds train_dataset / val_dataset to their batched
# versions, so running it a second time batches the already batched data;
# re-create the datasets before re-running it.
train_dataset = train_dataset.batch(batch_size)\
                .prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = val_dataset.batch(batch_size)\
                .prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# 開始訓練模型
# Train for at most `epochs` epochs; the early-stopping and checkpoint
# callbacks both monitor val_loss on the validation dataset.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,  # validation split
    epochs=epochs,
    callbacks=[early_stopping, checkpoint],
    verbose=1  # print progress while training
)

In [10]:
# Same column lists, data folder and saved scalers as the training setup,
# repeated here for the evaluation part of the notebook.
feature_columns = ['Latitude', 'Longitude', 'VIL', 'MaxdBZ', 'CellVolume', 'UMotion', 'VMotion']  # input column names
label_columns = ['Latitude', 'Longitude', 'VIL', 'MaxdBZ', 'CellVolume']  # target column names

# Folder with the CSV files.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
cells_csv_path = r'H:\cell_data_processed\cells'

# Load the saved scalers. The two commented lines are an older single-scaler
# variant; `column_name` is not defined anywhere in the visible cells.
# scaler = fit_scaler_on_training_data(column_name, cells_csv_path)
# joblib.dump(scaler, 'ffnn_scaler.gz')  # save the scaler
feature_scaler = joblib.load('ffnn_feature_scaler.gz')  # load the saved scaler
label_scaler = joblib.load('ffnn_label_scaler.gz')  # load the saved scaler

# Build the (unbatched) test dataset from the CSV files.
test_dataset = create_csv_sliding_window_dataset_ffnn(
    cells_csv_path, feature_columns, label_columns, feature_scaler, label_scaler, mode='test')

In [11]:
# Display the per-element input/target specs of the test dataset.
test_dataset.element_spec

(TensorSpec(shape=(14,), dtype=tf.float32, name=None),
 TensorSpec(shape=(50,), dtype=tf.float32, name=None))

In [12]:
batch_size = 16
# Batch the test dataset and let tf.data prefetch upcoming batches.
# NOTE(review): the cell rebinds test_dataset, so running it twice batches the
# already batched data; re-create the dataset before re-running it.
test_dataset = test_dataset.batch(batch_size)\
                .prefetch(tf.data.experimental.AUTOTUNE)

In [13]:
# Load the checkpoint chosen for evaluation. The path is built component by
# component, like the ModelCheckpoint path, so it does not depend on the
# Windows-only backslash separator.
model_path = os.path.join(os.getcwd(), 'weights', 'new', 'ffnn_e08v0.3158')
if os.path.exists(model_path):
    # rmse is a project-defined metric, so Keras needs it passed explicitly to
    # rebuild the compiled model.
    model = load_model(model_path, custom_objects={'rmse': rmse})
    print('Load model successfully!')
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print().
    model.summary()
else:
    # Make the fallback visible: later cells would otherwise silently use
    # whatever `model` is already in memory.
    print(f'Model not found at {model_path}; keeping the model currently in memory (if any).')

Load model successfully!
Model: "ffnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 ffnn_dense1 (Dense)         (None, 64)                960       
                                                                 
 ffnn_bn1 (BatchNormalizatio  (None, 64)               256       
 n)                                                              
                                                                 
 ffnn_dropout1 (Dropout)     (None, 64)                0         
                                                                 
 ffnn_dense2 (Dense)         (None, 64)                4160      
                                                                 
 ffnn_bn2 (BatchNormalizatio  (None, 64)               256       
 n)                                                              
                                                                 
 ffnn_dropout2 (Dropout)     (None, 6

In [14]:
# evaluation = model.evaluate(test_dataset)

# print(f'Total Loss: {evaluation[0]:.5f}')         # 總損失

# print(f'MSE: {evaluation[1]:.5f}')       # MSE

# print(f'RMSE: {evaluation[2]:.5f}')      # RMSE

# print(f'MAE: {evaluation[3]:.5f}')       # MAE

In [15]:
# Files used for the final evaluation: the test months (2022-10 .. 2022-12)
# plus the individual cases that were kept out of the training split.
# NOTE(review): '20220904' starts with a validation-month prefix (202209) and
# the data generator removes these cases only in 'train' mode, so that case
# also takes part in validation; confirm this overlap is intended.
train_cases_to_exclude = {'20210116', '20210530',
                          '20210825', '20210722', '20220904'}
_evaluation_prefixes = ('202210', '202211', '202212') + tuple(train_cases_to_exclude)
test_files = [
    file_name for file_name in os.listdir(cells_csv_path)
    if file_name.startswith(_evaluation_prefixes)
]

In [None]:
import os
import numpy as np
import pandas as pd
from haversine import haversine
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Position error over all 10 forecast steps.
# For each evaluation file: build the same 2-row-in / 10-row-out windows as
# used in training, predict, undo the label scaling and compare the predicted
# Latitude/Longitude with the recorded values.
input_steps = 2     # rows fed to the network per sample
output_steps = 10   # rows predicted per sample
window = input_steps + output_steps

all_true = []
all_pred = []

for csv_file in test_files:
    csv_file_path = os.path.join(cells_csv_path, csv_file)
    df = pd.read_csv(csv_file_path)

    if len(df) < window:
        continue  # too short for even one window

    df = df.replace(-99900, 0)

    scaled_x = feature_scaler.transform(df[feature_columns].values)
    y = df[label_columns].values
    n_windows = len(df) - window + 1

    # (N, 2 * n_features): two consecutive scaled rows flattened per sample.
    inputs = np.stack([scaled_x[i:i + input_steps].reshape(-1) for i in range(n_windows)])
    # (N, 10, n_labels): the recorded values are used directly; scaling them
    # and immediately inverting the scaling would only add rounding noise.
    true_unscaled = np.stack([y[i + input_steps:i + window] for i in range(n_windows)])

    preds = model.predict(inputs)  # (N, 10 * n_labels)
    preds_unscaled = label_scaler.inverse_transform(
        preds.reshape(-1, len(label_columns))
    ).reshape(-1, output_steps, len(label_columns))

    # Keep only Latitude and Longitude (label columns 0 and 1).
    all_pred.append(preds_unscaled[:, :, :2])
    all_true.append(true_unscaled[:, :, :2])

if not all_pred:
    # np.concatenate would fail with an unhelpful message on an empty list.
    raise ValueError(f'No evaluation file has at least {window} rows; nothing to evaluate.')

# Merge the per-file results.
all_pred = np.concatenate(all_pred, axis=0)  # (total_samples, 10, 2)
all_true = np.concatenate(all_true, axis=0)  # (total_samples, 10, 2)

# Split into latitude and longitude.
pred_lats = all_pred[:, :, 0]
pred_lngs = all_pred[:, :, 1]
true_lats = all_true[:, :, 0]
true_lngs = all_true[:, :, 1]

# MSE / MAE / RMSE over every sample and forecast step.
mse_lat = mean_squared_error(true_lats.flatten(), pred_lats.flatten())
mse_lng = mean_squared_error(true_lngs.flatten(), pred_lngs.flatten())
mae_lat = mean_absolute_error(true_lats.flatten(), pred_lats.flatten())
mae_lng = mean_absolute_error(true_lngs.flatten(), pred_lngs.flatten())
rmse_lat = np.sqrt(mse_lat)
rmse_lng = np.sqrt(mse_lng)

print('=== 經緯度位置（還原後）總體評估 ===')
# MSE is a squared quantity, so its unit is degrees squared.
print(f'Latitude MSE: {mse_lat:.6f} 度²')
print(f'Longitude MSE: {mse_lng:.6f} 度²')
print(f'Latitude MAE: {mae_lat:.6f} 度')
print(f'Longitude MAE: {mae_lng:.6f} 度')
print(f'Latitude RMSE: {rmse_lat:.6f}')
print(f'Longitude RMSE: {rmse_lng:.6f}')

# === Mean Haversine distance ===
# haversine_vector handles all (lat, lng) pairs in one call instead of one
# Python-level haversine() call per point.
from haversine import haversine_vector

haversine_distances = haversine_vector(all_true.reshape(-1, 2), all_pred.reshape(-1, 2))

avg_haversine_error = np.mean(haversine_distances)
print(f'Average Haversine distance error: {avg_haversine_error:.6f} km')


=== 經緯度位置（還原後）總體評估 ===
Latitude MSE: 0.027096 度
Longitude MSE: 0.042968 度
Latitude MAE: 0.123466 度
Longitude MAE: 0.165852 度
Latitude RMSE: 0.164608
Longitude RMSE: 0.207287
Average Haversine distance error: 23.628181 km


In [None]:
import os
import numpy as np
import pandas as pd
from haversine import haversine
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Position error of the first forecast step only.
# Same procedure as the all-steps evaluation (2-row-in / 10-row-out windows,
# predict, undo the label scaling), but only forecast step 1 is compared.
input_steps = 2     # rows fed to the network per sample
output_steps = 10   # rows predicted per sample
window = input_steps + output_steps

all_true = []
all_pred = []

for csv_file in test_files:
    csv_file_path = os.path.join(cells_csv_path, csv_file)
    df = pd.read_csv(csv_file_path)

    if len(df) < window:
        continue  # too short for even one window

    df = df.replace(-99900, 0)

    scaled_x = feature_scaler.transform(df[feature_columns].values)
    y = df[label_columns].values
    n_windows = len(df) - window + 1

    # (N, 2 * n_features): two consecutive scaled rows flattened per sample.
    inputs = np.stack([scaled_x[i:i + input_steps].reshape(-1) for i in range(n_windows)])
    # (N, 10, n_labels): the recorded values are used directly; scaling them
    # and immediately inverting the scaling would only add rounding noise.
    true_unscaled = np.stack([y[i + input_steps:i + window] for i in range(n_windows)])

    preds = model.predict(inputs)  # (N, 10 * n_labels)
    preds_unscaled = label_scaler.inverse_transform(
        preds.reshape(-1, len(label_columns))
    ).reshape(-1, output_steps, len(label_columns))

    # Keep Latitude and Longitude (label columns 0 and 1) of forecast step 1.
    all_pred.append(preds_unscaled[:, 0, :2])
    all_true.append(true_unscaled[:, 0, :2])

if not all_pred:
    # np.concatenate would fail with an unhelpful message on an empty list.
    raise ValueError(f'No evaluation file has at least {window} rows; nothing to evaluate.')

# Merge the per-file results.
all_pred = np.concatenate(all_pred, axis=0)  # (total_samples, 2)
all_true = np.concatenate(all_true, axis=0)  # (total_samples, 2)

# Split into latitude and longitude.
pred_lats = all_pred[:, 0]
pred_lngs = all_pred[:, 1]
true_lats = all_true[:, 0]
true_lngs = all_true[:, 1]

# MSE / MAE / RMSE over every sample.
mse_lat = mean_squared_error(true_lats, pred_lats)
mse_lng = mean_squared_error(true_lngs, pred_lngs)
mae_lat = mean_absolute_error(true_lats, pred_lats)
mae_lng = mean_absolute_error(true_lngs, pred_lngs)
rmse_lat = np.sqrt(mse_lat)
rmse_lng = np.sqrt(mse_lng)

print('=== 第一個預測步的經緯度位置評估 ===')
# MSE is a squared quantity, so its unit is degrees squared.
print(f'Latitude MSE: {mse_lat:.6f} 度²')
print(f'Longitude MSE: {mse_lng:.6f} 度²')
print(f'Latitude MAE: {mae_lat:.6f} 度')
print(f'Longitude MAE: {mae_lng:.6f} 度')
print(f'Latitude RMSE: {rmse_lat:.6f}')
print(f'Longitude RMSE: {rmse_lng:.6f}')

# === Haversine distance ===
# haversine_vector handles all (lat, lng) pairs in one call instead of one
# Python-level haversine() call per point.
from haversine import haversine_vector

haversine_distances = haversine_vector(all_true, all_pred)

avg_haversine_error = np.mean(haversine_distances)
print(f'Average Haversine distance error: {avg_haversine_error:.6f} km')


=== 第一個預測步的經緯度位置（還原後）評估 ===
Latitude MSE: 0.017899 度
Longitude MSE: 0.023575 度
Latitude MAE: 0.101858 度
Longitude MAE: 0.117315 度
Latitude RMSE: 0.133789
Longitude RMSE: 0.153542
Average Haversine distance error: 17.684342 km
