In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.stattools import adfuller

def _check_stationarity(series, name, max_diff=2):
    """
    Difference `series` until the ADF test rejects a unit root.

    Args:
        series (array-like): 1-D numeric series to test.
        name (str): Label used in the progress messages and the error.
        max_diff (int): Maximum number of differences to try.

    Returns:
        tuple: (series after differencing, number of differences applied).

    Raises:
        ValueError: If the series is still non-stationary (p > 0.05) after
            `max_diff` differences.
    """
    current = np.array(series, dtype=float)  # copy so the caller's data is untouched
    for diff_order in range(max_diff + 1):
        p_value = adfuller(current)[1]
        if p_value <= 0.05:
            print(f"{name} sudah stasioner setelah differencing {diff_order} kali.")
            return current, diff_order
        # Only difference again if another attempt is still allowed; the
        # previous loop differenced (and announced it) once more before raising.
        if diff_order < max_diff:
            print(f"{name} tidak stasioner, differencing ke-{diff_order+1}.")
            current = np.diff(current)
    raise ValueError(f"{name} tidak stasioner setelah {max_diff} kali differencing. Mungkin butuh transformasi lain.")


def _align_to_target(series, diff_order, target_diff_order):
    """
    Line up a differenced exogenous series with the rows of the target.

    `np.diff` removes one *leading* observation per differencing pass, so a
    series differenced `k` times starts `k` time steps later than the raw data.
    The result always has the same length as the (differenced) target.
    """
    lead = diff_order - target_diff_order
    if lead > 0:
        # The first `lead` target rows have no defined difference. They are
        # filled with 0.0 ("no observed change") rather than NaN because the
        # frame is passed to SARIMAX as `exog` downstream.
        return np.concatenate([np.zeros(lead), series])
    if lead < 0:
        # The exogenous series starts earlier than the target: drop the surplus.
        return series[-lead:]
    return series


def preprocess_data(data_path):
    """
    Preprocess the time-series dataset for SARIMAX modelling.

    Steps: load the CSV, min-max scale the target (`jumlah_kasus`) and the
    exogenous columns (`Tavg`, `RH_avg`, `RR`), difference each series until it
    passes the ADF test, align the exogenous columns row-by-row with the
    target, and split chronologically 80/20.

    Args:
        data_path (str): Path (or file-like object accepted by
            `pandas.read_csv`) of the CSV dataset.

    Returns:
        tuple: (train_data, test_data, train_exog, test_exog, scaler_y,
        scaler_x, target_diff_order, exog_diff_orders) where the data splits
        are the scaled/differenced target (ndarray) and exogenous variables
        (DataFrame), the scalers are the fitted scaler objects, and the
        diff orders record how many differences were applied to the target
        (int) and to each exogenous column (dict keyed by column name).

    Raises:
        ValueError: If any series is still non-stationary after two differences.
    """

    # Load dataset and index it by period
    df = pd.read_csv(data_path)
    df['periode'] = pd.to_datetime(df['periode'])
    df = df.set_index('periode')

    # Target series and exogenous variables
    time_series = df['jumlah_kasus']
    exog_data = df[['Tavg', 'RH_avg', 'RR']]

    # Scaling
    # NOTE(review): both scalers are fitted on the full series, i.e. before the
    # train/test split below, so test-period extremes influence the scaling of
    # the training data.
    scaler_y = MinMaxScaler()
    scaler_x = MinMaxScaler()
    scaled_time_series = scaler_y.fit_transform(time_series.values.reshape(-1, 1)).flatten()
    scaled_exog_data = scaler_x.fit_transform(exog_data)

    # Stationarity check for the target
    # NOTE(review): when target_diff_order > 0 the returned target is a
    # differenced series; callers must integrate forecasts before applying
    # scaler_y.inverse_transform.
    scaled_time_series, target_diff_order = _check_stationarity(scaled_time_series, "Target (Jumlah Kasus)")

    # Stationarity check per exogenous column, then align each column with the
    # target rows. Previously shorter columns were padded with NaN at the END,
    # which shifted them in time and put NaN into the test exog.
    exog_columns = {}
    exog_diff_orders = {}
    for i, col in enumerate(exog_data.columns):
        exog_series, diff_order = _check_stationarity(scaled_exog_data[:, i], f"Exogenous ({col})")
        exog_columns[col] = _align_to_target(exog_series, diff_order, target_diff_order)
        exog_diff_orders[col] = diff_order

    exogenous_scaled_processed = pd.DataFrame(exog_columns)

    # Chronological train/test split (first 80% of the target rows for training)
    train_size = int(len(scaled_time_series) * 0.8)
    train_data = scaled_time_series[:train_size]
    test_data = scaled_time_series[train_size:]
    train_exog = exogenous_scaled_processed.iloc[:train_size]
    test_exog = exogenous_scaled_processed.iloc[train_size:]
    return train_data, test_data, train_exog, test_exog, scaler_y, scaler_x, target_diff_order, exog_diff_orders


if __name__ == '__main__':
    # Smoke run: preprocess the dataset and report the size of each split
    # together with the differencing orders that were applied.
    data_path = r"C:\Users\ASUS\Downloads\ARIMA\data_kasus_dbd_dki_jakarta_2015_2020.csv"
    (train_data, test_data, train_exog, test_exog,
     scaler_y, scaler_x, target_diff_order, exog_diff_orders) = preprocess_data(data_path)

    # One line per split, in the same order as the returned tuple.
    print("\n".join(
        f"Shape of {label}: {split.shape}"
        for label, split in (
            ("train_data", train_data),
            ("test_data", test_data),
            ("train_exog", train_exog),
            ("test_exog", test_exog),
        )
    ))
    print("Target diff order:", target_diff_order)
    print("Exog diff orders:", exog_diff_orders)

Target (Jumlah Kasus) sudah stasioner setelah differencing 0 kali.
Exogenous (Tavg) tidak stasioner, differencing ke-1.
Exogenous (Tavg) sudah stasioner setelah differencing 1 kali.
Exogenous (RH_avg) sudah stasioner setelah differencing 0 kali.
Exogenous (RR) sudah stasioner setelah differencing 0 kali.
Shape of train_data: (57,)
Shape of test_data: (15,)
Shape of train_exog: (57, 3)
Shape of test_exog: (15, 3)
Target diff order: 0
Exog diff orders: {'Tavg': 1, 'RH_avg': 0, 'RR': 0}


In [3]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import pickle
import matplotlib.pyplot as plt
from preprocessing import preprocess_data # import fungsi preprocessing

# Run the shared preprocessing to obtain the scaled splits, the fitted scalers
# and the differencing orders.
data_path = r"C:\Users\ASUS\Downloads\ARIMA\data_kasus_dbd_dki_jakarta_2015_2020.csv"
(train_data, test_data, train_exog, test_exog,
 scaler_y, scaler_x, target_diff_order, exog_diff_orders) = preprocess_data(data_path)

# Re-read the raw CSV so the unscaled case counts (indexed by period) are
# available for evaluation and plotting.
df = pd.read_csv(data_path)
df = df.assign(periode=pd.to_datetime(df['periode'])).set_index('periode')
time_series = df['jumlah_kasus']

# Grid Search for Optimal Parameters
from itertools import product

def grid_search_arima(train_data, train_exog):
    """
    Grid-search SARIMAX orders and return the best-scoring combination.

    Every (p, d, q) in 0..2 is combined with every seasonal (P, D, Q) in 0..1
    at a seasonal period of 12. Each candidate is fitted on the training data
    and scored by the mean squared error between `train_data` and the model's
    in-sample fitted values; the lowest score wins.

    NOTE(review): the score is purely in-sample, so it cannot penalise
    over-parameterised models; consider an information criterion or a
    validation split.

    Args:
        train_data: Training target series passed to SARIMAX as `endog`.
        train_exog: Training exogenous variables passed to SARIMAX as `exog`.

    Returns:
        tuple: (order, seasonal_order) of the best candidate.

    Raises:
        RuntimeError: If no candidate could be fitted and scored. The last
            underlying failure is chained as the cause.
    """
    p = d = q = range(0, 3)
    P = D = Q = range(0, 2)
    seasonal_m = [12]

    best_score = float('inf')
    best_params = None
    last_error = None
    for order in product(p, d, q):
        for seasonal_order in product(P, D, Q, seasonal_m):
            try:
                model = SARIMAX(train_data, exog=train_exog, order=order, seasonal_order=seasonal_order,
                                enforce_stationarity=False, enforce_invertibility=False)
                result = model.fit(disp=False)
                score = mean_squared_error(train_data, result.fittedvalues)
            except Exception as exc:
                # Skip candidates that cannot be fitted or scored, but let
                # KeyboardInterrupt/SystemExit through so the search can be
                # interrupted.
                last_error = exc
                continue
            if score < best_score:
                best_score = score
                best_params = (order, seasonal_order)

    if best_params is None:
        # Fail here with the real cause instead of returning None, which the
        # caller would hit as an unrelated tuple-unpacking TypeError.
        raise RuntimeError(
            "Grid search failed: no SARIMAX candidate could be fitted and scored."
        ) from last_error
    return best_params

import os  # used only to make sure the model output directory exists

# Select the SARIMAX orders on the training split
best_order, best_seasonal_order = grid_search_arima(train_data, train_exog)
print(f"Best Parameters: {best_order}, Seasonal: {best_seasonal_order}")

# Train Final Model
model = SARIMAX(train_data, exog=train_exog, order=best_order, seasonal_order=best_seasonal_order,
                enforce_stationarity=False, enforce_invertibility=False)
result = model.fit(disp=False)

# Save Model together with the scalers and differencing orders needed to
# reproduce the preprocessing at prediction time.
model_save_path = r"C:\Users\ASUS\Aplikasi_Skripsi\saved_models\sarimax_model_with_weather.pkl"
# Create the target directory first so a missing folder does not throw away
# the (expensive) grid search and fit above.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
with open(model_save_path, "wb") as f:
    pickle.dump({"model": result, "scaler_y": scaler_y, "scaler_x": scaler_x,
                 "target_diff_order": target_diff_order, "exog_diff_orders": exog_diff_orders}, f)
print(f"Model saved at {model_save_path}")

# Predict Test Data
# NOTE(review): inverse-scaling the forecast directly is only valid when
# target_diff_order == 0; a differenced target would have to be integrated
# back first.
forecast_steps = len(test_data)
forecast = result.get_forecast(steps=forecast_steps, exog=test_exog)
# The model is fitted on a NumPy endog, so the forecast outputs may be plain
# ndarrays rather than pandas objects; np.asarray accepts either, whereas
# `.values` / `.iloc` only exist on pandas containers.
forecast_values = np.asarray(forecast.predicted_mean, dtype=float)
forecast_values_original_scale = scaler_y.inverse_transform(forecast_values.reshape(-1, 1)).flatten()

# Confidence Interval (column 0 = lower bound, column 1 = upper bound)
conf_int = np.asarray(forecast.conf_int(), dtype=float)

lower_bound = scaler_y.inverse_transform(conf_int[:, 0].reshape(-1, 1)).flatten()
upper_bound = scaler_y.inverse_transform(conf_int[:, 1].reshape(-1, 1)).flatten()

# Actual Values: the raw observations that follow the training rows
actual_values = time_series.iloc[len(train_data):].values

# Model evaluation on the original scale
mse = mean_squared_error(actual_values, forecast_values_original_scale)
rmse = np.sqrt(mse)
mae = mean_absolute_error(actual_values, forecast_values_original_scale)
r2 = r2_score(actual_values, forecast_values_original_scale)
mape = mean_absolute_percentage_error(actual_values, forecast_values_original_scale) * 100

print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R2: {r2:.4f}")
print(f"MAPE: {mape:.2f}%")

# Plot training data, actual test data, forecast and its confidence band
train_index = time_series.index[:len(train_data)]
test_index = time_series.index[len(train_data):]

plt.figure(figsize=(12, 6))
plt.plot(train_index, scaler_y.inverse_transform(train_data.reshape(-1, 1)).flatten(),
         label="Train Data")
plt.plot(test_index, actual_values, label="Actual Test Data")
plt.plot(test_index, forecast_values_original_scale, label="Predicted Test Data")
plt.fill_between(test_index, lower_bound, upper_bound, color="orange", alpha=0.2,
                 label="Confidence Interval")
plt.legend()
plt.title("SARIMAX Model with Weather Data - Actual vs Predicted")
plt.xlabel("Time")
plt.ylabel("Values")
plt.show()

ModuleNotFoundError: No module named 'preprocessing'