In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pmdarima as pm
from scipy.stats import boxcox
from scipy.special import inv_boxcox

# Additional function for outlier detection using Z-score
def remove_outliers_z_score(data, threshold=3):
    z_scores = np.abs(stats.zscore(data))
    return data[(z_scores < threshold)]

# Read gold price data from a csv file
csv_file = 'SeaData\\bias\大西洋暖流(50W,31.3N).csv'
data = pd.read_csv(csv_file, index_col='date', parse_dates=True)

# Prepare the data
data = data['biastg']
data = data.asfreq('B').fillna(method='ffill')  # Fill missing values

# Additional feature engineering
# Add lagged values and rolling window statistics
data = data.to_frame()
for i in range(1, 4):
    data[f"lag_{i}"] = data["biastg"].shift(i)
    data[f"rolling_mean_{i}"] = data["biastg"].rolling(window=i).mean()
    data[f"rolling_std_{i}"] = data["biastg"].rolling(window=i).std()

# Remove rows with missing values (due to lagged features and rolling window)
data = data.dropna()

# Outlier detection and removal using Z-score
data_no_outliers = remove_outliers_z_score(data, threshold=3)

print(f"Data points remaining after outlier removal: {len(data_no_outliers)}")


# Data transformation
# Apply Box-Cox transformation
data_transformed, lambda_value = boxcox(data_no_outliers["biastg"])

# Split the data into training and testing sets
train_data = data_transformed[:int(0.8 * len(data_transformed))]
test_data = data_transformed[int(0.8 * len(data_transformed)):]

# Perform stepwise search
stepwise_model = pm.auto_arima(train_data, start_p=0, start_q=0, start_P=0, start_Q=0,
                               max_p=3, max_q=3, max_P=3, max_Q=3, seasonal=True,
                               stepwise=True, suppress_warnings=True, D=1, max_D=1,
                               error_action='ignore', trace=True, m=12,
                               exogenous=data_no_outliers.drop(columns=["biastg"]).iloc[:int(0.8 * len(data_transformed)), :])

print(f"Best SARIMAX parameters: {stepwise_model.order}, {stepwise_model.seasonal_order}")

# Fit the best model and make predictions
best_model = SARIMAX(data_transformed, order=stepwise_model.order, seasonal_order=stepwise_model.seasonal_order,
                     exog=data_no_outliers.drop(columns=["biastg"]))
best_results = best_model.fit()
predictions = best_results.predict(start=test_data.index[0], end=test_data.index[-1],
                                    exog=data_no_outliers.drop(columns=["biastg"]).iloc[int(0.8 * len(data_transformed)):, :])

# Invert Box-Cox transformation
predictions_inv = inv_boxcox(predictions, lambda_value)

# Evaluate the model
mae = mean_absolute_error(data_no_outliers["biastg"].iloc[int(0.8 * len(data_transformed)):], predictions_inv)


Data points remaining after outlier removal: 0


ValueError: not enough values to unpack (expected 2, got 0)