Forecast Accuracy:
2024-08-15 16:59:27,482 - INFO - RMSE: 32675064.6888
2024-08-15 16:59:27,483 - INFO - MAE: 29282264.3667

In [1]:
import pandas as pd
import numpy as np

# Load the raw observations, parsing 'date' as timestamps, and index the frame by it
df = pd.read_csv(
    '/root/Download/AlgaeBloomForecast/wuguishan.csv',
    parse_dates=['date'],
).set_index('date')

# Collapse to one row per calendar day by averaging that day's readings
daily_df = df.resample('D').mean()

# Fill missing days by linear interpolation, then turn the date index back into a column
daily_df_interpolated = daily_df.interpolate(method='linear').reset_index()

# Render the dates back as 'YYYY/MM/DD' strings
daily_df_interpolated['date'] = daily_df_interpolated['date'].dt.strftime('%Y/%m/%d')

# Persist the smoothed daily series to a new CSV file (no index column)
daily_df_interpolated.to_csv('smoothed_data.csv', index=False)

# Show the first few rows of the result
print(daily_df_interpolated.head())

         date       temp    oxygen       NH3        TP        TN         algae
0  2021/06/02  26.187500  6.666500  0.025000  0.068275  1.073250  1.440000e+07
1  2021/06/03  25.881667  6.641833  0.025117  0.063783  0.915167  1.086709e+07
2  2021/06/04  25.895000  7.946333  0.025000  0.063783  0.920333  2.549842e+07
3  2021/06/05  26.850000  9.084000  0.025000  0.047760  0.905800  2.110000e+07
4  2021/06/06  28.256667  9.514333  0.025000  0.044067  0.923333  1.521134e+07


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from prophet import Prophet
import matplotlib.pyplot as plt
from datetime import timedelta
import logging
import torch
import os
from sklearn.metrics import mean_squared_error  # 添加这一行
from sklearn.metrics import mean_absolute_error  # 添加这一行


# Configure the root logger: INFO level, each record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def preprocess_data(df):
    """Min-max scale the feature columns and add a one-step temperature lag.

    Side effect: ``df`` is modified in place. A ``ds`` column is built from
    ``pd.to_datetime(df['date'])`` and installed as the index of the
    caller's frame.

    Args:
        df: DataFrame with a ``date`` column plus the columns
            ``temp``, ``oxygen``, ``NH3``, ``TP``, ``TN`` and ``algae``.

    Returns:
        tuple ``(df_scaled, features, scaler)``:
            * ``df_scaled`` - scaled features indexed by ``ds``, with an
              extra ``temp_lag`` column (scaled ``temp`` shifted by one row);
              rows containing any NaN are dropped, which always removes the
              first row because its lag is undefined.
            * ``features`` - list of the column names that were scaled.
            * ``scaler`` - the fitted ``MinMaxScaler``.
    """
    features = ['temp', 'oxygen', 'NH3', 'TP', 'TN', 'algae']

    # These two statements intentionally mutate the caller's frame.
    df['ds'] = pd.to_datetime(df['date'])
    df.set_index('ds', inplace=True)

    # Forward-fill gaps on a new frame; the caller's frame is not filled.
    filled = df.ffill()

    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(filled[features])
    df_scaled = pd.DataFrame(scaled_values, columns=features, index=filled.index)

    # Previous row's (scaled) temperature as an additional regressor.
    df_scaled['temp_lag'] = df_scaled['temp'].shift(1)
    df_scaled = df_scaled.dropna()

    return df_scaled, features, scaler

def prepare_prophet_data(df):
    """Return a copy of ``df`` shaped for Prophet.

    The index is moved into a regular column (named ``ds`` when the index
    carries that name) and the ``algae`` column is renamed to ``y``. The
    input frame is left untouched.

    Args:
        df: DataFrame whose index holds the timestamps and which has an
            ``algae`` column.

    Returns:
        New DataFrame with a default integer index.
    """
    return df.reset_index().rename(columns={'algae': 'y'})

def train_prophet_model(train_df, use_gpu=True):
    """Fit a Prophet model on ``train_df`` using every extra column as a regressor.

    Args:
        train_df: DataFrame with ``ds`` and ``y`` columns; every other
            column is registered as an additional regressor.
        use_gpu: When true and ``torch.cuda.is_available()`` reports a
            device, the ``PROPHET_USE_GPU`` environment variable is set to
            ``'true'``; otherwise it is set to ``'false'``.

    Returns:
        The fitted ``Prophet`` model.
    """
    gpu_requested = use_gpu and torch.cuda.is_available()

    # NOTE(review): the captured run logs "Using GPU" and then fits through
    # CmdStan's L-BFGS optimizer; it is unclear that Prophet reads
    # PROPHET_USE_GPU at all - confirm before relying on this flag.
    if gpu_requested:
        logging.info("Using GPU for training")
    else:
        logging.info("Using CPU for training")
    os.environ['PROPHET_USE_GPU'] = 'true' if gpu_requested else 'false'

    # NOTE(review): daily_seasonality is enabled although the input appears
    # to be one row per day - verify that this component is intended.
    model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=True,
    )

    # Everything except the timestamp and the target becomes a regressor.
    regressor_names = [name for name in train_df.columns if name not in ('ds', 'y')]
    for name in regressor_names:
        model.add_regressor(name)

    model.fit(train_df)
    return model

def forecast_algae(model, future_df):
    """Run ``model.predict`` on ``future_df`` and return its result unchanged.

    Args:
        model: Object exposing a ``predict(frame)`` method (a fitted Prophet
            model in this notebook).
        future_df: Frame passed straight through to ``predict``.

    Returns:
        Whatever ``model.predict(future_df)`` returns.
    """
    return model.predict(future_df)

def inverse_transform_feature(scaler, data, feature_index):
    """Undo the scaling of a single feature column.

    The scaler's ``inverse_transform`` is applied to a full-width matrix, so
    the values are placed in column ``feature_index`` of a zero matrix with
    one column per entry of ``scaler.scale_``; only that column is returned.

    Args:
        scaler: Fitted scaler exposing ``scale_`` and ``inverse_transform``.
        data: 1-D sequence of scaled values for one feature.
        feature_index: Column position of that feature in the scaler's input.

    Returns:
        1-D array with the values mapped back to the original scale.
    """
    n_rows = len(data)
    n_features = len(scaler.scale_)

    padded = np.zeros((n_rows, n_features))
    padded[:, feature_index] = data

    restored = scaler.inverse_transform(padded)
    return restored[:, feature_index]

def plot_forecast(original_data, forecast, feature_name):
    """Plot observed values against the forecast and save the figure as a PNG.

    The output file is ``<feature_name>_forecast.png`` in the current working
    directory, with spaces in ``feature_name`` replaced by underscores (so
    ``'Algae Bloom'`` is written to ``Algae_Bloom_forecast.png``).

    Args:
        original_data: Series of observed values; its index supplies the
            x-axis positions.
        forecast: DataFrame with ``ds``, ``yhat``, ``yhat_lower`` and
            ``yhat_upper`` columns.
        feature_name: Label used for the title, the y-axis and the file name.

    Returns:
        The file name the figure was written to.
    """
    # Keep the title/axis label as given; only the file name is sanitised.
    output_path = f"{str(feature_name).replace(' ', '_')}_forecast.png"

    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        ax.plot(original_data.index, original_data, label='Observed')
        ax.plot(forecast['ds'], forecast['yhat'], label='Forecast', color='red')
        # Shaded band between the lower and upper forecast bounds.
        ax.fill_between(
            forecast['ds'],
            forecast['yhat_lower'],
            forecast['yhat_upper'],
            color='red',
            alpha=0.2,
        )

        ax.set_title(f'{feature_name} Forecast')
        ax.set_xlabel('Date')
        ax.set_ylabel(feature_name)
        ax.legend()
        ax.grid(True)
        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

    return output_path

def evaluate_forecast(y_true, y_pred):
    """Compute RMSE and MAE between observed and predicted values.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        tuple ``(rmse, mae)``: square root of the mean squared error, and
        the mean absolute error.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    return np.sqrt(mse), mae

def main(data_path='/root/Download/AlgaeBloomForecast/smoothed_data.csv'):
    """Train a Prophet model on the smoothed daily data and evaluate a hold-out forecast.

    Steps: read the CSV, forward-fill gaps, scale the features, split the
    rows 80/20 in time order, fit on the first part, predict over the whole
    period, map the predictions back to the original algae scale, log
    RMSE/MAE on the hold-out part and save a plot.

    Any exception is caught and logged together with its traceback, so the
    function itself does not raise.

    Args:
        data_path: CSV file to read; it must contain a ``date`` column and
            the feature columns expected by ``preprocess_data``.
    """
    try:
        df = pd.read_csv(data_path, encoding='utf-8')
        logging.info("Successfully read the file")
        # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
        df = df.ffill()

        # Work on a copy so this frame keeps its 'date' column and default
        # index no matter what preprocess_data does to its argument.
        df_scaled, features, scaler = preprocess_data(df.copy())
        prophet_df = prepare_prophet_data(df_scaled)

        # Split data into train and test sets (chronological, no shuffling)
        train_size = int(len(prophet_df) * 0.8)
        train_df = prophet_df[:train_size]
        test_df = prophet_df[train_size:]
        if train_df.empty or test_df.empty:
            # With an empty test set, iloc[-0:] below would select every row.
            raise ValueError(
                f"Not enough rows ({len(prophet_df)}) to build both a train and a test set"
            )

        # Train the model
        model = train_prophet_model(train_df)

        # Predict on the real timestamps with their own regressor values.
        # Rows stay matched by date rather than by position, and no
        # regressor can end up NaN through misalignment.
        future_df = prophet_df.drop(columns=['y'])
        forecast = forecast_algae(model, future_df)

        # Inverse transform the forecast back to the original algae scale
        algae_index = features.index('algae')
        for column in ('yhat', 'yhat_lower', 'yhat_upper'):
            forecast[column] = inverse_transform_feature(
                scaler, forecast[column].values, algae_index
            )

        # Evaluate the forecast on the hold-out rows (the tail of both frames)
        n_test = len(test_df)
        test_forecast = forecast.iloc[-n_test:]
        y_true = df['algae'].iloc[-n_test:].values
        y_pred = test_forecast['yhat'].values

        rmse, mae = evaluate_forecast(y_true, y_pred)
        logging.info("\nForecast Accuracy:")
        logging.info(f"RMSE: {rmse:.4f}")
        logging.info(f"MAE: {mae:.4f}")

        # These are hold-out predictions, not a forecast beyond the data.
        logging.info("\nAlgae bloom forecast for the last 30 days of the test period:")
        logging.info(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(30))

        # Plot the forecast against the observations on an explicit date index
        original_algae = pd.Series(
            df['algae'].to_numpy(),
            index=pd.to_datetime(df['date']),
            name='algae',
        )
        plot_forecast(original_algae, forecast, 'Algae Bloom')
        logging.info("Forecast plot saved as 'Algae_Bloom_forecast.png'")

    except Exception as e:
        # Best-effort notebook entry point: report the failure instead of raising.
        logging.error(f"An error occurred in main: {str(e)}")
        import traceback
        logging.error(traceback.format_exc())

if __name__ == "__main__":
    main()

2024-08-15 17:01:47,758 - INFO - Successfully read the file
  df = df.fillna(method='ffill')  # 使用前向填充方法填充NaN值
2024-08-15 17:01:47,764 - INFO - Using GPU for training
2024-08-15 17:01:47,782 - DEBUG - input tempfile: /tmp/tmppbqe01su/w8n3xe51.json
2024-08-15 17:01:47,807 - DEBUG - input tempfile: /tmp/tmppbqe01su/ir7l2cf_.json
2024-08-15 17:01:47,808 - DEBUG - idx 0
2024-08-15 17:01:47,809 - DEBUG - running CmdStan, num_threads: None
2024-08-15 17:01:47,809 - DEBUG - CmdStan args: ['/root/.conda/envs/hydro/envs/algae/lib/python3.12/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=98767', 'data', 'file=/tmp/tmppbqe01su/w8n3xe51.json', 'init=/tmp/tmppbqe01su/ir7l2cf_.json', 'output', 'file=/tmp/tmppbqe01su/prophet_modeli2zm03os/prophet_model-20240815170147.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:01:47 - cmdstanpy - INFO - Chain [1] start processing
2024-08-15 17:01:47,809 - INFO - Chain [1] start processing
17:01:47 - cmdstanpy - INFO - Chain [1] 