Forecast Accuracy:
2024-08-15 17:01:39,392 - INFO - RMSE: 32675064.6888
2024-08-15 17:01:39,393 - INFO - MAE: 29282264.3667

原本的CSV字段只有这些 ['temp', 'oxygen', 'NH3', 'TP', 'TN', 'algae']，但现在我们需要制作了一个新的csv：

date,temp,oxygen,NH3,TP,TN,algae,area,weather,max_temperature,min_temperature,aqi,aqiLevel,wind_direction,wind_power,aqiInfo
2021-06-02,26.1875,6.6665,0.025,0.068275,1.07325,14400000.0,无锡,阴-阵雨,26,21,24,1,东南风,4级,优
2021-06-03,25.881666666666664,6.6418333333333335,0.0251166666666666,0.0637833333333333,0.9151666666666666,10867091.666666666,无锡,阴-阵雨,26,19,66,2,西北风,3级,良
2021-06-04,25.895,7.946333333333333,0.025,0.0637833333333333,0.9203333333333332,25498423.33333333,无锡,阴-多云,26,18,51,2,西南风,3级,良
2021-06-05,26.85,9.084,0.025,0.04776,0.9058,21100000.0,无锡,晴,32,19,67,2,西南风,3级,良
2021-06-06,28.256666666666664,9.514333333333331,0.025,0.0440666666666666,0.9233333333333332,15211340.0,无锡,晴,33,19,80,2,南风,3级,良
2021-06-07,27.635,8.3865,0.025,0.0366499999999999,0.7778333333333333,7994458.333333333,无锡,阴-多云,35,21,68,2,东南风,3级,良
2021-06-08,28.19666666666667,8.397499999999999,0.025,0.0418666666666666,0.7323333333333334,12259158.333333334,无锡,阴-多云,30,24,36,1,东南风,3级,优
2021-06-09,28.751666666666665,8.309166666666668,0.025,0.0389833333333333,0.601,6891956.666666667,无锡,阴-雷阵雨,32,24,52,2,东南风,3级,良
2021-06-10,28.741666666666664,7.385833333333333,0.025,0.03785,0.5256666666666666,6301236.666666667,无锡,阴,28,24,38,1,东南风,2级,优
2021-06-11,29.491666666666664,7.6176666666666675,0.025,0.0327666666666666,0.4495,6244151.666666667,无锡,阴-多云,32,23,82,2,东风,2级,良
2021-06-12,29.58666666666667,7.271999999999999,0.025,0.02975,0.3741666666666667,4201731.666666667,无锡,多云-雷阵雨,33,24,41,1,东南风,3级,优
2021-06-13,29.563333333333333,6.929333333333333,0.025,0.0302833333333333,0.2663333333333333,4964940.0,无锡,阴-小雨,28,25,34,1,西南风,2级,优
2021-06-14,29.58833333333333,6.963166666666666,0.025,0.0290666666666666,0.1886666666666666,5394340.0,无锡,阴-小雨,31,25,46,1,东南风,3级,优
2021-06-15,30.21,7.23925,0.025,0.033425,0.396,6927237.5,无锡,阴-小雨,33,24,48,1,西南风,3级,优

这里我们要用Prophet预测藻类的爆发。这里需要考虑weather这一列出现“晴”的影响，以及“晴”滞后0~4天的影响。以及温度的影响，这里，只采用temp作为特征，考虑温度滞后10~20天的影响，而不考虑max_temperature,min_temperature。这里的aqi,aqiLevel，aqiInfo与空气质量相关，也不考虑。wind_power大于4级时是一个需要考虑的因素，滞后2~6天为负相关，滞后10~16天为正相关。wind_direction也不考虑。

请你修改下面的代码

# 使用Prophet预测藻类的爆发

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from prophet import Prophet
import matplotlib.pyplot as plt
from datetime import timedelta
import logging
import torch
import os
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def preprocess_data(df):
    df['ds'] = pd.to_datetime(df['date'])
    df.set_index('ds', inplace=True)
    df = df.ffill()

    # Convert wind_power to numeric by removing the '级' character and converting to integer
    df['wind_power'] = df['wind_power'].str.replace('级', '').astype(int)

    # Add binary feature for sunny weather
    df['sunny'] = df['weather'].apply(lambda x: 1 if '晴' in x else 0)

    # Add lagged sunny weather
    for lag in range(5):
        df[f'sunny_lag_{lag}'] = df['sunny'].shift(lag)

    # Add lagged temperature
    for lag in range(10, 21):
        df[f'temp_lag_{lag}'] = df['temp'].shift(lag)

    # Add binary feature for wind power > 4
    df['wind_power_gt_4'] = df['wind_power'].apply(lambda x: 1 if x > 4 else 0)

    # Add lagged wind power effect
    for lag in range(2, 7):
        df[f'wind_power_gt_4_neg_lag_{lag}'] = df['wind_power_gt_4'].shift(lag)
    for lag in range(10, 17):
        df[f'wind_power_gt_4_pos_lag_{lag}'] = df['wind_power_gt_4'].shift(lag)

    features = ['temp', 'oxygen', 'NH3', 'TP', 'TN', 'algae']
    df_features = df[features + [col for col in df.columns if 'lag' in col or 'sunny' in col or 'wind_power_gt_4' in col]]

    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df_features), columns=df_features.columns, index=df.index)

    df_scaled.dropna(inplace=True)

    return df_scaled, features, scaler

def prepare_prophet_data(df):
    prophet_df = df.reset_index()
    prophet_df = prophet_df.rename(columns={'ds': 'ds', 'algae': 'y'})
    return prophet_df

def train_prophet_model(train_df, use_gpu=True):
    if use_gpu and torch.cuda.is_available():
        logging.info("Using GPU for training")
        os.environ['PROPHET_USE_GPU'] = 'true'
    else:
        logging.info("Using CPU for training")
        os.environ['PROPHET_USE_GPU'] = 'false'

    model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=True
    )

    # Add regressors
    for col in train_df.columns:
        if col not in ['ds', 'y']:
            model.add_regressor(col)

    model.fit(train_df)

    return model

# ... rest of the code remains unchanged ...

def forecast_algae(model, future_df):
    forecast = model.predict(future_df)
    return forecast

def inverse_transform_feature(scaler, data, feature_index):
    dummy = np.zeros((len(data), len(scaler.scale_)))
    dummy[:, feature_index] = data
    return scaler.inverse_transform(dummy)[:, feature_index]

def plot_forecast(original_data, forecast, feature_name):
    plt.figure(figsize=(12, 6))
    plt.plot(original_data.index, original_data, label='Observed')
    plt.plot(forecast['ds'], forecast['yhat'], label='Forecast', color='red')
    plt.fill_between(forecast['ds'], forecast['yhat_lower'], forecast['yhat_upper'], color='red', alpha=0.2)
    
    plt.title(f'{feature_name} Forecast')
    plt.xlabel('Date')
    plt.ylabel(feature_name)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'{feature_name}_forecast.png')
    plt.close()

def evaluate_forecast(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return rmse, mae

def main():
    try:
        df = pd.read_csv('/root/Download/AlgaeBloomForecast/merged_data.csv', encoding='utf-8')
        logging.info("Successfully read the file")
        df = df.fillna(method='ffill')  # 使用前向填充方法填充NaN值
        
        df_scaled, features, scaler = preprocess_data(df)
        prophet_df = prepare_prophet_data(df_scaled)
        
        # Split data into train and test sets
        train_size = int(len(prophet_df) * 0.8)
        train_df = prophet_df[:train_size]
        test_df = prophet_df[train_size:]
        
        # Train the model
        model = train_prophet_model(train_df)
        
        # Forecast for the test period
        future_df = model.make_future_dataframe(periods=len(test_df))
        for col in prophet_df.columns:
            if col not in ['ds', 'y']:
                future_df[col] = prophet_df[col].reindex(future_df.index)
        
        forecast = forecast_algae(model, future_df)
        
        # Inverse transform the forecast
        algae_index = features.index('algae')
        forecast['yhat'] = inverse_transform_feature(scaler, forecast['yhat'].values, algae_index)
        forecast['yhat_lower'] = inverse_transform_feature(scaler, forecast['yhat_lower'].values, algae_index)
        forecast['yhat_upper'] = inverse_transform_feature(scaler, forecast['yhat_upper'].values, algae_index)
        
        # Evaluate the forecast
        test_forecast = forecast.iloc[-len(test_df):]
        y_true = df['algae'].iloc[-len(test_df):].values
        y_pred = test_forecast['yhat'].values
        
        rmse, mae = evaluate_forecast(y_true, y_pred)
        logging.info(f"\nForecast Accuracy:")
        logging.info(f"RMSE: {rmse:.4f}")
        logging.info(f"MAE: {mae:.4f}")
        
        logging.info("\nAlgae bloom forecast for the next 30 days:")
        logging.info(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(30))
        
        # Plot the forecast
        original_algae = df['algae']
        plot_forecast(original_algae, forecast, 'Algae Bloom')
        logging.info("Forecast plot saved as 'Algae_Bloom_forecast2.png'")
        
    except Exception as e:
        logging.error(f"An error occurred in main: {str(e)}")
        import traceback
        logging.error(traceback.format_exc())

if __name__ == "__main__":
    main()

2024-08-15 16:56:28,043 - INFO - Successfully read the file


  df = df.fillna(method='ffill')  # 使用前向填充方法填充NaN值
2024-08-15 16:56:31,359 - INFO - Using GPU for training
2024-08-15 16:56:31,440 - DEBUG - input tempfile: /tmp/tmplw2du7zn/o12qc8eo.json
2024-08-15 16:56:31,479 - DEBUG - input tempfile: /tmp/tmplw2du7zn/8p2f0qcq.json
2024-08-15 16:56:31,481 - DEBUG - idx 0
2024-08-15 16:56:31,482 - DEBUG - running CmdStan, num_threads: None
2024-08-15 16:56:31,482 - DEBUG - CmdStan args: ['/root/.conda/envs/hydro/envs/algae/lib/python3.12/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=92705', 'data', 'file=/tmp/tmplw2du7zn/o12qc8eo.json', 'init=/tmp/tmplw2du7zn/8p2f0qcq.json', 'output', 'file=/tmp/tmplw2du7zn/prophet_modell1a5h8o4/prophet_model-20240815165631.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
16:56:31 - cmdstanpy - INFO - Chain [1] start processing
2024-08-15 16:56:31,483 - INFO - Chain [1] start processing
16:56:31 - cmdstanpy - INFO - Chain [1] done processing
2024-08-15 16:56:31,594 - INFO - Chain [1] d

# 使用Prophet预测藻类的爆发

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from prophet import Prophet
import matplotlib.pyplot as plt
from datetime import timedelta
import logging
import torch
import os
from sklearn.metrics import mean_squared_error  # 添加这一行
from sklearn.metrics import mean_absolute_error  # 添加这一行


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def preprocess_data(df):
    df['ds'] = pd.to_datetime(df['date'])
    df.set_index('ds', inplace=True)
    df = df.ffill()
    
    features = ['temp', 'oxygen', 'NH3', 'TP', 'TN', 'algae']
    df_features = df[features]
    
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df_features), columns=features, index=df.index)
    
    # Add lagged temperature
    df_scaled['temp_lag'] = df_scaled['temp'].shift(1)
    df_scaled.dropna(inplace=True)
    
    return df_scaled, features, scaler

def prepare_prophet_data(df):
    prophet_df = df.reset_index()
    prophet_df = prophet_df.rename(columns={'ds': 'ds', 'algae': 'y'})
    return prophet_df

def train_prophet_model(train_df, use_gpu=True):
    if use_gpu and torch.cuda.is_available():
        logging.info("Using GPU for training")
        os.environ['PROPHET_USE_GPU'] = 'true'
    else:
        logging.info("Using CPU for training")
        os.environ['PROPHET_USE_GPU'] = 'false'

    model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=True
    )
    
    # Add regressors
    for col in train_df.columns:
        if col not in ['ds', 'y']:
            model.add_regressor(col)
    
    model.fit(train_df)
    
    return model

def forecast_algae(model, future_df):
    forecast = model.predict(future_df)
    return forecast

def inverse_transform_feature(scaler, data, feature_index):
    dummy = np.zeros((len(data), len(scaler.scale_)))
    dummy[:, feature_index] = data
    return scaler.inverse_transform(dummy)[:, feature_index]

def plot_forecast(original_data, forecast, feature_name):
    plt.figure(figsize=(12, 6))
    plt.plot(original_data.index, original_data, label='Observed')
    plt.plot(forecast['ds'], forecast['yhat'], label='Forecast', color='red')
    plt.fill_between(forecast['ds'], forecast['yhat_lower'], forecast['yhat_upper'], color='red', alpha=0.2)
    
    plt.title(f'{feature_name} Forecast')
    plt.xlabel('Date')
    plt.ylabel(feature_name)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'{feature_name}_forecast2.png')
    plt.close()

def evaluate_forecast(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return rmse, mae

def main():
    try:
        df = pd.read_csv('/root/Download/AlgaeBloomForecast/smoothed_data.csv', encoding='utf-8')
        logging.info("Successfully read the file")
        df = df.fillna(method='ffill')  # 使用前向填充方法填充NaN值
        
        df_scaled, features, scaler = preprocess_data(df)
        prophet_df = prepare_prophet_data(df_scaled)
        
        # Split data into train and test sets
        train_size = int(len(prophet_df) * 0.8)
        train_df = prophet_df[:train_size]
        test_df = prophet_df[train_size:]
        
        # Train the model
        model = train_prophet_model(train_df)
        
        # Forecast for the test period
        future_df = model.make_future_dataframe(periods=len(test_df))
        for col in prophet_df.columns:
            if col not in ['ds', 'y']:
                future_df[col] = prophet_df[col].reindex(future_df.index)
        
        forecast = forecast_algae(model, future_df)
        
        # Inverse transform the forecast
        algae_index = features.index('algae')
        forecast['yhat'] = inverse_transform_feature(scaler, forecast['yhat'].values, algae_index)
        forecast['yhat_lower'] = inverse_transform_feature(scaler, forecast['yhat_lower'].values, algae_index)
        forecast['yhat_upper'] = inverse_transform_feature(scaler, forecast['yhat_upper'].values, algae_index)
        
        # Evaluate the forecast
        test_forecast = forecast.iloc[-len(test_df):]
        y_true = df['algae'].iloc[-len(test_df):].values
        y_pred = test_forecast['yhat'].values
        
        rmse, mae = evaluate_forecast(y_true, y_pred)
        logging.info(f"\nForecast Accuracy:")
        logging.info(f"RMSE: {rmse:.4f}")
        logging.info(f"MAE: {mae:.4f}")
        
        logging.info("\nAlgae bloom forecast for the next 30 days:")
        logging.info(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(30))
        
        # Plot the forecast
        original_algae = df['algae']
        plot_forecast(original_algae, forecast, 'Algae Bloom')
        logging.info("Forecast plot saved as 'Algae_Bloom_forecast2.png'")
        
    except Exception as e:
        logging.error(f"An error occurred in main: {str(e)}")
        import traceback
        logging.error(traceback.format_exc())

if __name__ == "__main__":
    main()

2024-08-15 17:01:39,149 - INFO - Successfully read the file
  df = df.fillna(method='ffill')  # 使用前向填充方法填充NaN值
2024-08-15 17:01:39,156 - INFO - Using GPU for training
2024-08-15 17:01:39,173 - DEBUG - input tempfile: /tmp/tmpt9zachbo/tfikytze.json
2024-08-15 17:01:39,199 - DEBUG - input tempfile: /tmp/tmpt9zachbo/_fg2vhr0.json
2024-08-15 17:01:39,200 - DEBUG - idx 0
2024-08-15 17:01:39,200 - DEBUG - running CmdStan, num_threads: None
2024-08-15 17:01:39,201 - DEBUG - CmdStan args: ['/root/.conda/envs/hydro/envs/algae/lib/python3.12/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=69106', 'data', 'file=/tmp/tmpt9zachbo/tfikytze.json', 'init=/tmp/tmpt9zachbo/_fg2vhr0.json', 'output', 'file=/tmp/tmpt9zachbo/prophet_modelwwbjm5n2/prophet_model-20240815170139.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
17:01:39 - cmdstanpy - INFO - Chain [1] start processing
2024-08-15 17:01:39,201 - INFO - Chain [1] start processing
17:01:39 - cmdstanpy - INFO - Chain [1] 