In [1]:
!pip uninstall -y numpy scipy autogluon autogluon.timeseries
!pip install numpy==1.23.5 scipy==1.9.3
!pip install autogluon.timeseries

Found existing installation: numpy 1.23.5
Uninstalling numpy-1.23.5:
  Successfully uninstalled numpy-1.23.5
Found existing installation: scipy 1.9.3
Uninstalling scipy-1.9.3:
  Successfully uninstalled scipy-1.9.3
[0mCollecting numpy==1.23.5
  Using cached numpy-1.23.5-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.3 kB)
Collecting scipy==1.9.3
  Using cached scipy-1.9.3-cp311-cp311-macosx_12_0_arm64.whl.metadata (53 kB)
Using cached numpy-1.23.5-cp311-cp311-macosx_11_0_arm64.whl (13.3 MB)
Using cached scipy-1.9.3-cp311-cp311-macosx_12_0_arm64.whl (28.4 MB)
Installing collected packages: numpy, scipy
Successfully installed numpy-1.23.5 scipy-1.9.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting autogluon.timeseries
  Using cached autogluon.timeseries-1.1.1-py3-none-any.whl.met

In [30]:
import pandas as pd
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame

# Load your preprocessed dataset
file_path = 'prediction_dataset.csv'  # Replace with your actual path
df = pd.read_csv(file_path)

# Make an id column (ascending integer) and an item_id column (item 1 for all rows)
df['id'] = range(1, len(df) + 1)
df['item_id'] = 1

# Rename spot_price to target
df.rename(columns={'spot_price': 'target'}, inplace=True)

# Convert datetime_utc to datetime64 dtype and rename to timestamp
df['timestamp'] = pd.to_datetime(df['datetime_utc'], format='ISO8601', errors='coerce')
df = df.drop('datetime_utc', axis=1)

# Check for any NaT values after conversion
if df['timestamp'].isna().any():
    print("Warning: Some datetime values could not be parsed. Please check your data.")
    print(f"Number of NaT values: {df['timestamp'].isna().sum()}")

# Convert to UTC and remove timezone information
df['timestamp'] = df['timestamp'].dt.tz_convert('UTC').dt.tz_localize(None)

# Print dtype information for debugging
print(f"Timestamp column dtype: {df['timestamp'].dtype}")
print(f"Sample values:\n{df['timestamp'].head()}")

# Ensure timestamp column is of dtype datetime64
if not pd.api.types.is_datetime64_dtype(df['timestamp']):
    raise ValueError("The 'timestamp' column must be of dtype datetime64.")


# Convert to TimeSeriesDataFrame
df_tsd = TimeSeriesDataFrame.from_data_frame(
    df,
    id_column='id',
    timestamp_column='timestamp'
)

# Split into training and testing data
train_size = int(len(df_tsd) * 0.8)
split_timestamp = df_tsd.index.get_level_values(1)[train_size]

train_data = df_tsd[df_tsd.index.get_level_values(1) < split_timestamp]
test_data = df_tsd[df_tsd.index.get_level_values(1) >= split_timestamp]

# Setup and train the TimeSeriesPredictor
predictor = TimeSeriesPredictor(
    path='autogluon_checkpoints',  # Directory to store checkpoints
    prediction_length=1,  # Predict 1 day ahead
    eval_metric='RMSE',  # Choose evaluation metric
    freq='D',
)

# Fit the model with time limit and desired presets
predictor.fit(
    train_data=train_data, 
    presets='best_quality',  # Use 'best_quality' for higher accuracy
    time_limit=3600  # Set a time limit of 1 hour
)

# Generate predictions on test data
predictions = predictor.predict(test_data)

# Save predictions
predictions.to_csv('autogluon_predictions.csv')

# Leaderboard of models
leaderboard = predictor.leaderboard()
print(leaderboard)


Renaming existing column 'item_id' -> '__item_id' to avoid name collisions.


Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to 'autogluon_checkpoints'
AutoGluon Version:  1.1.1
Python Version:     3.11.9
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.0.0: Mon Aug 12 20:51:54 PDT 2024; root:xnu-11215.1.10~2/RELEASE_ARM64_T6000
CPU Count:          10
GPU Count:          0
Memory Avail:       3.61 GB / 16.00 GB (22.5%)
Disk Space Avail:   179.40 GB / 926.35 GB (19.4%)
Setting presets to: best_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': RMSE,
 'freq': 'D',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 2,
 'prediction_length': 1,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'target',
 'time_limit': 3600,
 'verbosity': 2}

train_data with frequency 'None' has been resampled to frequency

Timestamp column dtype: datetime64[ns]
Sample values:
0   2016-01-30
1   2016-01-31
2   2016-02-01
3   2016-02-02
4   2016-02-03
Name: timestamp, dtype: datetime64[ns]


Provided train_data has 766 rows, 766 time series. Median time series length is 1 (min=1, max=1). 
Time series in train_data are too short for chosen num_val_windows=2. Reducing num_val_windows to 1.
	Removing 766 short time series from train_data. Only series with length >= 6 will be used for training.
	After filtering, train_data has 0 rows, 0 time series. Median time series length is nan (min=nan, max=nan). 


ValueError: At least some time series in train_data must have >= 6 observations. Please provide longer time series as train_data or reduce prediction_length, num_val_windows, or val_step_size.