In [None]:
from datetime import datetime
import os
import shutil

from dataset import LoadSpotDataset
from utils import get_name

# Set up model directory structure

In [None]:
# Model directory is _models/<name>, with <name> supplied by get_name().
DIR = "_models/" + get_name()
os.makedirs(DIR + "/data", exist_ok=True)

# Keep a copy of the config alongside the model artifacts.
# shutil.copy raises if config.yaml is missing or unreadable, whereas the
# previous `os.system("cp ...")` only returned an exit status that was
# ignored. It also avoids passing DIR through a shell and works on
# platforms without a `cp` binary.
shutil.copy("config.yaml", DIR)

In [None]:
# Build the loader from the config copy stored inside the model directory.
config_path = f"{DIR}/config.yaml"
lsd = LoadSpotDataset(config_path)

# load_data() returns two frames: prices first, instance info second.
prices_df, instance_info_df = lsd.load_data()

In [None]:
# Split fractions passed to the loader. Presumably the remainder after
# train + validation becomes the test set — TODO confirm in LoadSpotDataset.
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15

train_df, val_df, test_df = lsd.get_training_validation_test_split(
    prices_df,
    train_ratio=TRAIN_RATIO,
    val_ratio=VAL_RATIO,
)

In [None]:
# Pickle every frame to <DIR>/data/<variable name>.pkl
frames_to_save = {
    "prices_df": prices_df,
    "instance_info_df": instance_info_df,
    "train_df": train_df,
    "val_df": val_df,
    "test_df": test_df,
}
for frame_name, frame in frames_to_save.items():
    frame.to_pickle(f"{DIR}/data/{frame_name}.pkl")

# Record where and when the snapshot was written.
print(f"Data saved to {DIR}")
print(f"Created on {datetime.now()}")

# Information on dataframes

In [None]:
def display_df_stats(df, name):
    """Print an overview of a DataFrame under a labelled banner.

    Emits, in order: the banner, the shape, ``df.info()``, and the first
    rows via ``display``. When the frame has a ``price_timestamp`` column,
    a final line reports its earliest and latest values and the number of
    days between them.

    Args:
        df: DataFrame to summarise.
        name: Label shown in the banner line.

    Returns:
        None. All results are written to the notebook output.
    """
    print(f"\n=== {name} Statistics ===")
    print("\nShape:", df.shape)
    print("\nInfo:")
    df.info()
    print("\nSample Data:")
    display(df.head())

    # Frames lacking the timestamp column get no date-range line.
    if "price_timestamp" not in df.columns:
        return

    timestamps = df["price_timestamp"]
    first_seen, last_seen = timestamps.min(), timestamps.max()
    span_days = (last_seen - first_seen).days
    print(f"\nDate Range: {first_seen} to {last_seen} ({span_days} days)")

In [None]:
# Earliest/latest timestamp and day span for each split.
# The subtraction + .days assumes price_timestamp is a datetime column — TODO confirm.

# Training split
train_times = train_df["price_timestamp"]
train_start_date, train_end_date = train_times.min(), train_times.max()
train_days = (train_end_date - train_start_date).days

# Validation split
val_times = val_df["price_timestamp"]
val_start_date, val_end_date = val_times.min(), val_times.max()
val_days = (val_end_date - val_start_date).days

# Test split
test_times = test_df["price_timestamp"]
test_start_date, test_end_date = test_times.min(), test_times.max()
test_days = (test_end_date - test_start_date).days

# One summary line per split, in train / validation / test order.
split_spans = (
    ("Train", train_start_date, train_end_date, train_days),
    ("Validation", val_start_date, val_end_date, val_days),
    ("Test", test_start_date, test_end_date, test_days),
)
for split_label, first_ts, last_ts, day_count in split_spans:
    print(
        f"{split_label} DataFrame: Start Date = {first_ts}, "
        f"End Date = {last_ts}, Number of Days = {day_count}"
    )

## All prices dataframe

In [None]:
# Overview of the full price frame, i.e. before the train/validation/test split.
display_df_stats(prices_df, "Prices DataFrame")

## Instance info dataframe

In [None]:
# The date-range line is printed only if this frame has a price_timestamp column.
display_df_stats(instance_info_df, "Instance Info DataFrame")

## Training dataframe

In [None]:
# Overview of the training split returned by get_training_validation_test_split.
display_df_stats(train_df, "Training Set")

## Validation dataframe

In [None]:
# Overview of the validation split returned by get_training_validation_test_split.
display_df_stats(val_df, "Validation Set")

## Test dataframe

In [None]:
# Overview of the test split returned by get_training_validation_test_split.
display_df_stats(test_df, "Test Set")