In [1]:
from datetime import datetime

from dataset import LoadSpotDataset

# Data loading

In [2]:
# Build the project's dataset loader and load the data used by this notebook.
# NOTE(review): the two arguments look like a config file path and a data
# directory — confirm against LoadSpotDataset in dataset.py.
lsd = LoadSpotDataset("config.yaml", "data")

# load_data() is unpacked into two frames: the spot-price history and the
# per-instance metadata table (both are summarised further down).
prices_df, instance_info_df = lsd.load_data()

In [3]:
# Modifications performed to the dataframes (cuts, etc)
#
# 1. Collect the ids of instances in family "c" whose size does not contain
#    "metal" (instance_info_df is indexed by id_instance).
# 2. Keep only the price rows of the single instance studied in this notebook,
#    provided it belongs to that set.

# Instance whose spot-price series is analysed; change it here to study another.
TARGET_INSTANCE_ID = 48207

compute_instances = instance_info_df[
    (instance_info_df["instance_family"] == "c")
    # Plain substring match (no regex needed); na=False counts a missing size
    # as "not metal" instead of yielding NaN, which `~` cannot invert.
    & (~instance_info_df["size"].str.contains("metal", regex=False, na=False))
].index

# One combined mask instead of two successive reassignments; .copy() detaches
# the result from the loaded frame so later column assignments do not trigger
# SettingWithCopyWarning.
prices_df = prices_df[
    prices_df["id_instance"].isin(compute_instances)
    & (prices_df["id_instance"] == TARGET_INSTANCE_ID)
].copy()

# Fail here rather than silently splitting and pickling an empty frame.
assert not prices_df.empty, (
    f"No price rows left for instance {TARGET_INSTANCE_ID}; "
    "check that it is a non-metal 'c' family instance present in prices_df"
)

In [4]:
# Split the filtered price series into train / validation / test frames.
# 70% and 15% of the rows are requested for train and validation; per the
# shapes recorded further down, the remaining rows end up in the test frame.
# NOTE(review): the recorded date ranges are consecutive (train, then
# validation, then test), which suggests a chronological split — confirm in
# LoadSpotDataset.get_training_validation_test_split.
train_df, val_df, test_df = lsd.get_training_validation_test_split(
    prices_df, train_ratio=0.7, val_ratio=0.15
)

In [5]:
# Persist every frame built above as a pickle, in the same order as before:
# the two full frames first, then the three splits.
for frame, target_path in (
    (prices_df, "data/prices_df.pkl"),
    (instance_info_df, "data/instance_info_df.pkl"),
    (train_df, "data/train_df.pkl"),
    (val_df, "data/val_df.pkl"),
    (test_df, "data/test_df.pkl"),
):
    frame.to_pickle(target_path)

# Timestamp the export so the recorded output shows when the files were written.
print(f"Created on {datetime.now()}")

Created on 2025-02-13 16:04:40.143671


# Information on dataframes

In [6]:
def display_df_stats(df, name):
    """Print a quick overview of ``df`` under the heading ``name``.

    Emits, in order: a banner containing ``name``, the frame's shape, the
    ``DataFrame.info()`` summary and a rich display of the first rows.
    When the frame has a ``price_timestamp`` column, the earliest and latest
    timestamps and the whole number of days between them are printed as well.

    Relies on ``display`` being available in the namespace (as it is inside
    a notebook kernel). Returns ``None``.
    """
    print(f"\n=== {name} Statistics ===")
    print("\nShape:", df.shape)

    print("\nInfo:")
    df.info()

    print("\nSample Data:")
    display(df.head())

    # Frames without a timestamp column have no date range to report.
    if "price_timestamp" not in df.columns:
        return

    timestamps = df["price_timestamp"]
    start_date, end_date = timestamps.min(), timestamps.max()
    # .days truncates the span to whole days.
    days = (end_date - start_date).days
    print(f"\nDate Range: {start_date} to {end_date} ({days} days)")

In [7]:
# Get start and end dates for train_df
train_start_date = train_df["price_timestamp"].min()
train_end_date = train_df["price_timestamp"].max()
train_days = (train_end_date - train_start_date).days

# Get start and end dates for val_df
val_start_date = val_df["price_timestamp"].min()
val_end_date = val_df["price_timestamp"].max()
val_days = (val_end_date - val_start_date).days

# Get start and end dates for test_df
test_start_date = test_df["price_timestamp"].min()
test_end_date = test_df["price_timestamp"].max()
test_days = (test_end_date - test_start_date).days

print(
    f"Train DataFrame: Start Date = {train_start_date}, End Date = {train_end_date}, Number of Days = {train_days}"
)
print(
    f"Validation DataFrame: Start Date = {val_start_date}, End Date = {val_end_date}, Number of Days = {val_days}"
)
print(
    f"Test DataFrame: Start Date = {test_start_date}, End Date = {test_end_date}, Number of Days = {test_days}"
)

Train DataFrame: Start Date = 2024-04-01 04:00:00+00:00, End Date = 2024-10-13 12:00:00+00:00, Number of Days = 195
Validation DataFrame: Start Date = 2024-10-13 16:00:00+00:00, End Date = 2024-11-24 08:00:00+00:00, Number of Days = 41
Test DataFrame: Start Date = 2024-11-24 12:00:00+00:00, End Date = 2025-01-06 00:00:00+00:00, Number of Days = 42


## All prices dataframe

In [8]:
# Overview of the price frame after it was narrowed to a single instance id.
display_df_stats(df=prices_df, name="Prices DataFrame")


=== Prices DataFrame Statistics ===

Shape: (1703, 3)

Info:
<class 'pandas.core.frame.DataFrame'>
Index: 1703 entries, 5803 to 11214247
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   price_timestamp  1703 non-null   datetime64[ns, UTC]
 1   id_instance      1703 non-null   int64              
 2   spot_price       1703 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(1)
memory usage: 53.2 KB

Sample Data:


Unnamed: 0,price_timestamp,id_instance,spot_price
5803,2024-04-01 04:00:00+00:00,48207,0.6062
11704,2024-04-01 08:00:00+00:00,48207,0.6047
17994,2024-04-01 12:00:00+00:00,48207,0.6061
24456,2024-04-01 16:00:00+00:00,48207,0.6061
30833,2024-04-01 20:00:00+00:00,48207,0.6071



Date Range: 2024-04-01 04:00:00+00:00 to 2025-01-06 00:00:00+00:00 (279 days)


## Instance info dataframe

In [9]:
# Overview of the instance metadata; it has no price_timestamp column, so no
# date range is printed for it.
display_df_stats(df=instance_info_df, name="Instance Info DataFrame")


=== Instance Info DataFrame Statistics ===

Shape: (6996, 12)

Info:
<class 'pandas.core.frame.DataFrame'>
Index: 6996 entries, 48207 to 57869897
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   region               6996 non-null   object 
 1   av_zone              6996 non-null   object 
 2   instance_type        6996 non-null   object 
 3   instance_family      6996 non-null   object 
 4   generation           6996 non-null   int64  
 5   modifiers            6996 non-null   object 
 6   size                 6996 non-null   object 
 7   vcpu                 6996 non-null   int64  
 8   memory               6996 non-null   int64  
 9   architectures        6996 non-null   object 
 10  product_description  6996 non-null   object 
 11  on_demand_price      6984 non-null   float64
dtypes: float64(1), int64(3), object(8)
memory usage: 710.5+ KB

Sample Data:


Unnamed: 0_level_0,region,av_zone,instance_type,instance_family,generation,modifiers,size,vcpu,memory,architectures,product_description,on_demand_price
id_instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
48207,us-east-1,b,c6a.8xlarge,c,6,[a],8xlarge,32,65536,[x86_64],Linux/UNIX,1.224
6910,us-east-1,c,c6a.8xlarge,c,6,[a],8xlarge,32,65536,[x86_64],Linux/UNIX,1.224
32082,us-east-1,f,c6a.8xlarge,c,6,[a],8xlarge,32,65536,[x86_64],Linux/UNIX,1.224
39235,us-east-1,a,c6a.8xlarge,c,6,[a],8xlarge,32,65536,[x86_64],Linux/UNIX,1.224
17992,us-east-1,d,c6a.8xlarge,c,6,[a],8xlarge,32,65536,[x86_64],Linux/UNIX,1.224


## Training dataframe

In [10]:
# Overview of the training split, including the date range it covers.
display_df_stats(df=train_df, name="Training Set")


=== Training Set Statistics ===

Shape: (1192, 3)

Info:
<class 'pandas.core.frame.DataFrame'>
Index: 1192 entries, 5803 to 7732296
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   price_timestamp  1192 non-null   datetime64[ns, UTC]
 1   id_instance      1192 non-null   int64              
 2   spot_price       1192 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(1)
memory usage: 37.2 KB

Sample Data:


Unnamed: 0,price_timestamp,id_instance,spot_price
5803,2024-04-01 04:00:00+00:00,48207,0.6062
11704,2024-04-01 08:00:00+00:00,48207,0.6047
17994,2024-04-01 12:00:00+00:00,48207,0.6061
24456,2024-04-01 16:00:00+00:00,48207,0.6061
30833,2024-04-01 20:00:00+00:00,48207,0.6071



Date Range: 2024-04-01 04:00:00+00:00 to 2024-10-13 12:00:00+00:00 (195 days)


## Validation dataframe

In [11]:
# Overview of the validation split, including the date range it covers.
display_df_stats(df=val_df, name="Validation Set")


=== Validation Set Statistics ===

Shape: (255, 3)

Info:
<class 'pandas.core.frame.DataFrame'>
Index: 255 entries, 7739127 to 9439407
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   price_timestamp  255 non-null    datetime64[ns, UTC]
 1   id_instance      255 non-null    int64              
 2   spot_price       255 non-null    float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(1)
memory usage: 8.0 KB

Sample Data:


Unnamed: 0,price_timestamp,id_instance,spot_price
7739127,2024-10-13 16:00:00+00:00,48207,0.442
7745938,2024-10-13 20:00:00+00:00,48207,0.442
7752748,2024-10-14 00:00:00+00:00,48207,0.4411
7759544,2024-10-14 04:00:00+00:00,48207,0.4374
7766339,2024-10-14 08:00:00+00:00,48207,0.4374



Date Range: 2024-10-13 16:00:00+00:00 to 2024-11-24 08:00:00+00:00 (41 days)


## Test dataframe

In [12]:
# Overview of the test split, including the date range it covers.
display_df_stats(df=test_df, name="Test Set")


=== Test Set Statistics ===

Shape: (256, 3)

Info:
<class 'pandas.core.frame.DataFrame'>
Index: 256 entries, 9446263 to 11214247
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   price_timestamp  256 non-null    datetime64[ns, UTC]
 1   id_instance      256 non-null    int64              
 2   spot_price       256 non-null    float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(1)
memory usage: 8.0 KB

Sample Data:


Unnamed: 0,price_timestamp,id_instance,spot_price
9446263,2024-11-24 12:00:00+00:00,48207,0.4679
9453196,2024-11-24 16:00:00+00:00,48207,0.4664
9460118,2024-11-24 20:00:00+00:00,48207,0.4664
9467055,2024-11-25 00:00:00+00:00,48207,0.4587
9473891,2024-11-25 04:00:00+00:00,48207,0.4587



Date Range: 2024-11-24 12:00:00+00:00 to 2025-01-06 00:00:00+00:00 (42 days)
