In [12]:
from datetime import datetime

from dataset import LoadSpotDataset

In [13]:
# Build the project's dataset loader. The two arguments are presumably the
# config file path and the data directory — confirm against LoadSpotDataset.
lsd = LoadSpotDataset("config.yaml", "data")

# load_data() returns two objects, unpacked here as the spot price history and
# the per-instance metadata table (both are inspected as DataFrames further down).
prices_df, instance_info_df = lsd.load_data()

In [14]:
# Split the price history into train / validation / test frames using a 0.7
# train ratio and a 0.15 validation ratio; the remainder presumably becomes the
# test set — confirm in LoadSpotDataset.get_training_validation_test_split.
# NOTE(review): in the recorded outputs the training set ends on the same
# timestamp the validation set starts on, and validation/test share a boundary
# timestamp as well — confirm that cutting inside a single timestamp is intended
# and does not leak information between the sets.
train_df, val_df, test_df = lsd.get_training_validation_test_split(prices_df, train_ratio=0.7, val_ratio=0.15)

In [15]:
# Write every frame to data/<variable name>.pkl: the two loaded tables first,
# then the three splits (same files and same order as one call per frame).
frames_to_save = {
    'prices_df': prices_df,
    'instance_info_df': instance_info_df,
    'train_df': train_df,
    'val_df': val_df,
    'test_df': test_df,
}
for frame_name, frame in frames_to_save.items():
    frame.to_pickle(f'data/{frame_name}.pkl')

# Stamp the run with the current local wall-clock time (naive datetime).
print(f"Created on {datetime.now()}")

Created on 2025-01-29 11:43:07.172998


In [16]:
def display_df_stats(df, name: str) -> None:
    """Print a quick summary of a DataFrame.

    The summary consists of a header, the frame's shape, the output of
    ``df.info()``, the first rows (``df.head()``) and, when the frame has a
    ``price_timestamp`` column, the covered date range.

    Args:
        df: pandas DataFrame to summarise.
        name: Label inserted into the printed section header.

    Returns:
        None. All output goes to stdout / the notebook's rich display.

    Notes:
        The day count is the whole-day component of ``max - min``; any
        remaining hours are dropped.
    """
    # ``display`` only exists as a builtin inside an IPython kernel. Resolve it
    # explicitly so the helper also works when called from a plain Python
    # process, falling back to ``print`` when IPython is not installed.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    print(f"\n=== {name} Statistics ===")
    print("\nShape:", df.shape)
    print("\nInfo:")
    df.info()
    print("\nSample Data:")
    show(df.head())

    if 'price_timestamp' not in df.columns:
        return

    timestamps = df['price_timestamp']
    # count() ignores missing values; with no valid timestamp min()/max() would
    # be NaT and the reported range would be meaningless, so say so explicitly.
    if timestamps.count() == 0:
        print("\nDate Range: n/a (no timestamps)")
        return

    start_date = timestamps.min()
    end_date = timestamps.max()
    days = (end_date - start_date).days
    print(f"\nDate Range: {start_date} to {end_date} ({days} days)")

In [17]:
# Summarise the full price history returned by the loader.
display_df_stats(df=prices_df, name="Prices DataFrame")


=== Prices DataFrame Statistics ===

Shape: (8001854, 3)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8001854 entries, 0 to 8001853
Data columns (total 3 columns):
 #   Column           Dtype              
---  ------           -----              
 0   price_timestamp  datetime64[ns, UTC]
 1   id_instance      int64              
 2   spot_price       float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(1)
memory usage: 183.1 MB

Sample Data:


Unnamed: 0,price_timestamp,id_instance,spot_price
0,2024-04-01 00:00:00+00:00,8364,0.0438
1,2024-04-01 00:00:00+00:00,44263,2.2684
2,2024-04-01 00:00:00+00:00,39658,1.0328
3,2024-04-01 00:00:00+00:00,19681,12.5172
4,2024-04-01 00:00:00+00:00,39617,7.8889



Date Range: 2024-04-01 00:00:00+00:00 to 2024-10-20 00:00:00+00:00 (202 days)


In [18]:
# Summarise the instance metadata table (no price_timestamp column, so no date range is printed).
display_df_stats(df=instance_info_df, name="Instance Info DataFrame")


=== Instance Info DataFrame Statistics ===

Shape: (6996, 12)

Info:
<class 'pandas.core.frame.DataFrame'>
Index: 6996 entries, 48207 to 57869897
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   region               6996 non-null   object 
 1   av_zone              6996 non-null   object 
 2   instance_type        6996 non-null   object 
 3   instance_family      6996 non-null   object 
 4   generation           6996 non-null   int64  
 5   modifiers            6996 non-null   object 
 6   size                 6996 non-null   object 
 7   vcpu                 6996 non-null   int64  
 8   memory               6996 non-null   int64  
 9   architectures        6996 non-null   object 
 10  product_description  6996 non-null   object 
 11  on_demand_price      6984 non-null   float64
dtypes: float64(1), int64(3), object(8)
memory usage: 710.5+ KB

Sample Data:


Unnamed: 0_level_0,region,av_zone,instance_type,instance_family,generation,modifiers,size,vcpu,memory,architectures,product_description,on_demand_price
id_instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
48207,us-east-1,b,c6a.8xlarge,c,6,[a],8xlarge,32,65536,[x86_64],Linux/UNIX,1.224
6910,us-east-1,c,c6a.8xlarge,c,6,[a],8xlarge,32,65536,[x86_64],Linux/UNIX,1.224
32082,us-east-1,f,c6a.8xlarge,c,6,[a],8xlarge,32,65536,[x86_64],Linux/UNIX,1.224
39235,us-east-1,a,c6a.8xlarge,c,6,[a],8xlarge,32,65536,[x86_64],Linux/UNIX,1.224
17992,us-east-1,d,c6a.8xlarge,c,6,[a],8xlarge,32,65536,[x86_64],Linux/UNIX,1.224


In [19]:
# Summarise the training split.
display_df_stats(df=train_df, name="Training Set")


=== Training Set Statistics ===

Shape: (5601297, 3)

Info:
<class 'pandas.core.frame.DataFrame'>
Index: 5601297 entries, 0 to 5597263
Data columns (total 3 columns):
 #   Column           Dtype              
---  ------           -----              
 0   price_timestamp  datetime64[ns, UTC]
 1   id_instance      int64              
 2   spot_price       float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(1)
memory usage: 170.9 MB

Sample Data:


Unnamed: 0,price_timestamp,id_instance,spot_price
0,2024-04-01 00:00:00+00:00,8364,0.0438
2487,2024-04-01 00:00:00+00:00,4956,2.0543
2488,2024-04-01 00:00:00+00:00,17748,0.4527
2489,2024-04-01 00:00:00+00:00,51272,0.7348
2490,2024-04-01 00:00:00+00:00,27192,0.4632



Date Range: 2024-04-01 00:00:00+00:00 to 2024-08-21 20:00:00+00:00 (142 days)


In [20]:
# Summarise the validation split.
display_df_stats(df=val_df, name="Validation Set")


=== Validation Set Statistics ===

Shape: (1200278, 3)

Info:
<class 'pandas.core.frame.DataFrame'>
Index: 1200278 entries, 5597262 to 6798918
Data columns (total 3 columns):
 #   Column           Non-Null Count    Dtype              
---  ------           --------------    -----              
 0   price_timestamp  1200278 non-null  datetime64[ns, UTC]
 1   id_instance      1200278 non-null  int64              
 2   spot_price       1200278 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(1)
memory usage: 36.6 MB

Sample Data:


Unnamed: 0,price_timestamp,id_instance,spot_price
5597262,2024-08-21 20:00:00+00:00,15518,4.0701
5597261,2024-08-21 20:00:00+00:00,28478,9.0278
5597260,2024-08-21 20:00:00+00:00,31944,0.5584
5597259,2024-08-21 20:00:00+00:00,48974,3.4798
5597258,2024-08-21 20:00:00+00:00,16265,0.0539



Date Range: 2024-08-21 20:00:00+00:00 to 2024-09-20 16:00:00+00:00 (29 days)


In [21]:
# Summarise the test split.
display_df_stats(df=test_df, name="Test Set")


=== Test Set Statistics ===

Shape: (1200279, 3)

Info:
<class 'pandas.core.frame.DataFrame'>
Index: 1200279 entries, 6798917 to 8001853
Data columns (total 3 columns):
 #   Column           Non-Null Count    Dtype              
---  ------           --------------    -----              
 0   price_timestamp  1200279 non-null  datetime64[ns, UTC]
 1   id_instance      1200279 non-null  int64              
 2   spot_price       1200279 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(1)
memory usage: 36.6 MB

Sample Data:


Unnamed: 0,price_timestamp,id_instance,spot_price
6798917,2024-09-20 16:00:00+00:00,10179,0.0254
6798916,2024-09-20 16:00:00+00:00,33410,0.1247
6798915,2024-09-20 16:00:00+00:00,28896,3.0282
6798914,2024-09-20 16:00:00+00:00,23516,2.207
6798913,2024-09-20 16:00:00+00:00,33074,1.5232



Date Range: 2024-09-20 16:00:00+00:00 to 2024-10-20 00:00:00+00:00 (29 days)
