# Transformers for multivariate energy forecasting
## Descriptive data exploration
### Part 1 - Daily Data

In [None]:
import pandas as pd
# Read the daily wholesale electricity price CSV into a DataFrame.
csv_path = '../data/european_wholesale_electricity_price_data_daily-5.csv'
df = pd.read_csv(csv_path)

#### Data overview

In [3]:
# Preview the first few rows to get a sense of what the data looks like.
print("First few rows of the DataFrame:", df.head(), sep="\n")

First few rows of the DataFrame:
   Country ISO3 Code        Date  Price (EUR/MWhe)
0  Austria       AUT  2015-01-01             35.86
1  Austria       AUT  2015-01-02             35.86
2  Austria       AUT  2015-01-03             35.86
3  Austria       AUT  2015-01-04             35.86
4  Austria       AUT  2015-01-05             36.18


#### Data format

In [10]:
# Inspect the dtype pandas inferred for each column.
column_dtypes = df.dtypes
print("Original Data Types:\n", column_dtypes)

Original Data Types:
 Country              object
ISO3 Code            object
Date                 object
Price (EUR/MWhe)    float64
dtype: object


In [12]:
# Parse the 'Date' column into pandas datetime values, replacing the column.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

# Show the frame and its dtypes to confirm the conversion took effect.
for heading, shown in (
    ("\nAfter Conversion:\n", df),
    ("\nData Types after conversion:\n", df.dtypes),
):
    print(heading, shown)


After Conversion:
        Country ISO3 Code       Date  Price (EUR/MWhe)
0      Austria       AUT 2015-01-01             35.86
1      Austria       AUT 2015-01-02             35.86
2      Austria       AUT 2015-01-03             35.86
3      Austria       AUT 2015-01-04             35.86
4      Austria       AUT 2015-01-05             36.18
...        ...       ...        ...               ...
92052   Sweden       SWE 2024-03-27             47.68
92053   Sweden       SWE 2024-03-28             38.01
92054   Sweden       SWE 2024-03-29             24.80
92055   Sweden       SWE 2024-03-30             40.86
92056   Sweden       SWE 2024-03-31             44.07

[92057 rows x 4 columns]

Data Types after conversion:
 Country                     object
ISO3 Code                   object
Date                datetime64[ns]
Price (EUR/MWhe)           float64
dtype: object


The `ns` suffix in `datetime64[ns]` stands for nanoseconds, the resolution at which the datetime values are stored. This allows datetime data to be represented with a precision of up to one nanosecond, which is particularly useful for high-frequency data such as financial time series; for this daily dataset that level of precision is not needed, but it does no harm.

#### Number of observations and number of distinct entries

In [6]:
# Total number of observations (rows) in the dataset.
row_count = df.shape[0]
print("Number of rows:", row_count)

Number of rows: 92057


In [7]:
# How many different countries appear in the data?
country_column = df['Country']
distinct_names = country_column.nunique()
print("Number of distinct names:", distinct_names)

Number of distinct names: 29


In [8]:
# Count distinct dates.
# Stored under its own name so the country count held in `distinct_names`
# is not overwritten, and printed with a label that matches what is counted.
distinct_dates = df['Date'].nunique()
print("Number of distinct dates:", distinct_dates)

Number of distinct names: 3378


In [14]:
# Derive a calendar-year column from 'Date' via the .dt accessor.
observation_years = df['Date'].dt.year
df['year'] = observation_years

# Collect the unique years as a plain Python list and display it.
distinct_years = df['year'].unique().tolist()
distinct_years

[2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]

#### Price

In [4]:
# Summary statistics for the price column only.
# Selecting the column explicitly keeps this cell independent of execution
# order: a bare df.describe() would also summarise the numeric `year` helper
# column (and, depending on the pandas version, the datetime `Date` column)
# once those have been created by other cells.
price_summary = df[['Price (EUR/MWhe)']].describe()
print("\nSummary statistics for numeric column price:")
print(price_summary)


Summary statistics for numeric column price:
       Price (EUR/MWhe)
count      92057.000000
mean          75.255330
std           77.895849
min         -202.290000
25%           34.980000
50%           48.160000
75%           79.790000
max          888.090000


#### Missing values

In [5]:
# Count missing (NaN/NaT/None) entries in each column.
missing_per_column = df.isna().sum()
print("\nCheck for missing values:")
print(missing_per_column)


Check for missing values:
Country             0
ISO3 Code           0
Date                0
Price (EUR/MWhe)    0
dtype: int64
