In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Console rendering options, applied in order:
#   display.width       -> 300 characters per printed line
#   display.max_columns -> None, i.e. never elide columns
#   display.max_rows    -> 10, keeps printed frames short
for option_name, option_value in (
    ('display.width', 300),
    ('display.max_columns', None),
    ('display.max_rows', 10),
):
    pd.set_option(option_name, option_value)


# Location of the hourly day-ahead price export (semicolon-separated CSV).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
PRICES_CSV = "/Users/olaoluwatunmise/Germany-Energy-Price-Forecast/PowerCast-Datasets/Electricity-Price/Day-ahead_prices_202301010000_202503050000_Hour.csv"

prices = pd.read_csv(PRICES_CSV, sep=';')

# Quick look at the first rows of the raw table
print(prices.head())

             Start date             End date  Germany/Luxembourg [€/MWh] Original resolutions  ∅ DE/LU neighbours [€/MWh] Original resolutions  Belgium [€/MWh] Original resolutions  Denmark 1 [€/MWh] Original resolutions  Denmark 2 [€/MWh] Original resolutions  France [€/MWh] Original resolutions  \
0  Jan 1, 2023 12:00 AM  Jan 1, 2023 1:00 AM                                            -5.17                                            13.85                                 -4.39                                    2.01                                    2.01                                 0.00   
1   Jan 1, 2023 1:00 AM  Jan 1, 2023 2:00 AM                                            -1.07                                             9.79                                 -1.75                                    1.38                                    1.38                                -0.10   
2   Jan 1, 2023 2:00 AM  Jan 1, 2023 3:00 AM                                            -1.47    

# Exploratory Data Analysis

In [2]:
# Parse the "Start date" strings (e.g. "Jan 1, 2023 12:00 AM") into timestamps
prices["Start date"] = pd.to_datetime(prices["Start date"], format="%b %d, %Y %I:%M %p")


def _to_number(column):
    """Convert one price column to numbers; unparseable entries become NaN.

    Text columns have "," removed first, so a value written with a thousands
    separator (e.g. "1,022.27") is parsed as 1022.27 instead of being silently
    coerced to NaN. Columns that are already numeric are passed straight to
    pd.to_numeric.
    """
    if column.dtype == object or isinstance(column.dtype, pd.StringDtype):
        column = column.str.replace(",", "", regex=False)
    return pd.to_numeric(column, errors="coerce")


# "Start date" is moved into the index while converting so that only the price
# columns go through _to_number; reset_index() then restores it as the first
# column. "End date" is dropped because it is not needed for the time series
# analysis; errors="ignore" keeps the cell re-runnable once it is already gone.
prices = (
    prices
    .drop(columns=["End date"], errors="ignore")
    .set_index("Start date")
    .apply(_to_number)
    .reset_index()
)

print(prices.head())

           Start date  Germany/Luxembourg [€/MWh] Original resolutions  ∅ DE/LU neighbours [€/MWh] Original resolutions  Belgium [€/MWh] Original resolutions  Denmark 1 [€/MWh] Original resolutions  Denmark 2 [€/MWh] Original resolutions  France [€/MWh] Original resolutions  \
0 2023-01-01 00:00:00                                            -5.17                                            13.85                                 -4.39                                    2.01                                    2.01                                 0.00   
1 2023-01-01 01:00:00                                            -1.07                                             9.79                                 -1.75                                    1.38                                    1.38                                -0.10   
2 2023-01-01 02:00:00                                            -1.47                                             8.91                                 -1.46         

### Clean the column names

In [3]:
# Normalise the column headers in a single pass. Order matters: each step
# works on the output of the previous one.
prices.columns = (
    prices.columns
    .str.replace(r'[^\w\s]', '', regex=True)               # drop punctuation/symbols
    .str.strip()                                           # trim outer whitespace
    .str.replace('Original resolutions', '', regex=False)  # drop the repeated suffix
    .str.replace(' MWh ', '', regex=False)                 # drop what is left of the unit tag
    .str.replace(' ', '_', regex=False)                    # spaces -> underscores
    .str.replace('_+', '_', regex=True)                    # collapse runs of underscores
    .str.lower()                                           # lowercase for consistency
)


# Show the resulting header names, then the frame itself
print(prices.columns)
print(prices)

Index(['start_date', 'germanyluxembourg', 'delu_neighbours', 'belgium', 'denmark_1', 'denmark_2', 'france', 'netherlands', 'norway_2', 'austria', 'poland', 'sweden_4', 'switzerland', 'czech_republic', 'deatlu', 'northern_italy', 'slovenia', 'hungary'], dtype='object')
               start_date  germanyluxembourg  delu_neighbours  belgium  denmark_1  denmark_2  france  netherlands  norway_2  austria  poland  sweden_4  switzerland  czech_republic  deatlu  northern_italy  slovenia  hungary
0     2023-01-01 00:00:00              -5.17            13.85    -4.39       2.01       2.01    0.00        -3.61    119.32    12.06   18.09      2.01         0.03            4.84     NaN          195.90     13.31    19.76
1     2023-01-01 01:00:00              -1.07             9.79    -1.75       1.38       1.38   -0.10        -1.46    108.83    -0.10    5.75      1.38        -7.25           -0.35     NaN          191.09     -0.07     0.19
2     2023-01-01 02:00:00              -1.47             8.91 

In [4]:
# Columns removed from the price table before the remaining steps
columns_to_drop = ['denmark_2', 'deatlu']
prices = prices.drop(columns_to_drop, axis="columns")

# Confirm the remaining headers and show the reduced frame
print(prices.columns)
print(prices)

Index(['start_date', 'germanyluxembourg', 'delu_neighbours', 'belgium', 'denmark_1', 'france', 'netherlands', 'norway_2', 'austria', 'poland', 'sweden_4', 'switzerland', 'czech_republic', 'northern_italy', 'slovenia', 'hungary'], dtype='object')
               start_date  germanyluxembourg  delu_neighbours  belgium  denmark_1  france  netherlands  norway_2  austria  poland  sweden_4  switzerland  czech_republic  northern_italy  slovenia  hungary
0     2023-01-01 00:00:00              -5.17            13.85    -4.39       2.01    0.00        -3.61    119.32    12.06   18.09      2.01         0.03            4.84          195.90     13.31    19.76
1     2023-01-01 01:00:00              -1.07             9.79    -1.75       1.38   -0.10        -1.46    108.83    -0.10    5.75      1.38        -7.25           -0.35          191.09     -0.07     0.19
2     2023-01-01 02:00:00              -1.47             8.91    -1.46       0.09   -1.33        -1.52    102.39    -0.66    5.27      0.09   

In [5]:
# Replace missing Northern Italy prices with that column's mean.
# NOTE(review): the mean is taken over every row in the frame; confirm this is
# acceptable for the downstream forecasting use of the data.
northern_italy_mean = prices["northern_italy"].mean()
prices["northern_italy"] = prices["northern_italy"].fillna(northern_italy_mean)

# Report what is still missing after the fill
missing_mask = prices.isna()
print(missing_mask.sum())                 # NaN count per column
print(prices[missing_mask.any(axis=1)])   # rows that contain at least one NaN

print(prices)

start_date           0
germanyluxembourg    0
delu_neighbours      0
belgium              0
denmark_1            0
                    ..
switzerland          0
czech_republic       0
northern_italy       0
slovenia             1
hungary              0
Length: 16, dtype: int64
               start_date  germanyluxembourg  delu_neighbours  belgium  denmark_1  france  netherlands  norway_2  austria  poland  sweden_4  switzerland  czech_republic  northern_italy  slovenia  hungary
14562 2024-08-29 19:00:00             227.49           209.27   164.09     219.84  128.18       219.84     39.01   197.76  407.42    222.03       126.34           350.0           245.0       NaN   736.22
               start_date  germanyluxembourg  delu_neighbours  belgium  denmark_1  france  netherlands  norway_2  austria  poland  sweden_4  switzerland  czech_republic  northern_italy  slovenia  hungary
0     2023-01-01 00:00:00              -5.17            13.85    -4.39       2.01    0.00        -3.61    119.

In [6]:
# Take an independent copy of the two relevant columns so the original
# dataframe is preserved
germany_data = prices[['start_date', 'germanyluxembourg']].copy()

# Rename the price column to a shorter name
germany_data = germany_data.rename(columns={"germanyluxembourg": "germany"})

# Explicit .copy(): without it germany_prices is a selection derived from
# germany_data, and assigning new columns to it later can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
germany_prices = germany_data[["start_date", "germany"]].copy()

print(germany_prices.columns)
print(germany_prices)

Index(['start_date', 'germany'], dtype='object')
               start_date  germany
0     2023-01-01 00:00:00    -5.17
1     2023-01-01 01:00:00    -1.07
2     2023-01-01 02:00:00    -1.47
3     2023-01-01 03:00:00    -5.08
4     2023-01-01 04:00:00    -4.49
...                   ...      ...
19051 2025-03-04 19:00:00   163.67
19052 2025-03-04 20:00:00   134.13
19053 2025-03-04 21:00:00   114.38
19054 2025-03-04 22:00:00   114.41
19055 2025-03-04 23:00:00   106.81

[19056 rows x 2 columns]
