# Data Preparation

## Imports

In [45]:
import pandas as pd
import numpy as np
import ast
import pytz
import re
import matplotlib.pyplot as plt
import xgboost as xgb 
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, classification_report

import warnings
warnings.filterwarnings("ignore")


## Read files and concatenate ID orderbooks

In [6]:
# 01.01.2024 00:00 - 31.12.2024 23:00 (CET)
da_prices_df = pd.read_parquet('Data/da_prices_2024.parquet')

In [7]:
# 01.01.2024 01:00 - 01.04.2024 21:00 (UCT)
orderbook_q1 = pd.read_parquet('Data/orderbook_q1.parquet')

# 01.04.2024 00:00 - 01.07.2024 21:00 (UCT)
orderbook_q2 = pd.read_parquet('Data/orderbook_q2.parquet')

# 01.07.2024 00:00 - 01.10.2024 21:00 (UCT)
orderbook_q3 = pd.read_parquet('Data/orderbook_q3.parquet')

# 01.10.2024 00:00 - 01.01.2025 22:00 (UCT)
orderbook_q4 = pd.read_parquet('Data/orderbook_q4.parquet')

In [8]:
orderbook_df = pd.concat([orderbook_q1, orderbook_q2, orderbook_q3, orderbook_q4])
orderbook_df.to_parquet('Data/orderbook.parquet')

In [16]:
orderbook_df.dtypes

contractId        object
contractName      object
dlvryStart        object
dlvryEnd          object
dlvryAreaId        int64
marketId          object
area              object
recorded          object
time              object
orderId_bid       object
orderId_ask       object
orderPrice_bid    object
orderPrice_ask    object
orderQty_bid      object
orderQty_ask      object
dtype: object

In [17]:
da_prices_df.dtypes

Delivery Start (CET)     object
Delivery End (CET)       object
NO3 Price (EUR)         float64
dtype: object

## Convert columns to datetime

In [9]:
# The coulumns dlvryStart, dlvryEnd and time are in string format. We need to convert them to datetime format.|
orderbook_df['dlvryStart'] = pd.to_datetime(orderbook_df['dlvryStart'])
orderbook_df['dlvryEnd'] = pd.to_datetime(orderbook_df['dlvryEnd'])
orderbook_df['time'] = pd.to_datetime(orderbook_df['time'])

In [10]:
# Convert string columns to datetime objects (without timezone info initially)
da_prices_df['Delivery Start (CET)'] = pd.to_datetime(da_prices_df['Delivery Start (CET)'], format="%d.%m.%Y %H:%M:%S")
da_prices_df['Delivery End (CET)'] = pd.to_datetime(da_prices_df['Delivery End (CET)'], format="%d.%m.%Y %H:%M:%S")

# Localize to CET timezone
cet = pytz.timezone('CET')
da_prices_df['Delivery Start (CET)'] = da_prices_df['Delivery Start (CET)'].dt.tz_localize(cet, ambiguous='infer')
da_prices_df['Delivery End (CET)'] = da_prices_df['Delivery End (CET)'].dt.tz_localize(cet, ambiguous='infer')

# Convert from CET to UCT
da_prices_df['Delivery Start (UTC)'] = da_prices_df['Delivery Start (CET)'].dt.tz_convert('UTC')
da_prices_df['Delivery End (UTC)'] = da_prices_df['Delivery End (CET)'].dt.tz_convert('UTC')

In [11]:
da_prices_df.dtypes

Delivery Start (CET)    datetime64[ns, CET]
Delivery End (CET)      datetime64[ns, CET]
NO3 Price (EUR)                     float64
Delivery Start (UTC)    datetime64[ns, UTC]
Delivery End (UTC)      datetime64[ns, UTC]
dtype: object

In [11]:
# Remove the CET columns
da_prices_df = da_prices_df.drop(columns=['Delivery Start (CET)', 'Delivery End (CET)'])

In [13]:
orderbook_df

Unnamed: 0,contractId,contractName,dlvryStart,dlvryEnd,dlvryAreaId,marketId,area,recorded,time,orderId_bid,orderId_ask,orderPrice_bid,orderPrice_ask,orderQty_bid,orderQty_ask
0,NX_426231,PH-20240101-03,2024-01-01 01:00:00+00:00,2024-01-01 02:00:00+00:00,10,N_2,NO3,2024-03-06 10:24:54.175 +0000,2023-12-31 23:00:00+00:00,"{'X5385339387','X5385339295','X5385350510','X5...","{'X5385371204','X5385374175','X5385353666','X5...","{2247,2100,2000,2000,1751,1443,1212,1205,1205,...","{2350,2378,2409,2497,2514,2559,2590,2600,2657,...","{500,5000,800,1600,1000,700,5000,5200,800,2200}","{10000,2000,5000,1000,200,500,300,10000,4000,500}"
1,NX_426238,PH-20240101-04,2024-01-01 02:00:00+00:00,2024-01-01 03:00:00+00:00,10,N_2,NO3,2024-03-06 10:24:56.947 +0000,2023-12-31 23:00:00+00:00,"{'X5385372330','X5385372495','X5385371311','X5...","{'X5385372323','X5385372333','X5385373948','X5...","{2061,2061,2051,2000,2000,2000,1920,1467,1000,...","{2251,2251,2262,2299,2300,2433,2447,2448,2551,...","{1900,3000,100,5000,800,1500,400,500,1000,1000}","{200,500,200,3000,10000,500,3600,15000,4000,4000}"
2,NX_426245,PH-20240101-05,2024-01-01 03:00:00+00:00,2024-01-01 04:00:00+00:00,10,N_2,NO3,2024-03-06 10:25:00.755 +0000,2023-12-31 23:00:00+00:00,"{'X5385359141','X5385367235','X5385367212','X5...","{'X5385364822','X5385364828','X5385364821','X5...","{2000,1950,1927,1917,1916,1320,1319,1319,1319,...","{2127,2127,2129,2150,2200,2400,2400,2401,2401,...","{100,100,1200,9000,2000,4500,1600,500,1000,1000}","{200,5000,400,10000,2000,4000,5000,500,15000,400}"
3,NX_426252,PH-20240101-06,2024-01-01 04:00:00+00:00,2024-01-01 05:00:00+00:00,10,N_2,NO3,2024-03-06 10:25:06.307 +0000,2023-12-31 23:00:00+00:00,"{'X5385371584','X5385371589','X5385371590','X5...","{'X5385363190','X5385363118','X5383614807','X5...","{1574,1574,1574,1574,1574,1574,1424,1423,1000,...","{1798,1800,1875,2050,2123,2423,2504,2550,2670,...","{1500,5000,1000,2000,1000,5000,2000,5000,1000,...","{10000,10000,10000,5000,15000,1000,1900,10000,..."
4,NX_426260,PH-20240101-07,2024-01-01 05:00:00+00:00,2024-01-01 06:00:00+00:00,10,N_2,NO3,2024-03-06 10:25:10.161 +0000,2023-12-31 23:00:00+00:00,"{'X5384463871','X5385167819','X5385169379','X5...","{'X5385370820','X5385371596','X5385371593','X5...","{1748,1748,1748,1748,1746,1520,1500,1500,1362,...","{2039,2089,2090,2262,2462,2562,2670,2700,2710,...","{4900,1000,1000,500,5000,1000,2000,1300,5000,5...","{500,1000,10000,15000,10000,1000,5000,5000,100..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2717815,NX_496910,PH-20250101-20,2025-01-01 18:00:00+00:00,2025-01-01 19:00:00+00:00,10,N_2,NO3,2024-12-31 23:43:44.288 +0000,2024-12-31 22:59:00+00:00,"{'X10233448499','X10234051602','X10234052934',...","{'X10232980063','X10228851570','X10232667440',...","{980,828,828,809,743,721,721,658,575,545}","{2775,2780,2780,2810,2810,2870,2900,3500}","{3900,4000,5000,5000,5000,5000,4000,800,2000,200}","{200,8000,5000,5000,5000,10000,2700,10000}"
2717816,NX_496918,PH-20250101-21,2025-01-01 19:00:00+00:00,2025-01-01 20:00:00+00:00,10,N_2,NO3,2024-12-31 23:43:48.436 +0000,2024-12-31 22:59:00+00:00,"{'X10233744528','X10233744531','X10233744825',...","{'X10228851573','X10229998186','X10229998185',...","{513,513,509,500,498,498,497,394,377,376}","{2780,2780,2810,2810,2820,2870,2900,2900,2950,...","{5000,5000,5000,3900,5000,5000,300,2500,5000,6...","{6400,5000,5000,5000,10000,10000,1900,5000,700..."
2717817,NX_496925,PH-20250101-22,2025-01-01 20:00:00+00:00,2025-01-01 21:00:00+00:00,10,N_2,NO3,2024-12-31 23:43:52.589 +0000,2024-12-31 22:59:00+00:00,"{'X10234238392','X10234238451','X10234238459',...","{'X10228851577','X10229959084','X10225892381',...","{645,645,645,645,645,616,616,574,509,505}","{2780,2780,2800,2810,2810,2820,2870,2900,2900,...","{100,5000,5000,5000,3700,5000,5000,5000,5000,3...","{5400,5000,1000,5000,5000,10000,10000,5000,700..."
2717818,NX_496932,PH-20250101-23,2025-01-01 21:00:00+00:00,2025-01-01 22:00:00+00:00,10,N_2,NO3,2024-12-31 23:43:58.555 +0000,2024-12-31 22:59:00+00:00,"{'X10233525336','X10233525816','X10225075176',...","{'X10234235630','X10234235128','X10234235142',...","{305,305,300,190,-70,-1000}","{950,990,990,990,1035,2780,2780,2800,2810,2810}","{4000,1500,5000,5000,10000,10000}","{500,5300,1600,5000,1000,8700,5000,10000,5000,..."


## Merge ID orderbook with DA prices

In [12]:
# Merge the ID orderbook with the DA prices.
df = pd.merge(orderbook_df, da_prices_df, left_on='dlvryStart', right_on='Delivery Start (UTC)', how='left')

In [17]:
df

Unnamed: 0,contractId,contractName,dlvryStart,dlvryEnd,dlvryAreaId,marketId,area,recorded,time,orderId_bid,orderId_ask,orderPrice_bid,orderPrice_ask,orderQty_bid,orderQty_ask,NO3 Price (EUR),Delivery Start (UTC),Delivery End (UTC)
0,NX_426231,PH-20240101-03,2024-01-01 01:00:00+00:00,2024-01-01 02:00:00+00:00,10,N_2,NO3,2024-03-06 10:24:54.175 +0000,2023-12-31 23:00:00+00:00,"{'X5385339387','X5385339295','X5385350510','X5...","{'X5385371204','X5385374175','X5385353666','X5...","{2247,2100,2000,2000,1751,1443,1212,1205,1205,...","{2350,2378,2409,2497,2514,2559,2590,2600,2657,...","{500,5000,800,1600,1000,700,5000,5200,800,2200}","{10000,2000,5000,1000,200,500,300,10000,4000,500}",26.66,2024-01-01 01:00:00+00:00,2024-01-01 02:00:00+00:00
1,NX_426238,PH-20240101-04,2024-01-01 02:00:00+00:00,2024-01-01 03:00:00+00:00,10,N_2,NO3,2024-03-06 10:24:56.947 +0000,2023-12-31 23:00:00+00:00,"{'X5385372330','X5385372495','X5385371311','X5...","{'X5385372323','X5385372333','X5385373948','X5...","{2061,2061,2051,2000,2000,2000,1920,1467,1000,...","{2251,2251,2262,2299,2300,2433,2447,2448,2551,...","{1900,3000,100,5000,800,1500,400,500,1000,1000}","{200,500,200,3000,10000,500,3600,15000,4000,4000}",24.48,2024-01-01 02:00:00+00:00,2024-01-01 03:00:00+00:00
2,NX_426245,PH-20240101-05,2024-01-01 03:00:00+00:00,2024-01-01 04:00:00+00:00,10,N_2,NO3,2024-03-06 10:25:00.755 +0000,2023-12-31 23:00:00+00:00,"{'X5385359141','X5385367235','X5385367212','X5...","{'X5385364822','X5385364828','X5385364821','X5...","{2000,1950,1927,1917,1916,1320,1319,1319,1319,...","{2127,2127,2129,2150,2200,2400,2400,2401,2401,...","{100,100,1200,9000,2000,4500,1600,500,1000,1000}","{200,5000,400,10000,2000,4000,5000,500,15000,400}",24.01,2024-01-01 03:00:00+00:00,2024-01-01 04:00:00+00:00
3,NX_426252,PH-20240101-06,2024-01-01 04:00:00+00:00,2024-01-01 05:00:00+00:00,10,N_2,NO3,2024-03-06 10:25:06.307 +0000,2023-12-31 23:00:00+00:00,"{'X5385371584','X5385371589','X5385371590','X5...","{'X5385363190','X5385363118','X5383614807','X5...","{1574,1574,1574,1574,1574,1574,1424,1423,1000,...","{1798,1800,1875,2050,2123,2423,2504,2550,2670,...","{1500,5000,1000,2000,1000,5000,2000,5000,1000,...","{10000,10000,10000,5000,15000,1000,1900,10000,...",21.23,2024-01-01 04:00:00+00:00,2024-01-01 05:00:00+00:00
4,NX_426260,PH-20240101-07,2024-01-01 05:00:00+00:00,2024-01-01 06:00:00+00:00,10,N_2,NO3,2024-03-06 10:25:10.161 +0000,2023-12-31 23:00:00+00:00,"{'X5384463871','X5385167819','X5385169379','X5...","{'X5385370820','X5385371596','X5385371593','X5...","{1748,1748,1748,1748,1746,1520,1500,1500,1362,...","{2039,2089,2090,2262,2462,2562,2670,2700,2710,...","{4900,1000,1000,500,5000,1000,2000,1300,5000,5...","{500,1000,10000,15000,10000,1000,5000,5000,100...",22.62,2024-01-01 05:00:00+00:00,2024-01-01 06:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10765075,NX_496748,PH-20241231-24,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00,10,N_2,NO3,2024-12-31 21:39:10.554 +0000,2024-12-31 20:55:00+00:00,"{'X10224735588','X10231756122','X10226066681',...","{'X10226066683','X10232275812','X10223438983',...","{1262,1100,885,885,650,640,586,371,370,340}","{1685,1889,1973,2260,2310,2535,2535,2535,2700,...","{4300,6900,5200,1700,1000,200,10000,100,5300,200}","{1500,200,300,2000,200,5000,5000,5000,5000,5000}",12.85,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00
10765100,NX_496748,PH-20241231-24,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00,10,N_2,NO3,2024-12-31 21:39:10.554 +0000,2024-12-31 20:56:00+00:00,"{'X10224735588','X10231756122','X10226066681',...","{'X10226066683','X10232275812','X10223438983',...","{1262,1100,885,885,690,680,586,371,370,344}","{1685,1889,1973,2250,2270,2531,2531,2531,2700,...","{4300,6900,5200,1700,1000,200,10000,100,5300,1...","{1500,200,300,2000,200,5000,5000,5000,5000,5000}",12.85,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00
10765125,NX_496748,PH-20241231-24,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00,10,N_2,NO3,2024-12-31 21:39:10.554 +0000,2024-12-31 20:57:00+00:00,"{'X10224735588','X10231756122','X10226066681',...","{'X10226066683','X10232275812','X10223438983',...","{1262,1100,885,885,690,680,586,381,380,380}","{1685,1889,1973,2250,2270,2531,2531,2531,2700,...","{4300,6900,5200,1700,1000,200,10000,2400,200,800}","{1500,200,300,2000,200,5000,5000,5000,5000,5000}",12.85,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00
10765150,NX_496748,PH-20241231-24,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00,10,N_2,NO3,2024-12-31 21:39:10.554 +0000,2024-12-31 20:58:00+00:00,"{'X10232329532','X10231756122','X10226066681',...","{'X10232331930','X10232326949','X10226066683',...","{1101,1100,885,885,690,680,586,381,380,380}","{1200,1300,1685,1889,1973,2250,2270,2531,2531,...","{400,6900,5200,1700,1000,200,10000,400,200,800}","{10700,10000,1500,200,300,2000,200,5000,5000,5...",12.85,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00


## Remove rows where DA price is NaN
DA prices go a little bit further than the orderbook. Remove these

In [13]:
df = df.dropna(subset=['NO3 Price (EUR)'])

## Write merged orderbook to parquet

In [14]:
df.to_parquet('Data/merged_orderbook.parquet')

## Start here

In [3]:
df = pd.read_parquet('Data/merged_orderbook.parquet')

In [19]:
df

Unnamed: 0,contractId,contractName,dlvryStart,dlvryEnd,dlvryAreaId,marketId,area,recorded,time,orderId_bid,orderId_ask,orderPrice_bid,orderPrice_ask,orderQty_bid,orderQty_ask,NO3 Price (EUR),Delivery Start (UTC),Delivery End (UTC)
0,NX_426231,PH-20240101-03,2024-01-01 01:00:00+00:00,2024-01-01 02:00:00+00:00,10,N_2,NO3,2024-03-06 10:24:54.175 +0000,2023-12-31 23:00:00+00:00,"{'X5385339387','X5385339295','X5385350510','X5...","{'X5385371204','X5385374175','X5385353666','X5...","{2247,2100,2000,2000,1751,1443,1212,1205,1205,...","{2350,2378,2409,2497,2514,2559,2590,2600,2657,...","{500,5000,800,1600,1000,700,5000,5200,800,2200}","{10000,2000,5000,1000,200,500,300,10000,4000,500}",26.66,2024-01-01 01:00:00+00:00,2024-01-01 02:00:00+00:00
1,NX_426238,PH-20240101-04,2024-01-01 02:00:00+00:00,2024-01-01 03:00:00+00:00,10,N_2,NO3,2024-03-06 10:24:56.947 +0000,2023-12-31 23:00:00+00:00,"{'X5385372330','X5385372495','X5385371311','X5...","{'X5385372323','X5385372333','X5385373948','X5...","{2061,2061,2051,2000,2000,2000,1920,1467,1000,...","{2251,2251,2262,2299,2300,2433,2447,2448,2551,...","{1900,3000,100,5000,800,1500,400,500,1000,1000}","{200,500,200,3000,10000,500,3600,15000,4000,4000}",24.48,2024-01-01 02:00:00+00:00,2024-01-01 03:00:00+00:00
2,NX_426245,PH-20240101-05,2024-01-01 03:00:00+00:00,2024-01-01 04:00:00+00:00,10,N_2,NO3,2024-03-06 10:25:00.755 +0000,2023-12-31 23:00:00+00:00,"{'X5385359141','X5385367235','X5385367212','X5...","{'X5385364822','X5385364828','X5385364821','X5...","{2000,1950,1927,1917,1916,1320,1319,1319,1319,...","{2127,2127,2129,2150,2200,2400,2400,2401,2401,...","{100,100,1200,9000,2000,4500,1600,500,1000,1000}","{200,5000,400,10000,2000,4000,5000,500,15000,400}",24.01,2024-01-01 03:00:00+00:00,2024-01-01 04:00:00+00:00
3,NX_426252,PH-20240101-06,2024-01-01 04:00:00+00:00,2024-01-01 05:00:00+00:00,10,N_2,NO3,2024-03-06 10:25:06.307 +0000,2023-12-31 23:00:00+00:00,"{'X5385371584','X5385371589','X5385371590','X5...","{'X5385363190','X5385363118','X5383614807','X5...","{1574,1574,1574,1574,1574,1574,1424,1423,1000,...","{1798,1800,1875,2050,2123,2423,2504,2550,2670,...","{1500,5000,1000,2000,1000,5000,2000,5000,1000,...","{10000,10000,10000,5000,15000,1000,1900,10000,...",21.23,2024-01-01 04:00:00+00:00,2024-01-01 05:00:00+00:00
4,NX_426260,PH-20240101-07,2024-01-01 05:00:00+00:00,2024-01-01 06:00:00+00:00,10,N_2,NO3,2024-03-06 10:25:10.161 +0000,2023-12-31 23:00:00+00:00,"{'X5384463871','X5385167819','X5385169379','X5...","{'X5385370820','X5385371596','X5385371593','X5...","{1748,1748,1748,1748,1746,1520,1500,1500,1362,...","{2039,2089,2090,2262,2462,2562,2670,2700,2710,...","{4900,1000,1000,500,5000,1000,2000,1300,5000,5...","{500,1000,10000,15000,10000,1000,5000,5000,100...",22.62,2024-01-01 05:00:00+00:00,2024-01-01 06:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10765075,NX_496748,PH-20241231-24,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00,10,N_2,NO3,2024-12-31 21:39:10.554 +0000,2024-12-31 20:55:00+00:00,"{'X10224735588','X10231756122','X10226066681',...","{'X10226066683','X10232275812','X10223438983',...","{1262,1100,885,885,650,640,586,371,370,340}","{1685,1889,1973,2260,2310,2535,2535,2535,2700,...","{4300,6900,5200,1700,1000,200,10000,100,5300,200}","{1500,200,300,2000,200,5000,5000,5000,5000,5000}",12.85,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00
10765100,NX_496748,PH-20241231-24,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00,10,N_2,NO3,2024-12-31 21:39:10.554 +0000,2024-12-31 20:56:00+00:00,"{'X10224735588','X10231756122','X10226066681',...","{'X10226066683','X10232275812','X10223438983',...","{1262,1100,885,885,690,680,586,371,370,344}","{1685,1889,1973,2250,2270,2531,2531,2531,2700,...","{4300,6900,5200,1700,1000,200,10000,100,5300,1...","{1500,200,300,2000,200,5000,5000,5000,5000,5000}",12.85,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00
10765125,NX_496748,PH-20241231-24,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00,10,N_2,NO3,2024-12-31 21:39:10.554 +0000,2024-12-31 20:57:00+00:00,"{'X10224735588','X10231756122','X10226066681',...","{'X10226066683','X10232275812','X10223438983',...","{1262,1100,885,885,690,680,586,381,380,380}","{1685,1889,1973,2250,2270,2531,2531,2531,2700,...","{4300,6900,5200,1700,1000,200,10000,2400,200,800}","{1500,200,300,2000,200,5000,5000,5000,5000,5000}",12.85,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00
10765150,NX_496748,PH-20241231-24,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00,10,N_2,NO3,2024-12-31 21:39:10.554 +0000,2024-12-31 20:58:00+00:00,"{'X10232329532','X10231756122','X10226066681',...","{'X10232331930','X10232326949','X10226066683',...","{1101,1100,885,885,690,680,586,381,380,380}","{1200,1300,1685,1889,1973,2250,2270,2531,2531,...","{400,6900,5200,1700,1000,200,10000,400,200,800}","{10700,10000,1500,200,300,2000,200,5000,5000,5...",12.85,2024-12-31 22:00:00+00:00,2024-12-31 23:00:00+00:00


## Formatting the DF

In [4]:
# Set the delivery hour to be the index
def setIndex(df):
    df = df.set_index(['dlvryStart', 'time'])
    return df

# Sort by dlvryStart and time. Meaning that we get dlvryStart at e.g. 23 with time=14.01, 14.02 etc before dlvryStart at 00 with time=14.01, 14.02 etc
def sortByDlvryStart(df):
    # Sort by time within each index group
    df = df.sort_values(by=['dlvryStart', 'time'])
    df = df.groupby(df['dlvryStart'], group_keys=False).apply(lambda x: x.sort_values(by='time'))
    return df

# Drop unnecessary columns
def dropColumns(df):
    df = df.drop(columns=['contractId', 'contractName', 'dlvryEnd', 'dlvryAreaId', 'marketId', 'area', 'recorded', 'Delivery Start (UTC)', 'Delivery End (UTC)', 'orderId_bid', 'orderId_ask'])
    return df

# Extract first element in a string. To be used in retrieveBestOffers function
def extractFirstElement(text):
    match = re.search(r'{(\d+)', str(text))
    return float(match.group(1)) if match else 0

# The prices and quantities need to be converted to list to retrieve the first value
def retrieveBestOffers(df):
    columns = ['orderPrice_bid', 'orderQty_bid', 'orderPrice_ask', 'orderQty_ask']
    for col in columns:
        df[col] = df[col].apply(lambda x: extractFirstElement(x))
    return df

# Get ID prices at the same magnitude as DA prices
def fixPriceVolumeMagnitude(df):
    df['orderPrice_bid'] = df['orderPrice_bid'] / 100
    df['orderPrice_ask'] = df['orderPrice_ask'] / 100
    df['orderQty_bid'] = df['orderQty_bid'] / 1000
    df['orderQty_ask'] = df['orderQty_ask'] / 1000
    return df

# Rename column names for better readability
def changeName(df):
    df = df.rename(columns={'NO3 Price (EUR)': 'DA_price'})
    return df

# Split delivery hour and time to month, day, hour etc
def createNewColumns(df):
    df['dlvryMonth'] = df['dlvryStart'].dt.month
    df['dlvryDay'] = df['dlvryStart'].dt.day
    df['dlvryHour'] = df['dlvryStart'].dt.hour
    df['dlvryWeekday'] = df['dlvryStart'].dt.weekday

    df['timeMonth'] = df['time'].dt.month
    df['timeDay'] = df['time'].dt.day
    df['timeHour'] = df['time'].dt.hour
    df['timeWeekday'] = df['time'].dt.weekday
    df['timeMinute'] = df['time'].dt.minute
    return df

def findFutureBestOffers(df):
    # Create columns to hold the target variables
    df['better_bid_later'] = 0
    df['better_ask_later'] = 0
    
    # Group by delivery identifiers
    delivery_groups = df.groupby(['dlvryMonth', 'dlvryDay', 'dlvryHour'])
    
    # Process each delivery hour group
    for _, group in delivery_groups:
        # Calculate cumulative max/min in reverse
        reverse_group = group.iloc[::-1].copy()
        reverse_group['cum_max_bid'] = reverse_group['orderPrice_bid'].replace(0, float('-inf')).cummax()
        reverse_group['cum_min_ask'] = reverse_group['orderPrice_ask'].replace(0, float('inf')).cummin()
        
        # Restore original order
        forward_group = reverse_group.iloc[::-1]
        
        # Set target values (shift to compare with future values)
        valid_bids = forward_group['orderPrice_bid'] > 0
        valid_asks = forward_group['orderPrice_ask'] > 0
        
        # Compare current prices with future best prices
        forward_group.loc[valid_bids, 'better_bid_later'] = (
            forward_group.loc[valid_bids, 'cum_max_bid'].shift(-1) > 
            forward_group.loc[valid_bids, 'orderPrice_bid']
        ).astype(int)
        
        forward_group.loc[valid_asks, 'better_ask_later'] = (
            forward_group.loc[valid_asks, 'cum_min_ask'].shift(-1) <=
            forward_group.loc[valid_asks, 'orderPrice_ask']
        ).astype(int)
        
        # Update the original dataframe
        df.loc[forward_group.index, 'better_bid_later'] = forward_group['better_bid_later']
        df.loc[forward_group.index, 'better_ask_later'] = forward_group['better_ask_later']
    
    ''''
    # Handle NaN values that might have been introduced
    df['better_bid_later'] = df['better_bid_later'].fillna(0).astype(int)
    df['better_ask_later'] = df['better_ask_later'].fillna(0).astype(int)'
    '''
    
    return df


In [5]:
df = createNewColumns(df)
#df = setIndex(df) # Maybe not needed
df = sortByDlvryStart(df)
df = dropColumns(df)
df = retrieveBestOffers(df)
df = fixPriceVolumeMagnitude(df)
df = changeName(df)
df = findFutureBestOffers(df)

In [6]:
df.to_parquet('Data/formatted_orderbook.parquet')

## Start here II
The dataframe is updated based on the formatting functions

In [47]:
df = pd.read_parquet('Data/formatted_orderbook.parquet')

## Train/Test split

In [14]:
df

Unnamed: 0,dlvryStart,time,orderPrice_bid,orderPrice_ask,orderQty_bid,orderQty_ask,DA_price,dlvryMonth,dlvryDay,dlvryHour,dlvryWeekday,timeMonth,timeDay,timeHour,timeWeekday,timeMinute,better_bid_later,better_ask_later
0,2024-01-01 01:00:00+00:00,2023-12-31 23:00:00+00:00,22.47,23.50,0.5,10.0,26.66,1,1,1,0,12,31,23,6,0,1,0
22,2024-01-01 01:00:00+00:00,2023-12-31 23:01:00+00:00,22.47,24.12,0.5,0.5,26.66,1,1,1,0,12,31,23,6,1,1,1
44,2024-01-01 01:00:00+00:00,2023-12-31 23:02:00+00:00,22.47,24.20,0.5,9.1,26.66,1,1,1,0,12,31,23,6,2,1,1
66,2024-01-01 01:00:00+00:00,2023-12-31 23:03:00+00:00,22.47,24.59,0.5,0.3,26.66,1,1,1,0,12,31,23,6,3,1,1
88,2024-01-01 01:00:00+00:00,2023-12-31 23:04:00+00:00,22.47,24.29,0.5,0.6,26.66,1,1,1,0,12,31,23,6,4,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10765075,2024-12-31 22:00:00+00:00,2024-12-31 20:55:00+00:00,12.62,16.85,4.3,1.5,12.85,12,31,22,1,12,31,20,1,55,0,1
10765100,2024-12-31 22:00:00+00:00,2024-12-31 20:56:00+00:00,12.62,16.85,4.3,1.5,12.85,12,31,22,1,12,31,20,1,56,0,1
10765125,2024-12-31 22:00:00+00:00,2024-12-31 20:57:00+00:00,12.62,16.85,4.3,1.5,12.85,12,31,22,1,12,31,20,1,57,0,1
10765150,2024-12-31 22:00:00+00:00,2024-12-31 20:58:00+00:00,11.01,12.00,0.4,10.7,12.85,12,31,22,1,12,31,20,1,58,0,1


In [48]:
# Just create a copy of the dataframe
df_copy = df.copy()
train, test = train_test_split(df_copy, test_size=0.2, shuffle=False)

In [49]:
# I think it is better to train a separate model for bid and ask
FEATURES_BID = ['orderPrice_bid', 'orderQty_bid', 'orderPrice_ask', 'orderQty_ask', 'DA_price', 'dlvryMonth', 'dlvryDay', 'dlvryHour', 'dlvryWeekday', 'timeMonth', 'timeDay', 'timeHour', 'timeWeekday', 'timeMinute']
FEATURES_ASK = ['orderPrice_bid', 'orderQty_bid', 'orderPrice_ask', 'orderQty_ask', 'DA_price', 'dlvryMonth', 'dlvryDay', 'dlvryHour', 'dlvryWeekday', 'timeMonth', 'timeDay', 'timeHour', 'timeWeekday', 'timeMinute']
TARGET_BID = ['better_bid_later']
TARGET_ASK = ['better_ask_later']

In [50]:
x_train_bid = train[FEATURES_BID]
y_train_bid = train[TARGET_BID]
x_test_bid = test[FEATURES_BID]
y_test_bid = test[TARGET_BID]

x_train_ask = train[FEATURES_ASK]
y_train_ask = train[TARGET_ASK]
x_test_ask = test[FEATURES_ASK]
y_test_ask = test[TARGET_ASK]

## Handling Nan and Inf

In [29]:
# Check and clean for NaN in y_train and y_test
print("NaN values in y_train before cleaning:", y_train.isna().sum().sum())
print("NaN values in y_test before cleaning:", y_test.isna().sum().sum())
print("NaN values in x_train before cleaning:", x_train.isna().sum().sum())
print("NaN values in x_test before cleaning:", x_test.isna().sum().sum())

NaN values in y_train before cleaning: 0
NaN values in y_test before cleaning: 0
NaN values in x_train before cleaning: 0
NaN values in x_test before cleaning: 0


In [26]:
# Ensure there are no inf values left in features
print("Infinite values in x_train:", np.isinf(x_train).sum().sum())
print("Infinite values in x_test:", np.isinf(x_test).sum().sum())

Infinite values in x_train: 0
Infinite values in x_test: 0


## Plot the data
Need to plot the data. Maybe use something like sns.pairplot like in the tennis video. Variables plotted against each other

## Model

In [51]:
# Create model instance
model_bid = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.3, objective='binary:logistic')
# Fit model
model_bid.fit(x_train_bid, y_train_bid)
# Make predictions  
preds_bid = model_bid.predict(x_test_bid)

In [None]:
results_bid = x_test_bid.copy()
results_bid['preds'] = preds_bid
results_bid['actual'] = y_test_bid

In [62]:
results_bid.sample(20)

Unnamed: 0,orderPrice_bid,orderQty_bid,orderPrice_ask,orderQty_ask,DA_price,dlvryMonth,dlvryDay,dlvryHour,dlvryWeekday,timeMonth,timeDay,timeHour,timeWeekday,timeMinute,preds,actual
8816605,0.0,2.0,0.19,3.0,0.7,10,26,22,5,10,26,20,5,49,0,0
9491092,9.11,5.0,148.0,3.0,35.07,11,19,11,1,11,18,17,0,57,1,1
10149408,69.05,2.0,109.9,2.0,109.95,12,11,11,2,12,10,23,1,59,0,1
10005094,5.0,11.6,32.9,1.0,28.42,12,6,20,4,12,6,2,4,42,1,1
9072919,0.0,20.0,1.0,10.0,-0.07,11,5,12,1,11,4,15,0,8,0,0
9628452,13.0,2.0,25.65,0.3,20.68,11,23,20,5,11,23,10,5,44,1,1
9722564,21.8,5.0,39.53,4.0,15.09,11,27,7,2,11,26,15,1,15,1,1
10061230,2.02,5.0,33.0,5.0,28.49,12,8,15,6,12,8,0,6,17,1,1
10304023,4.11,5.0,14.0,22.0,9.04,12,16,21,0,12,16,6,0,22,1,1
10633310,0.0,3.8,12.42,10.0,8.42,12,27,19,4,12,27,12,4,45,0,0


## Precision


In [55]:
# Calculate precision
precision = precision_score(y_test_bid, preds_bid)
print('Precision: %.3f' % precision)

Precision: 0.909


In [None]:
# Calculate other important metrics 

# The ratio of true positives to all actual positives. It answers "Of all actual positive instances, how many did we correctly identify?"
recall = recall_score(y_test_bid, preds_bid)

# The harmonic mean of precision and recall, providing a balance between the two
f1 = f1_score(y_test_bid, preds_bid)

# The ratio of correct predictions to total predictions
accuracy = accuracy_score(y_test_bid, preds_bid)

print('Recall: %.3f' % recall)
print('F1 score: %.3f' % f1)
print('Accuracy: %.3f' % accuracy)

Recall: 0.978
F1 score: 0.942
Accuracy: 0.924


## Feature Importance