In [44]:
import datetime as dt

# Timezone-aware (UTC) wall-clock reference taken when this first cell runs, together with
# the overall run-time target for the notebook.
# NOTE(review): neither name is referenced by the other visible cells -- confirm they are
# still meant to drive the training time budget.
start_time = dt.datetime.now(tz=dt.UTC)
MAX_SECONDS = 60 * 10 # aim for 10 minutes run time

In [45]:
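# Kaggle-only setup. Keep these lines commented out when developing locally;
# uncomment them before running on Kaggle.
# NOTE(review): the install line lists 'flaml[automl]', but the modelling cell imports
# autogluon.tabular -- confirm which package actually needs installing.
# ! pip install ephem 'flaml[automl]' # TODO uncomment
# %cd /kaggle/working # TODO uncomment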

In [46]:
import math
from pathlib import Path

import ephem
import kagglehub
import polars as pl
from kagglehub.config import DEFAULT_CACHE_FOLDER

In [47]:
class SunPosition:
    """Sun position calculator for one fixed site, backed by PyEphem.

    A single ``ephem.Observer`` and ``ephem.Sun`` are created up front and reused
    for every query, so repeated calls only update the observer's date.
    """

    def __init__(self, *, latitude: float, longitude: float) -> None:
        """Remember the site coordinates and build the reusable ephem objects.

        Args:
            latitude: Site latitude; forwarded to ephem as a string (see
                ``_create_ephem_observer``), so it is interpreted as degrees.
            longitude: Site longitude, handled the same way as ``latitude``.
        """
        self.latitude = latitude
        self.longitude = longitude
        self._observer = self._create_ephem_observer()
        self._sun = ephem.Sun()

    def _create_ephem_observer(self) -> ephem.Observer:
        """Build an ``ephem.Observer`` located at this instance's coordinates."""
        site = ephem.Observer()
        # The coordinates are passed as strings on purpose: PyEphem parses string
        # angles as degrees, whereas bare floats would be read as radians.
        site.lat, site.lon = str(self.latitude), str(self.longitude)
        return site

    def altitude(self, *, timestamp_utc: dt.datetime) -> float:
        """Return the sun's altitude at the site for the given instant.

        Args:
            timestamp_utc: Moment to evaluate; assigned directly to the observer's date.

        Returns:
            The ``alt`` attribute of the ephem sun body after recomputation, returned
            unchanged (PyEphem angles are floats expressed in radians).
        """
        observer, sun = self._observer, self._sun
        observer.date = timestamp_utc
        sun.compute(observer)
        return sun.alt

In [48]:
# Competition sub-folder beneath kagglehub's default cache folder.
# NOTE(review): not referenced by the other visible cells -- confirm it is still needed.
CACHE_DIR = Path(DEFAULT_CACHE_FOLDER).joinpath("competitions", "hill-of-towie-wind-turbine-power-prediction")


def load_training_dataset(*, force_download: bool = False, just_for_year: int | None = None) -> pl.LazyFrame:
    """Lazily scan the competition's training parquet file.

    Args:
        force_download: Forwarded unchanged to ``kagglehub.competition_download``.
        just_for_year: When given, only rows whose ``TimeStamp_StartFormat`` falls in
            this calendar year are kept; ``None`` keeps every row.

    Returns:
        A ``pl.LazyFrame`` over the training data; nothing is materialised here.
    """
    parquet_path = Path(
        kagglehub.competition_download(
            handle="hill-of-towie-wind-turbine-power-prediction",
            path="training_dataset.parquet",
            force_download=force_download,
        )
    )
    training = pl.scan_parquet(parquet_path)
    if just_for_year is not None:
        # Applied on the lazy frame, so the year filter is part of the query plan.
        training = training.filter(pl.col("TimeStamp_StartFormat").dt.year() == just_for_year)
    return training


def load_submission_dataset(*, force_download: bool = False) -> pl.LazyFrame:
    """Lazily scan the competition's submission parquet file.

    Args:
        force_download: Forwarded unchanged to ``kagglehub.competition_download``.

    Returns:
        A ``pl.LazyFrame`` over the submission data; nothing is materialised here.
    """
    download_kwargs = {
        "handle": "hill-of-towie-wind-turbine-power-prediction",
        "path": "submission_dataset.parquet",
        "force_download": force_download,
    }
    parquet_path = Path(kagglehub.competition_download(**download_kwargs))
    return pl.scan_parquet(parquet_path)

def filter_is_valid(X: pl.DataFrame, y: pl.Series) -> tuple[pl.DataFrame, pl.Series]:
    """Keep only the rows flagged as valid, in both the features and the target.

    Args:
        X: Feature frame containing a boolean ``is_valid`` column.
        y: Target series, row-aligned with ``X``.

    Returns:
        ``(X, y)`` restricted to the rows where ``X["is_valid"]`` is true.
    """
    # Take the mask from the unfiltered frame so both objects are cut with the same rows.
    keep_rows = X.get_column("is_valid")
    return X.filter(pl.col("is_valid")), y.filter(keep_rows)

In [49]:
# Load 2019 only: a single year is more representative of a best-case wind-up analysis,
# where the pre and post periods are at most 1 year long.
# The lazy scan is materialised here; the last expression displays the first five rows.
df_train = load_training_dataset(just_for_year=2019).collect()
df_train.head(5)

TimeStamp_StartFormat,wtc_AcWindSp_mean;1,wtc_AcWindSp_mean;2,wtc_AcWindSp_mean;3,wtc_AcWindSp_mean;4,wtc_AcWindSp_mean;5,wtc_AcWindSp_mean;7,wtc_AcWindSp_min;1,wtc_AcWindSp_min;2,wtc_AcWindSp_min;3,wtc_AcWindSp_min;4,wtc_AcWindSp_min;5,wtc_AcWindSp_min;7,wtc_AcWindSp_max;1,wtc_AcWindSp_max;2,wtc_AcWindSp_max;3,wtc_AcWindSp_max;4,wtc_AcWindSp_max;5,wtc_AcWindSp_max;7,wtc_AcWindSp_stddev;1,wtc_AcWindSp_stddev;2,wtc_AcWindSp_stddev;3,wtc_AcWindSp_stddev;4,wtc_AcWindSp_stddev;5,wtc_AcWindSp_stddev;7,wtc_ScYawPos_mean;1,wtc_ScYawPos_mean;2,wtc_ScYawPos_mean;3,wtc_ScYawPos_mean;4,wtc_ScYawPos_mean;5,wtc_ScYawPos_mean;7,wtc_ScYawPos_min;1,wtc_ScYawPos_min;2,wtc_ScYawPos_min;3,wtc_ScYawPos_min;4,wtc_ScYawPos_min;5,wtc_ScYawPos_min;7,…,wtc_ActPower_max;2,wtc_ActPower_max;3,wtc_ActPower_max;4,wtc_ActPower_max;5,wtc_ActPower_max;7,wtc_ActPower_stddev;1,wtc_ActPower_stddev;2,wtc_ActPower_stddev;3,wtc_ActPower_stddev;4,wtc_ActPower_stddev;5,wtc_ActPower_stddev;7,wtc_AmbieTmp_mean;1,wtc_AmbieTmp_mean;2,wtc_AmbieTmp_mean;3,wtc_AmbieTmp_mean;4,wtc_AmbieTmp_mean;5,wtc_AmbieTmp_mean;7,ShutdownDuration;1,ShutdownDuration;2,ShutdownDuration;3,ShutdownDuration;4,ShutdownDuration;5,ShutdownDuration;7,ERA5_temperature_2m,ERA5_relative_humidity_2m,ERA5_dew_point_2m,ERA5_precipitation,ERA5_surface_pressure,ERA5_cloud_cover,ERA5_wind_speed_10m,ERA5_wind_speed_100m,ERA5_wind_direction_10m,ERA5_wind_direction_100m,ERA5_wind_gusts_10m,id,is_valid,target
"datetime[μs, UTC]",f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i16,i16,i16,i16,i16,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i32,bool,f64
2019-01-01 00:00:00 UTC,12.27432,13.16435,11.24778,11.08846,10.63383,13.0237,4.98,7.57,5.28,5.49,4.65,7.78,17.790001,21.66,16.32,18.84,18.0,20.790001,2.353884,2.328966,2.005096,2.536227,2.682431,2.399789,-54.786541,-90.360458,-74.553673,-70.138687,-93.22419,-57.71698,-55.299999,-94.699997,-77.599998,-71.0,-96.5,-59.299999,…,2390.0,2358.0,2358.0,2349.0,2352.0,375.827087,159.997101,403.552887,460.781403,612.429077,297.708893,4.291883,5.0,5.0,4.212133,5.0,4.626184,0,0,0,0,0,0,6.642,70.984154,1.752,0.0,994.82843,70.0,8.448077,13.345411,286.504425,287.891968,18.4,-52560,True,2005.649048
2019-01-01 00:10:00 UTC,10.87804,12.9349,10.46269,9.208033,10.01653,10.79133,6.42,7.78,5.49,4.49,4.65,6.22,15.8,19.16,16.530001,15.17,15.48,16.950001,1.870514,1.893832,2.004121,2.052478,2.116802,2.282843,-56.252041,-91.302856,-77.594261,-69.643448,-93.010674,-55.443501,-58.700001,-92.400002,-80.5,-71.0,-94.099998,-62.200001,…,2355.0,2342.0,2348.0,2343.0,2352.0,415.182098,203.421402,388.686005,493.755707,579.818481,480.481812,4.129384,5.0,5.0,4.129483,5.0,4.255417,0,0,0,0,0,0,11.292,73.076088,6.642,0.0,990.559143,100.0,8.723532,13.800363,244.903778,246.96492,17.6,-52559,True,1789.771973
2019-01-01 00:20:00 UTC,10.57354,9.532747,12.13545,11.79736,10.21164,12.68395,4.65,4.98,6.84,4.81,5.59,7.15,17.059999,15.48,17.27,16.950001,15.8,17.690001,2.043976,1.990858,2.118668,2.224131,1.962167,1.925842,-44.838291,-85.389122,-65.529373,-63.350052,-86.091339,-51.238529,-56.099998,-91.0,-76.0,-69.900002,-93.099998,-52.5,…,2371.0,2349.0,2373.0,2348.0,2363.0,501.897003,538.811584,413.073914,447.610596,541.481079,328.893799,4.001917,5.0,5.0,4.0499,4.939867,4.131166,0,0,0,0,0,0,9.392,74.005676,4.992,0.0,994.148438,100.0,7.300684,11.847363,246.595291,248.198532,13.7,-52558,True,1654.629028
2019-01-01 00:30:00 UTC,13.00446,11.11443,12.33127,12.81789,11.34538,13.63712,8.19,6.42,6.74,8.3,6.84,7.88,18.530001,16.639999,18.209999,17.48,15.69,19.889999,2.0071,1.945408,1.845071,1.657609,1.745914,2.062722,-43.624008,-78.366547,-61.45805,-61.885479,-82.292229,-49.95879,-44.5,-82.800003,-64.199997,-63.900002,-85.199997,-51.599998,…,2353.0,2352.0,2350.0,2353.0,2354.0,238.3564,431.991486,194.204605,196.636093,385.329712,270.309204,4.0258,5.0,5.0,4.0096,4.812817,4.20925,0,0,0,0,0,0,9.392,74.005676,4.992,0.0,994.148438,100.0,7.300684,11.847363,246.595291,248.198532,13.7,-52557,True,2186.006104
2019-01-01 00:40:00 UTC,12.96183,11.96091,12.87781,13.76736,11.48969,13.01157,4.65,5.18,7.57,9.44,6.22,7.67,18.32,20.08,17.690001,17.58,15.8,18.32,2.288595,2.364866,1.80783,1.579915,1.807639,1.847596,-42.132721,-77.372864,-61.73,-60.481689,-80.10173,-50.463692,-47.400002,-81.0,-67.199997,-61.900002,-85.599998,-54.0,…,2348.0,2348.0,2356.0,2353.0,2356.0,311.470398,373.480103,111.772003,22.13958,396.871399,211.948395,4.047367,5.0,5.0,4.0013,4.691733,4.07885,0,0,0,0,0,0,9.342,86.430542,7.192,0.0,994.919617,100.0,5.882176,9.752948,252.181015,254.538727,11.5,-52556,True,2141.480957


In [50]:
# Display the last five rows of the 2019 training frame.
df_train.tail(5)

TimeStamp_StartFormat,wtc_AcWindSp_mean;1,wtc_AcWindSp_mean;2,wtc_AcWindSp_mean;3,wtc_AcWindSp_mean;4,wtc_AcWindSp_mean;5,wtc_AcWindSp_mean;7,wtc_AcWindSp_min;1,wtc_AcWindSp_min;2,wtc_AcWindSp_min;3,wtc_AcWindSp_min;4,wtc_AcWindSp_min;5,wtc_AcWindSp_min;7,wtc_AcWindSp_max;1,wtc_AcWindSp_max;2,wtc_AcWindSp_max;3,wtc_AcWindSp_max;4,wtc_AcWindSp_max;5,wtc_AcWindSp_max;7,wtc_AcWindSp_stddev;1,wtc_AcWindSp_stddev;2,wtc_AcWindSp_stddev;3,wtc_AcWindSp_stddev;4,wtc_AcWindSp_stddev;5,wtc_AcWindSp_stddev;7,wtc_ScYawPos_mean;1,wtc_ScYawPos_mean;2,wtc_ScYawPos_mean;3,wtc_ScYawPos_mean;4,wtc_ScYawPos_mean;5,wtc_ScYawPos_mean;7,wtc_ScYawPos_min;1,wtc_ScYawPos_min;2,wtc_ScYawPos_min;3,wtc_ScYawPos_min;4,wtc_ScYawPos_min;5,wtc_ScYawPos_min;7,…,wtc_ActPower_max;2,wtc_ActPower_max;3,wtc_ActPower_max;4,wtc_ActPower_max;5,wtc_ActPower_max;7,wtc_ActPower_stddev;1,wtc_ActPower_stddev;2,wtc_ActPower_stddev;3,wtc_ActPower_stddev;4,wtc_ActPower_stddev;5,wtc_ActPower_stddev;7,wtc_AmbieTmp_mean;1,wtc_AmbieTmp_mean;2,wtc_AmbieTmp_mean;3,wtc_AmbieTmp_mean;4,wtc_AmbieTmp_mean;5,wtc_AmbieTmp_mean;7,ShutdownDuration;1,ShutdownDuration;2,ShutdownDuration;3,ShutdownDuration;4,ShutdownDuration;5,ShutdownDuration;7,ERA5_temperature_2m,ERA5_relative_humidity_2m,ERA5_dew_point_2m,ERA5_precipitation,ERA5_surface_pressure,ERA5_cloud_cover,ERA5_wind_speed_10m,ERA5_wind_speed_100m,ERA5_wind_direction_10m,ERA5_wind_direction_100m,ERA5_wind_gusts_10m,id,is_valid,target
"datetime[μs, UTC]",f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i16,i16,i16,i16,i16,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i32,bool,f64
2019-12-31 23:10:00 UTC,8.897216,6.674914,8.790455,9.309954,6.749103,9.335896,4.81,3.26,4.4,5.07,3.59,5.59,12.69,10.07,12.45,13.17,9.75,13.52,1.453008,1.169652,1.494632,1.24644,1.121678,1.443997,-96.119614,-126.805603,-117.9804,-112.195999,-136.195602,-101.531799,-100.699997,-131.5,-121.5,-114.800003,-138.0,-105.900002,…,662.0,1592.0,1676.0,693.0,1658.0,184.697006,75.007767,290.05719,192.467102,112.776604,299.1586,3.516783,3.986933,3.992567,3.62495,3.49205,3.162083,0,0,0,0,0,0,1.942,90.765717,0.592,0.0,1001.884033,69.0,3.088689,6.236986,209.054504,221.099411,5.6,-5,True,964.380127
2019-12-31 23:20:00 UTC,9.303802,7.50387,8.951791,9.97668,8.194743,8.700228,6.32,3.59,4.98,6.94,4.57,4.65,12.21,11.5,12.69,13.64,11.26,13.05,1.038562,1.186081,1.556765,1.226175,1.112155,1.936404,-97.002197,-128.177399,-118.5597,-113.142998,-136.962708,-104.914497,-103.0,-134.100006,-121.400002,-115.199997,-140.600006,-106.0,…,1064.0,1581.0,1884.0,1126.0,1650.0,177.164505,129.734695,367.965393,315.084015,153.462906,411.925598,4.467517,4.278967,4.643017,4.41965,4.003917,3.422133,0,0,0,0,0,0,2.842,63.106548,-3.488,0.0,996.454956,95.0,4.326662,8.902246,213.690094,218.157272,8.5,-4,True,1106.759033
2019-12-31 23:30:00 UTC,8.292965,7.242282,7.387715,8.898215,7.778623,7.802809,4.98,4.98,4.49,5.49,5.18,5.28,10.79,9.54,9.85,11.86,10.55,9.75,0.972607,0.881621,0.918921,0.909908,0.89253,0.779341,-96.173378,-124.966599,-116.936699,-113.882103,-133.844696,-102.936096,-103.0,-134.100006,-121.199997,-117.400002,-140.600006,-107.0,…,628.0,673.0,1258.0,761.0,817.0,76.277786,61.76931,71.848351,117.3293,94.306686,58.581848,4.618834,4.787467,3.607067,4.939967,4.704933,2.973367,0,0,0,0,0,0,2.942,61.226002,-3.798,0.0,996.174988,55.0,4.701064,9.276314,218.088821,221.941223,8.8,-3,True,687.353088
2019-12-31 23:40:00 UTC,9.617872,9.914719,8.692904,10.19371,10.44945,8.951912,6.63,5.9,5.9,7.15,6.11,6.22,12.81,14.24,11.5,12.81,14.0,11.26,0.957572,1.496119,1.032007,0.999657,1.459871,0.746953,-83.940231,-113.025002,-106.912804,-102.196404,-121.0886,-94.316544,-92.199997,-121.099998,-112.699997,-110.900002,-134.5,-97.300003,…,1829.0,1221.0,1689.0,2067.0,1025.0,195.377304,362.835999,158.093002,212.542206,361.223389,63.843021,3.230533,4.2476,3.0,3.471367,4.1088,2.720033,0,0,0,0,0,0,1.942,90.765717,0.592,0.0,1001.884033,69.0,3.088689,6.236986,209.054504,221.099411,5.6,-2,True,1107.057007
2019-12-31 23:50:00 UTC,9.997997,10.40753,9.129176,10.60657,10.89676,9.398952,7.26,5.7,6.11,6.53,5.18,5.28,12.45,13.52,12.1,14.12,14.24,12.33,0.902917,1.177762,0.956576,1.312517,1.392926,1.278592,-77.398903,-104.9664,-102.875099,-96.931488,-112.689903,-90.385643,-83.099998,-110.800003,-107.900002,-99.199997,-115.800003,-98.800003,…,1817.0,1488.0,2084.0,2188.0,1794.0,167.616104,149.195007,215.106796,316.128387,276.866394,330.417511,2.906667,3.281917,3.0,3.090867,3.0883,2.973267,0,0,0,0,0,0,5.292,76.816032,1.552,0.0,1000.020325,89.0,3.008322,5.950631,195.422226,204.842361,4.0,-1,True,1328.432983


In [51]:
# Split the training frame: "target" becomes y, every other column stays in X.
y_train = df_train.get_column("target")
X_train = df_train.drop("target")


In [52]:
# Remove invalid rows: keep only the rows whose "is_valid" flag is true, in both X and y.
X_train, y_train = filter_is_valid(X_train, y_train)

In [53]:
# preprocess X
def preprocess_x(
    X: pl.DataFrame,
    *,
    latitude: float = 57.50576819514985,
    longitude: float = -3.0683841268762757,
    excluded_turbine: str = "1",
) -> pl.DataFrame:
    """Turn a raw competition frame into the feature frame used for modelling.

    Adds a ``sun_altitude`` feature computed from ``TimeStamp_StartFormat`` and then
    removes the bookkeeping columns and every column belonging to one turbine.

    Args:
        X: Frame containing a ``TimeStamp_StartFormat`` datetime column.
        latitude: Site latitude passed to ``SunPosition``; defaults to the value
            originally hard-coded in this notebook.
        longitude: Site longitude passed to ``SunPosition``; same remark as ``latitude``.
        excluded_turbine: Columns whose last ``;``-separated token equals this value are
            dropped. Defaults to ``"1"`` (presumably the turbine whose power is the
            prediction target -- confirm against the competition description).

    Returns:
        A new frame with ``sun_altitude`` added and with ``id``, ``is_valid``,
        ``TimeStamp_StartFormat`` and the excluded turbine's columns removed.
    """
    sun_position = SunPosition(latitude=latitude, longitude=longitude)
    X = X.with_columns(
        pl.col("TimeStamp_StartFormat")
        # ephem is called once per row from Python; the explicit dtype keeps the column Float64.
        .map_elements(lambda ts: sun_position.altitude(timestamp_utc=ts), return_dtype=pl.Float64)
        # Scale by 180/pi: the altitude comes back from ephem in radians.
        .mul(180 / math.pi)
        .alias("sun_altitude"),
    )
    turbine_columns = [name for name in X.columns if name.split(";")[-1] == excluded_turbine]
    cols_to_exclude = ["id", "is_valid", "TimeStamp_StartFormat", *turbine_columns]
    return X.select(pl.exclude(cols_to_exclude))
# Replace the raw training features with the model-ready feature frame and list its columns.
# NOTE(review): preprocess_x reads and then drops "TimeStamp_StartFormat", so re-running this
# cell on the already-processed X_train fails; re-run the cells that build X_train first.
X_train = preprocess_x(X_train)
print(X_train.columns)

['wtc_AcWindSp_mean;2', 'wtc_AcWindSp_mean;3', 'wtc_AcWindSp_mean;4', 'wtc_AcWindSp_mean;5', 'wtc_AcWindSp_mean;7', 'wtc_AcWindSp_min;2', 'wtc_AcWindSp_min;3', 'wtc_AcWindSp_min;4', 'wtc_AcWindSp_min;5', 'wtc_AcWindSp_min;7', 'wtc_AcWindSp_max;2', 'wtc_AcWindSp_max;3', 'wtc_AcWindSp_max;4', 'wtc_AcWindSp_max;5', 'wtc_AcWindSp_max;7', 'wtc_AcWindSp_stddev;2', 'wtc_AcWindSp_stddev;3', 'wtc_AcWindSp_stddev;4', 'wtc_AcWindSp_stddev;5', 'wtc_AcWindSp_stddev;7', 'wtc_ScYawPos_mean;2', 'wtc_ScYawPos_mean;3', 'wtc_ScYawPos_mean;4', 'wtc_ScYawPos_mean;5', 'wtc_ScYawPos_mean;7', 'wtc_ScYawPos_min;2', 'wtc_ScYawPos_min;3', 'wtc_ScYawPos_min;4', 'wtc_ScYawPos_min;5', 'wtc_ScYawPos_min;7', 'wtc_ScYawPos_max;2', 'wtc_ScYawPos_max;3', 'wtc_ScYawPos_max;4', 'wtc_ScYawPos_max;5', 'wtc_ScYawPos_max;7', 'wtc_ScYawPos_stddev;2', 'wtc_ScYawPos_stddev;3', 'wtc_ScYawPos_stddev;4', 'wtc_ScYawPos_stddev;5', 'wtc_ScYawPos_stddev;7', 'wtc_NacelPos_mean;2', 'wtc_NacelPos_mean;3', 'wtc_NacelPos_mean;4', 'wtc_Nacel

In [54]:
from autogluon.tabular import TabularPredictor
import tempfile

# Training frame: the engineered features plus the target stored under the label name
# that the predictor is told to learn.
train_data = X_train.with_columns(y_train.alias("t1_power"))

# Keep the submission ids aside first: preprocess_x removes the "id" column.
X_test = load_submission_dataset().collect()
df_id = X_test.select("id")
X_test = preprocess_x(X_test)

# NOTE(review): the fit budget below is hard-coded and is not derived from the
# MAX_SECONDS / start_time values defined in the first cell -- confirm this is intended.
with tempfile.TemporaryDirectory() as model_dir:
    # AutoGluon writes its models into model_dir, so prediction happens before the
    # temporary directory is cleaned up on leaving this block.
    model = TabularPredictor(
        label="t1_power",
        problem_type='regression',
        eval_metric='mae',
        path=model_dir,
    ).fit(
        train_data.to_pandas(),
        presets='medium',
        time_limit=3 * 60,
    )
    raw_predictions = model.predict(X_test.to_pandas())
    # Predictions are floored at zero before being submitted.
    y_test = pl.Series(values=raw_predictions).clip(lower_bound=0)

submission = df_id.with_columns(prediction=y_test)

Preset alias specified: 'medium' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.8
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          12
Memory Avail:       5.94 GB / 31.69 GB (18.7%)
Disk Space Avail:   134.45 GB / 475.70 GB (28.3%)
Presets specified: ['medium']
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ... Time limit = 180s
AutoGluon will save models to "C:\Users\aclerc\AppData\Local\Temp\tmp1ncaaf6y"
Train Data Rows:    50985
Train Data Columns: 157
Label Column:       t1_power
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    6067.77 MB
	Train Data (Original)  Memory Usage: 57.47 MB (0.9% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in 

[1000]	valid_set's l1: 48.462
[2000]	valid_set's l1: 47.1359
[3000]	valid_set's l1: 46.5144
[4000]	valid_set's l1: 46.2489
[5000]	valid_set's l1: 46.1082
[6000]	valid_set's l1: 46.0552
[7000]	valid_set's l1: 45.9489
[8000]	valid_set's l1: 45.9255
[9000]	valid_set's l1: 45.8959
[10000]	valid_set's l1: 45.8738


	-45.8723	 = Validation score   (-mean_absolute_error)
	51.27s	 = Training   runtime
	0.69s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 126.33s of the 126.33s of remaining time.
	Fitting with cpus=10, gpus=0, mem=0.3/5.7 GB


[1000]	valid_set's l1: 47.1107
[2000]	valid_set's l1: 46.2193
[3000]	valid_set's l1: 45.7663
[4000]	valid_set's l1: 45.58
[5000]	valid_set's l1: 45.5222
[6000]	valid_set's l1: 45.4754
[7000]	valid_set's l1: 45.4704
[8000]	valid_set's l1: 45.4439
[9000]	valid_set's l1: 45.4115
[10000]	valid_set's l1: 45.3851


	-45.3835	 = Validation score   (-mean_absolute_error)
	114.22s	 = Training   runtime
	0.61s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 10.53s of the 10.53s of remaining time.
	Fitting with cpus=12, gpus=0, mem=0.0/5.7 GB
	-53.0167	 = Validation score   (-mean_absolute_error)
	619.28s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 179.27s of the -609.66s of remaining time.
	Ensemble Weights: {'LightGBM': 0.591, 'LightGBMXT': 0.409}
	-44.3661	 = Validation score   (-mean_absolute_error)
	0.05s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 789.89s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 1918.9 rows/s (2500 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("C:\Users\aclerc\AppData\Local\Temp\tmp1ncaaf6y")


In [55]:
# The submission must expose exactly these two columns, in this order.
expected_columns = ["id", "prediction"]
assert submission.columns == expected_columns, f'Expected columns ["id", "prediction"], found: {submission.columns}'

# Neither column may contain null or NaN entries (checked per column: nulls first, then NaNs).
for column_name in ("id", "prediction"):
    column = pl.col(column_name)
    assert submission.select(column.is_null().sum()).item() == 0, (
        f"There are null values in the '{column_name}' column"
    )
    assert submission.select(column.is_nan().sum()).item() == 0, (
        f"There are nan values in the '{column_name}' column"
    )

# Row ids must be unique ...
id_frame = submission.select("id")
duplicated_ids = id_frame.is_duplicated()
assert not duplicated_ids.any(), (
    f"There are duplicated ids: {id_frame.filter(duplicated_ids).to_series().unique()}"
)

# ... and must all lie in 0..52703. 52704 == 366 * 144, which would be one leap year of
# 10-minute periods -- presumably the submission year; confirm against the competition rules.
allowed_ids = set(range(52704))
invalid_ids = set(submission.get_column("id").unique().to_list()) - allowed_ids
assert not invalid_ids, f"The following row IDs are not within the expected ones: {invalid_ids}"

print("Submission file is valid and ready for submission.")

# Written only after every check above has passed.
submission.write_csv("submission.csv")

Submission file is valid and ready for submission.
