In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
# Display option only: pandas renders floats with zero decimal places from here on.
pd.set_option('display.float_format', '{:.0f}'.format)

In [14]:
def get_dataframe():
    """Load the water-temperature, weather and sun CSVs and join them.

    Three CSV files are read via relative paths:

    * ``water_temp_dc_2016_2024_hourly.csv`` -- only columns ``t`` and ``v``
      are kept; ``t`` is parsed to datetimes.
    * ``cln_dc_weather_2016_2024_hourly.csv`` -- joined on ``dt_est``.
    * ``sun_data_2016_2024.csv`` -- its ``datetime`` column is parsed into a
      new ``t`` column, which is the join key.

    Both joins are inner joins, so only rows whose keys appear in all three
    files are kept.

    Returns:
        pandas.DataFrame: the merged frame, indexed by ``t`` (``t`` is also
        kept as a regular column). ``dt_est`` holds whole seconds since
        1970-01-01 derived from ``t``.
    """
    label = pd.read_csv('water_temp_dc_2016_2024_hourly.csv')[['t', 'v']]
    weather = pd.read_csv('cln_dc_weather_2016_2024_hourly.csv')
    sun = pd.read_csv('sun_data_2016_2024.csv')

    sun['t'] = pd.to_datetime(sun['datetime'])
    label['t'] = pd.to_datetime(label['t'])

    # Integer epoch seconds for the weather join key. Timestamp/Timedelta
    # arithmetic is used instead of `astype('int64') // 1e9` because that form
    # (a) silently turns the key into float64 and (b) assumes the datetime
    # column is stored with nanosecond resolution.
    epoch = pd.Timestamp(0, tz=label['t'].dt.tz)
    label['dt_est'] = (label['t'] - epoch) // pd.Timedelta(seconds=1)

    df = pd.merge(label, weather, on='dt_est', how='inner')
    df = pd.merge(df, sun, on='t', how='inner')

    # Index by timestamp while keeping `t` available as a column.
    df.index = df['t']

    return df


def preprocess(df, features=None):
    """Return the rows of ``df`` that have no missing value in the needed columns.

    Args:
        df: frame containing a ``v`` column and, if given, every column
            named in ``features``.
        features: optional iterable of column names that must also be
            non-null; ``None`` or an empty collection means only ``v`` is
            checked.

    Returns:
        A filtered frame; the input frame is not modified.
    """
    required = ['v']
    if features:
        required.extend(features)

    # One combined mask: a row survives only if every required column is present.
    complete = df[required].notna().all(axis=1)
    return df[complete]

In [15]:
# Load and join the CSVs, then keep only the rows whose 'v' value is present.
df = preprocess(get_dataframe())

# Separate, unmerged copy of the sun table, read straight from disk.
sun_df = pd.read_csv('sun_data_2016_2024.csv')

In [16]:
# Render the preprocessed frame as this cell's output.
df

Unnamed: 0_level_0,t,v,dt_est,Unnamed: 0,temp,visibility,dew_point,feels_like,temp_min,temp_max,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_main,weather_description,datetime,is_sun_up
t,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-07 13:00:00,2016-01-07 13:00:00,44,1452171600,162,41,10000,36,38,40,42,...,,,,,,100,Clouds,overcast clouds,2016-01-07 13:00:00,True
2016-01-07 14:00:00,2016-01-07 14:00:00,43,1452175200,163,42,6437,36,42,41,43,...,,,,,,100,Haze,haze,2016-01-07 14:00:00,True
2016-01-07 15:00:00,2016-01-07 15:00:00,43,1452178800,164,42,8047,37,42,42,43,...,,,,,,100,Haze,haze,2016-01-07 15:00:00,True
2016-01-07 16:00:00,2016-01-07 16:00:00,43,1452182400,165,43,10000,38,43,42,44,...,,,,,,100,Clouds,overcast clouds,2016-01-07 16:00:00,True
2016-01-07 17:00:00,2016-01-07 17:00:00,43,1452186000,166,41,10000,35,38,41,43,...,,,,,,100,Clouds,overcast clouds,2016-01-07 17:00:00,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-01 14:00:00,2024-01-01 14:00:00,44,1704117600,73780,41,10000,31,35,40,43,...,,,,,,100,Rain,light rain,2024-01-01 14:00:00,True
2024-01-01 15:00:00,2024-01-01 15:00:00,44,1704121200,73781,40,10000,32,33,39,41,...,0,0,,,,100,Rain,light rain,2024-01-01 15:00:00,True
2024-01-01 16:00:00,2024-01-01 16:00:00,44,1704124800,73782,39,10000,32,32,38,40,...,,0,,,,100,Rain,light rain,2024-01-01 16:00:00,True
2024-01-01 17:00:00,2024-01-01 17:00:00,44,1704128400,73783,39,10000,34,32,38,40,...,,0,,,,100,Rain,light rain,2024-01-01 17:00:00,False


In [89]:
# Start again from the raw CSVs. `add_prior_features` and `hour_max` are defined
# in cells outside this excerpt (presumably they add the *_minus_Nhr columns
# listed below -- confirm against their definitions).
df = add_prior_features(get_dataframe(), hour_max)

TARGET = 'v'
FEATURES = [
    'month', 'hour', 'day',
    'temp_minus_2hr', 'temp_minus_1hr', 'temp',
    'humidity', 'clouds_all',
    'v_minus_1hr',
]

# Index by timestamp, then expose its calendar parts as ordinary columns.
df.index = df['t']
df = df.assign(
    month=df.index.month,
    hour=df.index.hour,
    day=df.index.day,
)

# Keep only rows where 'v' and every selected feature are present.
df = preprocess(df, features=FEATURES)
train_df, test_df = split_train_test(df)
X_train, y_train, X_test, y_test = split_features_target(train_df, test_df, FEATURES, TARGET)

# Regressor with library-default parameters; both sets are passed as eval sets,
# so a metric line for each is printed per boosting round.
model = xgb.XGBRegressor()
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
)

[0]	validation_0-rmse:11.46629	validation_1-rmse:10.69248
[1]	validation_0-rmse:8.03146	validation_1-rmse:7.48975
[2]	validation_0-rmse:5.62648	validation_1-rmse:5.24593
[3]	validation_0-rmse:3.94287	validation_1-rmse:3.67764
[4]	validation_0-rmse:2.76477	validation_1-rmse:2.57846
[5]	validation_0-rmse:1.94112	validation_1-rmse:1.80849
[6]	validation_0-rmse:1.36631	validation_1-rmse:1.27279
[7]	validation_0-rmse:0.96638	validation_1-rmse:0.89999
[8]	validation_0-rmse:0.68987	validation_1-rmse:0.64268
[9]	validation_0-rmse:0.50099	validation_1-rmse:0.46716
[10]	validation_0-rmse:0.37465	validation_1-rmse:0.34946
[11]	validation_0-rmse:0.29299	validation_1-rmse:0.27273
[12]	validation_0-rmse:0.24275	validation_1-rmse:0.22584
[13]	validation_0-rmse:0.21315	validation_1-rmse:0.19820
[14]	validation_0-rmse:0.19674	validation_1-rmse:0.18298
[15]	validation_0-rmse:0.18781	validation_1-rmse:0.17459
[16]	validation_0-rmse:0.18288	validation_1-rmse:0.16993
[17]	validation_0-rmse:0.18022	validati