In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw per-station observations; the lag features are built from this frame further below.
prediction_raw_data = pd.read_parquet("../data/prediction_raw_data.parquet")

In [3]:
# Auxiliary tables. Paths are relative to the directory the notebook is run from.
metadata = pd.read_csv(
    "../data/metadata_sample_submission.csv",
    index_col=[0],
)
bank_holidays_bcn = pd.read_csv(
    "../data/bank_holidays_bcn.csv",
    parse_dates=["holiday_date"],  # needed so the holiday lookup compares datetimes
    index_col=[0],
)
station_information = pd.read_csv("../data/station_information.csv")
meteo_data = pd.read_csv("../data/valores_booleanos_meteo.csv")

In [4]:
# Turn the "data" column into a datetime64 "datetime" column and drop the original.
# The value is round-tripped through a second-resolution string before being parsed again.
meteo_data = meteo_data.assign(
    datetime=pd.to_datetime(
        pd.to_datetime(meteo_data["data"]).dt.strftime("%Y-%m-%d %H:%M:%S")
    )
).drop(columns="data")

In [5]:
# Derive the calendar columns on the submission metadata:
#   * year       - every row is stamped with 2024 (hard-coded)
#   * station_id - cast to str (not int) so it matches the str keys used for the merges below
#   * date       - day-resolution timestamp assembled from year/month/day
#   * datetime   - hour-resolution timestamp assembled from year/month/day/hour
metadata = metadata.assign(
    year=2024,
    station_id=lambda frame: frame["station_id"].astype(str),
    date=lambda frame: pd.to_datetime(frame[["year", "month", "day"]]),
    datetime=lambda frame: pd.to_datetime(frame[["year", "month", "day", "hour"]]),
)

In [6]:
# Flag the rows whose calendar date appears in the Barcelona bank-holiday table.
metadata = metadata.assign(
    is_holidays=metadata["date"].isin(bank_holidays_bcn["holiday_date"])
)

In [7]:
# Join keys must share a dtype: cast the station table's id to str before merging.
station_information = station_information.assign(
    station_id=station_information["station_id"].astype(str)
)

# Left join keeps every metadata row, attaching the station attributes where available.
metadata = pd.merge(
    metadata,
    station_information.loc[:, ["station_id", "lat", "lon", "post_code", "capacity"]],
    how="left",
    on="station_id",
)

In [8]:
# Attach the weather flags by timestamp; a left join keeps metadata rows without a weather record.
metadata = pd.merge(
    metadata,
    meteo_data.loc[:, ["datetime", "calor", "lluvia"]],
    how="left",
    on="datetime",
)

In [9]:
# Missing weather flags (rows left unmatched by the left join) count as False.
metadata = metadata.assign(
    **{
        flag: metadata[flag].fillna(0).astype(bool)
        for flag in ("calor", "lluvia")
    }
)

## Train

In [10]:
# Preview the first rows of the raw observations to check the available columns.
prediction_raw_data.head()

Unnamed: 0,station_id,lat,lon,altitude,post_code,year,month,day,hour,num_docks_available,capacity,percentage_docks_available,datetime,date,is_holidays,calor,lluvia
0,1,41.397978,2.180107,16.0,8013,2020,1,1,0,20.666667,45,0.459259,2020-01-01,2020-01-01,True,False,False
1,1,41.397978,2.180107,16.0,8013,2020,1,2,0,17.416667,45,0.387037,2020-01-02,2020-01-02,False,False,False
2,1,41.397978,2.180107,16.0,8013,2020,1,3,0,3.583333,45,0.07963,2020-01-03,2020-01-03,False,False,False
3,1,41.397978,2.180107,16.0,8013,2020,1,4,0,33.230769,45,0.738462,2020-01-04,2020-01-04,False,False,False
4,1,41.397978,2.180107,16.0,8013,2020,1,5,0,27.25,45,0.605556,2020-01-05,2020-01-05,False,False,False


In [None]:
from tqdm import tqdm

# Build the supervised samples one station at a time:
#   1. order the station's rows chronologically,
#   2. add ctx-1 .. ctx-4 = target value of the 1..4 preceding rows,
#   3. keep every 5th row starting at position 4, so each kept row owns the four
#      rows immediately before it as context and no row is reused.
# NOTE(review): the lags are row-based (shift), not time-based; this assumes each
# station's series has no missing hours - confirm against the raw data.
#
# The per-station pieces are collected in a list and concatenated once at the end.
# Concatenating inside the loop re-copies the accumulated frame on every iteration
# (quadratic cost); grouping once also avoids a full boolean mask per station.
station_frames = []
stations = prediction_raw_data.groupby("station_id", sort=False)  # first-appearance order
for s, ctx in tqdm(stations, total=stations.ngroups):
    # sort_values returns a new frame, so the column assignments below never
    # write into a view of prediction_raw_data.
    ctx = ctx.sort_values(by=["year", "month", "day", "hour"], ignore_index=True)
    for lag in range(1, 5):
        ctx[f"ctx-{lag}"] = ctx["percentage_docks_available"].shift(lag)

    station_frames.append(ctx.iloc[4::5])

# pd.concat raises on an empty list; fall back to an empty frame in that case.
prediction_data = (
    pd.concat(station_frames, ignore_index=True) if station_frames else pd.DataFrame()
)

 89%|██████████████████████████████████████████████████████████████████████▉         | 354/399 [04:09<00:34,  1.32it/s]

In [None]:
# Remove the columns that are not used as model features.
prediction_data = prediction_data.drop(
    columns=["datetime", "date", "altitude", "num_docks_available"]
)

In [None]:
# Identifier-like columns are cast to str; they are one-hot encoded further below.
prediction_data = prediction_data.astype({"station_id": str, "post_code": str})

In [None]:
# Rows without a target value cannot be used for training or validation.
prediction_data = prediction_data.dropna(subset=["percentage_docks_available"])

In [None]:
# Drop a row only when all four lag features are missing (how="all");
# rows with just some of the lags missing are kept.
prediction_data = prediction_data.dropna(
    subset=["ctx-1", "ctx-2", "ctx-3", "ctx-4"],
    how="all",
)

In [None]:
# Chronological split with the cut-off at August 2023:
#   * validation = August 2023 onwards within 2023
#   * train      = everything strictly before August 2023
# The training mask must be "year < 2023 OR (year == 2023 AND month < 8)". Filtering on
# "month < 8" alone would also throw away August-December of every earlier year.
in_2023 = prediction_data["year"] == 2023
from_august = prediction_data["month"] >= 8

train = prediction_data[(prediction_data["year"] < 2023) | (in_2023 & ~from_august)]
validation = prediction_data[in_2023 & from_august]

In [None]:
# Select from metadata every training column except the target; this raises a KeyError
# if metadata is missing any feature column the model will be trained on.
metadata[train.columns.drop('percentage_docks_available').tolist()]

In [None]:
# Separate the target from the features; the targets are kept as one-column DataFrames.
y_train = train.loc[:, ["percentage_docks_available"]]
x_train = train.drop("percentage_docks_available", axis=1)

y_validation = validation.loc[:, ["percentage_docks_available"]]
x_validation = validation.drop("percentage_docks_available", axis=1)

In [None]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Replace every column by its sine/cosine projection onto a circle.

    Each input column ``c`` is removed and two columns are appended in its place,
    ``c_sin = sin(2*pi*c / max_val)`` and ``c_cos = cos(2*pi*c / max_val)``.

    Parameters
    ----------
    max_val : number
        Period of the encoded quantity (the divisor inside the sine/cosine).
    """

    def __init__(self, max_val):
        self.max_val = max_val

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the encoder itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose columns are replaced by their sin/cos pairs.

        ``X`` must expose ``.columns`` (a DataFrame); the input is not modified.
        """
        encoded = X.copy()
        source_columns = list(encoded.columns)
        for name in source_columns:
            angle = 2 * np.pi * encoded[name] / self.max_val
            encoded[name + "_sin"] = np.sin(angle)
            encoded[name + "_cos"] = np.cos(angle)
        # The raw columns are dropped once at the end, leaving only the encoded pairs.
        return encoded.drop(columns=source_columns)


In [None]:
# Feature preprocessing:
#   * station_id / post_code -> dense one-hot columns (unseen categories become all zeros)
#   * hour                   -> sin/cos pair with period 24
#   * day                    -> sin/cos pair with period 31
#   * every other column is passed through unchanged.
# The transformer names are kept exactly as they are (spelling included), since they
# are identifiers of the fitted object.
transformer = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        (
            'onehot',
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            ["station_id", "post_code"],
        ),
        ('hour_econder', CyclicalEncoder(max_val=24), ["hour"]),
        ('day_econder', CyclicalEncoder(max_val=31), ["day"]),
    ],
)

In [None]:
# Fit the preprocessing on the training features only, then apply the fitted
# transformer to the validation features.
# NOTE: both names are rebound to the transformed output, so this cell must not be
# re-run without first re-running the cell that builds x_train / x_validation.
x_train = transformer.fit_transform(x_train)
x_validation = transformer.transform(x_validation)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Define the model. The seed is set explicitly because the model subsamples rows
# (subsample) and columns (colsample_bytree).
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=7,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
    random_state=0,
)

# Train the model
model.fit(x_train, y_train)

# Make predictions
y_pred = model.predict(x_validation)

# Evaluate the model. The reported value is the ROOT mean squared error. It is computed
# as sqrt(MSE) instead of passing squared=False, because that argument is deprecated in
# scikit-learn 1.4 and removed in 1.6. The variable keeps the name `mse` because the
# next cell displays it under that name.
mse = np.sqrt(mean_squared_error(y_validation, y_pred))

In [None]:
# Validation error. Despite the name, this is the root mean squared error (RMSE).
mse