In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
prediction_raw_data = pd.read_parquet("../data/prediction_raw_data.parquet")

In [3]:
def capacity_category(df):
    df = df.copy()
    conditions = [
        (df['capacity'] < 20),
        (df['capacity'] >=20) & (df['capacity'] < 25),
        (df['capacity'] >= 25) & (df['capacity'] < 30),
        (df['capacity'] >= 30) & (df['capacity'] < 40),
        (df['capacity'] >= 40)]

    choices = ['<20', '20-25', '25-30', '30-40', '>40']


    df['capacity_category'] = np.select(conditions, choices)
    
    return df
    

In [4]:
prediction_raw_data = capacity_category(prediction_raw_data)

In [5]:
metadata = pd.read_csv("../data/metadata_sample_submission.csv", index_col=[0])
bank_holidays_bcn = pd.read_csv("../data/bank_holidays_bcn.csv", index_col=[0], parse_dates=["holiday_date"])
station_information = pd.read_csv("../data/station_information.csv")
meteo_data = pd.read_csv("../data/valores_booleanos_meteo.csv")

In [6]:
meteo_data['datetime'] = pd.to_datetime(meteo_data['data']).dt.strftime('%Y-%m-%d %H:%M:%S')
meteo_data['datetime'] = pd.to_datetime(meteo_data['datetime'])
meteo_data.drop(columns="data", inplace=True)

In [7]:
# create year column
metadata["year"]=2024
# transform station_id to int
metadata["station_id"]=metadata["station_id"].astype(str)

metadata["date"]= pd.to_datetime(metadata[["year","month","day"]])
metadata["datetime"]= pd.to_datetime(metadata[["year","month","day","hour"]])

In [8]:
metadata["is_holidays"] = metadata["date"].isin(bank_holidays_bcn["holiday_date"])

In [9]:
station_information["station_id"] = station_information["station_id"].astype(str)
metadata =metadata.merge(station_information[["station_id","lat","lon","post_code","capacity"]], on ="station_id", how="left")

In [10]:
metadata = metadata.merge(meteo_data[["datetime","calor","lluvia","dia"]], on="datetime", how="left")

In [11]:
metadata["calor"] = metadata["calor"].fillna(0).astype(bool)
metadata["lluvia"] = metadata["lluvia"].fillna(0).astype(bool)
metadata["dia"] = metadata["dia"].fillna(0).astype(bool)

In [12]:
metadata = capacity_category(metadata)

## Train

In [13]:
from tqdm import tqdm
prediction_data = pd.DataFrame()
for s in tqdm(prediction_raw_data.station_id.unique()):
    ctx = prediction_raw_data.loc[prediction_raw_data["station_id"] == s, :]
    ctx = ctx.sort_values(by=["year", "month", "day", "hour"],
                              ignore_index=True)
    for lag in range(1, 5):
        ctx.loc[:, f"ctx-{lag}"] = ctx.loc[:, "percentage_docks_available"].shift(lag)

    ctx = ctx.iloc[4::5]

    prediction_data = pd.concat([prediction_data, ctx], ignore_index=True)

100%|████████████████████████████████████████████████████████████████████████████████| 399/399 [11:49<00:00,  1.78s/it]


In [14]:
prediction_data.drop(columns=["datetime","date","altitude", "num_docks_available", "capacity"], inplace=True)

In [15]:
prediction_data["station_id"]= prediction_data["station_id"].astype(str)
prediction_data["post_code"]= prediction_data["post_code"].astype(str)

In [16]:
prediction_data.dropna(subset=["percentage_docks_available"], inplace=True)

In [17]:
prediction_data.dropna(subset=["ctx-1","ctx-2", "ctx-3", "ctx-4"],how="all", inplace=True)

In [18]:
train = prediction_data[(prediction_data["year"]<=2023) & (prediction_data["month"]<8)]
validation= prediction_data[(prediction_data["year"]==2023) & (prediction_data["month"]>=8)]

In [19]:
x_train = train.drop(columns="percentage_docks_available")
y_train = train[["percentage_docks_available"]]


x_validation = validation.drop(columns="percentage_docks_available")
y_validation = validation[["percentage_docks_available"]]


metadata=metadata[train.columns.drop(["percentage_docks_available"]).tolist()]
metadata["post_code"]= metadata["post_code"].astype(str)

In [20]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, max_val):
        self.max_val = max_val
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        for column in X.columns:
            X[column + "_sin"] = np.sin(2 * np.pi * X[column] / self.max_val)
            X[column + "_cos"] = np.cos(2 * np.pi * X[column] / self.max_val)
            X.drop(columns=column, inplace=True)
        return X


In [21]:
transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["station_id","post_code","capacity_category"]),
        ('hour_econder', CyclicalEncoder(max_val=24), ["hour"]),
        ('day_econder', CyclicalEncoder(max_val=31), ["day"]),
    ],
    remainder="passthrough"
)

In [22]:
x_train_ = transformer.fit_transform(x_train)
x_validation_ = transformer.transform(x_validation)
metadata_ = transformer.transform(metadata)

In [23]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Define the model
model = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators=100, max_depth=7, eta=0.1, subsample=0.7)

# Train the model
model.fit(x_train_, y_train)

# Make predictions
y_pred = model.predict(x_validation_)

# Evaluate the model
mse = mean_squared_error(y_validation, y_pred, squared=False)

In [24]:
mse

0.09572475326577472

# PREDICTION

In [25]:
y_pred = model.predict(metadata_)

In [26]:
submission= pd.DataFrame(y_pred,columns=["percentage_docks_available"])

In [27]:
submission["index"] = metadata.index

In [29]:
submission[["index","percentage_docks_available"]].to_csv("submission_quim_2.csv", index=False)