In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

Open the raw data ready to be used to train

In [2]:
prediction_raw_data = pd.read_parquet("../data/prediction_raw_data.parquet")

pl_cat_coords = (41.386949, 2.170080)

prediction_raw_data["pl_distance"] = prediction_raw_data.apply(
    lambda x: np.sqrt((x["lat"] - pl_cat_coords[0])**2 + (x["lon"] - pl_cat_coords[1])**2), axis=1
)

In [3]:
def capacity_category(df):
    df = df.copy()
    conditions = [
        (df['capacity'] < 20),
        (df['capacity'] >=20) & (df['capacity'] < 25),
        (df['capacity'] >= 25) & (df['capacity'] < 30),
        (df['capacity'] >= 30) & (df['capacity'] < 40),
        (df['capacity'] >= 40)]

    choices = ['<20', '20-25', '25-30', '30-40', '>40']


    df['capacity_category'] = np.select(conditions, choices)
    
    return df
    

In [4]:
prediction_raw_data = capacity_category(prediction_raw_data)

These csvs are to add the metadata to the stations/days of the test set

In [5]:
metadata = pd.read_csv("../data/metadata_sample_submission.csv", index_col=[0])
bank_holidays_bcn = pd.read_csv("../data/bank_holidays_bcn.csv", index_col=[0], parse_dates=["holiday_date"])
station_information = pd.read_csv("../data/station_information.csv")
meteo_data = pd.read_csv("../data/valores_booleanos_meteo.csv")

In [6]:
station_information["pl_distance"] = station_information.apply(
    lambda x: np.sqrt((x["lat"] - pl_cat_coords[0])**2 + (x["lon"] - pl_cat_coords[1])**2), axis=1
)

In [7]:
meteo_data['datetime'] = pd.to_datetime(meteo_data['data']).dt.strftime('%Y-%m-%d %H:%M:%S')
meteo_data['datetime'] = pd.to_datetime(meteo_data['datetime'])
meteo_data.drop(columns="data", inplace=True)

In [8]:
# create year column
metadata["year"]=2024
# transform station_id to int
metadata["station_id"]=metadata["station_id"].astype(str)

metadata["date"]= pd.to_datetime(metadata[["year","month","day"]])
metadata["datetime"]= pd.to_datetime(metadata[["year","month","day","hour"]])

In [9]:
metadata["is_holidays"] = metadata["date"].isin(bank_holidays_bcn["holiday_date"])

In [10]:
station_information["station_id"] = station_information["station_id"].astype(str)
metadata =metadata.merge(station_information[["station_id","pl_distance","altitude", "lat", "lon", "post_code","capacity"]], on ="station_id", how="left")

In [11]:
metadata = metadata.merge(meteo_data[["datetime","calor","lluvia","dia"]], on="datetime", how="left")

In [12]:
metadata["calor"] = metadata["calor"].fillna(0).astype(bool)
metadata["lluvia"] = metadata["lluvia"].fillna(0).astype(bool)
metadata["dia"] = metadata["dia"].fillna(0).astype(bool)

In [13]:
metadata = capacity_category(metadata)

## Train

In [14]:
from tqdm import tqdm
prediction_data = pd.DataFrame()
for s in tqdm(prediction_raw_data.station_id.unique()):
    ctx = prediction_raw_data.loc[prediction_raw_data["station_id"] == s, :]
    ctx = ctx.sort_values(by=["year", "month", "day", "hour"],
                              ignore_index=True)
    for lag in range(1, 5):
        ctx.loc[:, f"ctx-{lag}"] = ctx.loc[:, "percentage_docks_available"].shift(lag)

    ctx = ctx.iloc[4::5]

    prediction_data = pd.concat([prediction_data, ctx], ignore_index=True)

100%|██████████| 399/399 [03:48<00:00,  1.75it/s]


In [15]:
prediction_data.drop(columns=["datetime","date", "num_docks_available", "capacity"], inplace=True)

In [16]:
prediction_data["station_id"]= prediction_data["station_id"].astype(str)
prediction_data["post_code"]= prediction_data["post_code"].astype(str)

In [17]:
prediction_data.dropna(subset=["percentage_docks_available"], inplace=True)

In [18]:
prediction_data.dropna(subset=["ctx-1","ctx-2", "ctx-3", "ctx-4"],how="all", inplace=True)

In [19]:
train = prediction_data[(prediction_data["year"]<=2023) & (prediction_data["month"]<8)]
validation= prediction_data[(prediction_data["year"]==2023) & (prediction_data["month"]>=8)]

In [20]:
metadata

Unnamed: 0,station_id,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1,year,date,...,pl_distance,altitude,lat,lon,post_code,capacity,calor,lluvia,dia,capacity_category
0,1,1,1,5,0.781481,0.677778,0.696296,0.750000,2024,2024-01-01,...,0.014906,16.0,41.397978,2.180107,8013,45,False,False,False,>40
1,1,1,1,10,0.737374,0.711111,0.711111,0.731624,2024,2024-01-01,...,0.014906,16.0,41.397978,2.180107,8013,45,False,False,True,>40
2,1,1,1,15,0.827778,0.896296,0.901852,0.883333,2024,2024-01-01,...,0.014906,16.0,41.397978,2.180107,8013,45,False,False,True,>40
3,1,1,1,20,0.825926,0.874074,0.927778,0.918519,2024,2024-01-01,...,0.014906,16.0,41.397978,2.180107,8013,45,False,False,False,>40
4,2,1,1,3,0.592593,0.341954,0.275862,0.540230,2024,2024-01-01,...,0.011117,17.0,41.395488,2.177198,8013,29,False,False,False,25-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171897,496,3,31,1,0.828704,0.787037,0.777778,0.853535,2024,2024-03-31,...,0.018524,33.0,41.404862,2.174799,8025,18,False,False,False,<20
171898,496,3,31,6,0.930556,0.944444,0.935185,0.856481,2024,2024-03-31,...,0.018524,33.0,41.404862,2.174799,8025,18,False,False,True,<20
171899,496,3,31,11,0.912037,0.884259,0.518519,0.157407,2024,2024-03-31,...,0.018524,33.0,41.404862,2.174799,8025,18,False,False,True,<20
171900,496,3,31,16,0.245370,0.319444,0.277778,0.305556,2024,2024-03-31,...,0.018524,33.0,41.404862,2.174799,8025,18,False,False,True,<20


In [21]:
x_train = train.drop(columns="percentage_docks_available")
y_train = train[["percentage_docks_available"]]


x_validation = validation.drop(columns="percentage_docks_available")
y_validation = validation[["percentage_docks_available"]]

metadata=metadata[train.columns.drop(["percentage_docks_available"]).tolist()]
metadata["post_code"]= metadata["post_code"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata["post_code"]= metadata["post_code"].astype(str)


In [22]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, max_val):
        self.max_val = max_val
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        for column in X.columns:
            X[column + "_sin"] = np.sin(2 * np.pi * X[column] / self.max_val)
            X[column + "_cos"] = np.cos(2 * np.pi * X[column] / self.max_val)
            X.drop(columns=column, inplace=True)
        return X


In [23]:
x_train

Unnamed: 0,station_id,lat,lon,altitude,post_code,year,month,day,hour,is_holidays,calor,lluvia,dia,pl_distance,capacity_category,ctx-1,ctx-2,ctx-3,ctx-4
0,1,41.397978,2.180107,16.0,8013,2020,1,1,4,True,False,False,False,0.014906,>40,0.283333,0.346296,0.394444,0.459259
1,1,41.397978,2.180107,16.0,8013,2020,1,1,9,True,False,False,True,0.014906,>40,0.248148,0.235185,0.233333,0.298148
2,1,41.397978,2.180107,16.0,8013,2020,1,1,14,True,False,False,True,0.014906,>40,0.381481,0.385185,0.337037,0.342593
3,1,41.397978,2.180107,16.0,8013,2020,1,1,19,True,False,False,False,0.014906,>40,0.079630,0.177778,0.211111,0.244444
4,1,41.397978,2.180107,16.0,8013,2020,1,2,0,False,False,False,False,0.014906,>40,0.420370,0.450000,0.492593,0.418519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2796111,496,41.404862,2.174799,33.0,8025,2023,7,31,1,False,False,False,False,0.018524,<20,0.555556,0.585859,0.606838,0.370370
2796112,496,41.404862,2.174799,33.0,8025,2023,7,31,6,False,False,False,True,0.018524,<20,0.675926,0.657407,0.620370,0.611111
2796113,496,41.404862,2.174799,33.0,8025,2023,7,31,11,False,False,False,True,0.018524,<20,0.509259,0.486111,0.513889,0.560185
2796114,496,41.404862,2.174799,33.0,8025,2023,7,31,16,False,False,False,True,0.018524,<20,0.601852,0.560185,0.601010,0.615385


In [24]:
transformer = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), ["pl_distance", "altitude", "lat", "lon"]),
        ('onehot', OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["station_id","post_code","capacity_category"]),
        ('hour_econder', CyclicalEncoder(max_val=24), ["hour"]),
        ('day_econder', CyclicalEncoder(max_val=31), ["day"]),
    ],
    remainder="passthrough"
)

In [25]:
x_train_ = transformer.fit_transform(x_train)
x_validation_ = transformer.transform(x_validation)
metadata_ = transformer.transform(metadata)

In [26]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Define the model
model = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators=1200, max_depth=8, eta=0.1, subsample=0.7)

# Train the model
model.fit(x_train_, y_train)

# Make predictions
y_pred = model.predict(x_validation_)

# Evaluate the model
mse = mean_squared_error(y_validation, y_pred, squared=False)



In [27]:
mse

0.09441939904724544

# PREDICTION

In [30]:
y_pred = model.predict(metadata_)

In [31]:
submission= pd.DataFrame(y_pred,columns=["percentage_docks_available"])

In [32]:
submission["index"] = metadata.index

In [33]:
submission[["index","percentage_docks_available"]].to_csv("submission_quim_2.csv", index=False)