In [2]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [7]:
prediction_raw_data = pd.read_parquet("../data/prediction_raw_data.parquet")

In [14]:
prediction_raw_data

Unnamed: 0,station_id,lat,lon,altitude,post_code,year,month,day,hour,num_docks_available,capacity,percentage_docks_available,datetime,date,is_holidays,calor,lluvia,dia
0,1,41.397978,2.180107,16.0,8013,2020,1,1,0,20.666667,45,0.459259,2020-01-01 00:00:00,2020-01-01,True,False,False,False
1,1,41.397978,2.180107,16.0,8013,2020,1,2,0,17.416667,45,0.387037,2020-01-02 00:00:00,2020-01-02,False,False,False,False
2,1,41.397978,2.180107,16.0,8013,2020,1,3,0,3.583333,45,0.079630,2020-01-03 00:00:00,2020-01-03,False,False,False,False
3,1,41.397978,2.180107,16.0,8013,2020,1,4,0,33.230769,45,0.738462,2020-01-04 00:00:00,2020-01-04,False,False,False,False
4,1,41.397978,2.180107,16.0,8013,2020,1,5,0,27.250000,45,0.605556,2020-01-05 00:00:00,2020-01-05,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13985828,496,41.404862,2.174799,33.0,8025,2020,4,12,13,,18,,2020-04-12 13:00:00,2020-04-12,False,False,False,True
13985829,496,41.404862,2.174799,33.0,8025,2023,5,29,1,,18,,2023-05-29 01:00:00,2023-05-29,False,False,False,False
13985830,496,41.404862,2.174799,33.0,8025,2023,10,20,5,,18,,2023-10-20 05:00:00,2023-10-20,False,False,False,False
13985831,496,41.404862,2.174799,33.0,8025,2020,8,8,18,,18,,2020-08-08 18:00:00,2020-08-08,False,False,False,True


In [17]:
# Define conditions and choices
conditions = [
    (prediction_raw_data['capacity'] < 20),
    (prediction_raw_data['capacity'] >=20) & (prediction_raw_data['capacity'] < 25),
    (prediction_raw_data['capacity'] >= 25) & (prediction_raw_data['capacity'] < 30),
    (prediction_raw_data['capacity'] >= 30) & (prediction_raw_data['capacity'] < 40),
    (prediction_raw_data['capacity'] >= 40)
]

choices = ['<20', '20-25', '25-30', '30-40', '>40']

# Create the categorical column
prediction_raw_data['capacity_category'] = np.select(conditions, choices)

In [20]:
metadata = pd.read_csv("../data/metadata_sample_submission.csv", index_col=[0])
bank_holidays_bcn = pd.read_csv("../data/bank_holidays_bcn.csv", index_col=[0], parse_dates=["holiday_date"])
station_information = pd.read_csv("../data/station_information.csv")
meteo_data = pd.read_csv("../data/valores_booleanos_meteo.csv")

In [21]:
meteo_data['datetime'] = pd.to_datetime(meteo_data['data']).dt.strftime('%Y-%m-%d %H:%M:%S')
meteo_data['datetime'] = pd.to_datetime(meteo_data['datetime'])
meteo_data.drop(columns="data", inplace=True)

In [22]:
# create year column
metadata["year"]=2024
# transform station_id to int
metadata["station_id"]=metadata["station_id"].astype(str)

metadata["date"]= pd.to_datetime(metadata[["year","month","day"]])
metadata["datetime"]= pd.to_datetime(metadata[["year","month","day","hour"]])

In [23]:
metadata["is_holidays"] = metadata["date"].isin(bank_holidays_bcn["holiday_date"])

In [24]:
station_information["station_id"] = station_information["station_id"].astype(str)
metadata =metadata.merge(station_information[["station_id","lat","lon","post_code","capacity"]], on ="station_id", how="left")

In [25]:
metadata = metadata.merge(meteo_data[["datetime","calor","lluvia","dia"]], on="datetime", how="left")

In [28]:
metadata["calor"] = metadata["calor"].fillna(0).astype(bool)
metadata["lluvia"] = metadata["lluvia"].fillna(0).astype(bool)
metadata["dia"] = metadata["dia"].fillna(0).astype(bool)

## Train

In [None]:
from tqdm import tqdm
prediction_data = pd.DataFrame()
for s in tqdm(prediction_raw_data.station_id.unique()):
    ctx = prediction_raw_data.loc[prediction_raw_data["station_id"] == s, :]
    ctx = ctx.sort_values(by=["year", "month", "day", "hour"],
                              ignore_index=True)
    for lag in range(1, 5):
        ctx.loc[:, f"ctx-{lag}"] = ctx.loc[:, "percentage_docks_available"].shift(lag)

    ctx = ctx.iloc[4::5]

    prediction_data = pd.concat([prediction_data, ctx], ignore_index=True)

 34%|███████████████████████████▎                                                    | 136/399 [00:58<02:04,  2.12it/s]

In [41]:
prediction_data.drop(columns=["datetime","date","altitude", "num_docks_available"], inplace=True)

In [42]:
prediction_data["station_id"]= prediction_data["station_id"].astype(str)
prediction_data["post_code"]= prediction_data["post_code"].astype(str)

In [43]:
prediction_data.dropna(subset=["percentage_docks_available"], inplace=True)

In [44]:
prediction_data.dropna(subset=["ctx-1","ctx-2", "ctx-3", "ctx-4"],how="all", inplace=True)

In [45]:
train = prediction_data[(prediction_data["year"]<=2023) & (prediction_data["month"]<8)]
validation= prediction_data[(prediction_data["year"]==2023) & (prediction_data["month"]>=8)]

In [47]:
train

Unnamed: 0,station_id,lat,lon,post_code,year,month,day,hour,capacity,percentage_docks_available,is_holidays,calor,lluvia,ctx-1,ctx-2,ctx-3,ctx-4
0,1,41.397978,2.180107,8013,2020,1,1,4,45,0.364815,True,False,False,0.283333,0.346296,0.394444,0.459259
1,1,41.397978,2.180107,8013,2020,1,1,9,45,0.262963,True,False,False,0.248148,0.235185,0.233333,0.298148
2,1,41.397978,2.180107,8013,2020,1,1,14,45,0.335185,True,False,False,0.381481,0.385185,0.337037,0.342593
3,1,41.397978,2.180107,8013,2020,1,1,19,45,0.209259,True,False,False,0.079630,0.177778,0.211111,0.244444
4,1,41.397978,2.180107,8013,2020,1,2,0,45,0.387037,False,False,False,0.420370,0.450000,0.492593,0.418519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2796111,496,41.404862,2.174799,8025,2023,7,31,1,18,0.620370,False,False,False,0.555556,0.585859,0.606838,0.370370
2796112,496,41.404862,2.174799,8025,2023,7,31,6,18,0.652778,False,False,False,0.675926,0.657407,0.620370,0.611111
2796113,496,41.404862,2.174799,8025,2023,7,31,11,18,0.495370,False,False,False,0.509259,0.486111,0.513889,0.560185
2796114,496,41.404862,2.174799,8025,2023,7,31,16,18,0.500000,False,False,False,0.601852,0.560185,0.601010,0.615385


In [48]:
x_train = train.drop(columns="percentage_docks_available")
y_train = train[["percentage_docks_available"]]


x_validation = validation.drop(columns="percentage_docks_available")
y_validation = validation[["percentage_docks_available"]]


metadata=metadata[train.columns.drop(["percentage_docks_available").tolist()]
# metadata["post_code"]= metadata["post_code"].astype(str)

In [52]:
x_train

Unnamed: 0,station_id,lat,lon,year,month,day,hour,capacity,is_holidays,calor,lluvia,ctx-1,ctx-2,ctx-3,ctx-4
0,1,41.397978,2.180107,2020,1,1,4,45,True,False,False,0.283333,0.346296,0.394444,0.459259
1,1,41.397978,2.180107,2020,1,1,9,45,True,False,False,0.248148,0.235185,0.233333,0.298148
2,1,41.397978,2.180107,2020,1,1,14,45,True,False,False,0.381481,0.385185,0.337037,0.342593
3,1,41.397978,2.180107,2020,1,1,19,45,True,False,False,0.079630,0.177778,0.211111,0.244444
4,1,41.397978,2.180107,2020,1,2,0,45,False,False,False,0.420370,0.450000,0.492593,0.418519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2796111,496,41.404862,2.174799,2023,7,31,1,18,False,False,False,0.555556,0.585859,0.606838,0.370370
2796112,496,41.404862,2.174799,2023,7,31,6,18,False,False,False,0.675926,0.657407,0.620370,0.611111
2796113,496,41.404862,2.174799,2023,7,31,11,18,False,False,False,0.509259,0.486111,0.513889,0.560185
2796114,496,41.404862,2.174799,2023,7,31,16,18,False,False,False,0.601852,0.560185,0.601010,0.615385


In [49]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, max_val):
        self.max_val = max_val
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        for column in X.columns:
            X[column + "_sin"] = np.sin(2 * np.pi * X[column] / self.max_val)
            X[column + "_cos"] = np.cos(2 * np.pi * X[column] / self.max_val)
            X.drop(columns=column, inplace=True)
        return X


In [50]:
transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["station_id","post_code"]),
        ('hour_econder', CyclicalEncoder(max_val=24), ["hour"]),
        ('day_econder', CyclicalEncoder(max_val=31), ["day"]),
    ],
    remainder="passthrough"
)

In [51]:
x_train_ = transformer.fit_transform(x_train)
x_validation_ = transformer.transform(x_validation)
metadata_ = transformer.transform(metadata)

MemoryError: Unable to allocate 4.67 GiB for an array with shape (1510069, 415) and data type object

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Define the model
model = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators=100, max_depth=7, eta=0.1, subsample=0.7)

# Train the model
model.fit(x_train_, y_train)

# Make predictions
y_pred = model.predict(x_validation_)

# Evaluate the model
mse = mean_squared_error(y_validation, y_pred, squared=False)

In [None]:
mse

# PREDICTION

In [23]:
y_pred = model.predict(metadata_)

In [24]:
submission= pd.DataFrame(y_pred,columns=["percentage_docks_available"])

In [25]:
submission["index"] = metadata.index

In [27]:
# submission[["index","percentage_docks_available"]].to_csv("submission_quim_1.csv", index=False)