In [18]:
import numpy as np
import pandas as pd
import plotnine as gg
import xgboost as xgb

from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Load the preprocessed dataset and convert the "timestamp" column from its
# CSV text form into pandas datetimes in a single chained expression.
DATA_PATH = "../data/bicikelj_preprocessed.csv"

df = pd.read_csv(DATA_PATH).assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)

In [3]:
# List every column of the loaded frame before any feature selection.
df.columns

Index(['Unnamed: 0', 'timestamp', 'station', 'target', 'time_of_day_sin',
       'time_of_day_cos', 'temperature', 'precipitation', 'snow_depth',
       'cloud_cover', 'is_day', 'praznik', 'poletne_pocitnice',
       'zimske_pocitnice', 'yesterday', 'day_of_week', 'daily_lag_1', 'lag_1',
       'lag_2', 'lag_3', 'lag_4', 'lag_5', 'number', 'contract_name', 'name',
       'address', 'banking', 'bonus', 'bike_stands', 'available_bike_stands',
       'available_bikes', 'status', 'last_update', 'position_lat',
       'position_lon', 'distance_to_center', 'is_weekend'],
      dtype='object')

In [4]:
# Disabled feature: boolean flag for any precipitation.
# df["is_rain"] = df["precipitation"] > 0

# One-hot encode the station column: cast it to a categorical dtype, then
# replace it with one indicator column per station ("station_<name>").
stations_categorical = df.astype({"station": "category"})
df = pd.get_dummies(stations_categorical, columns=["station"])

In [5]:
# Chronological hold-out: rows stamped within the last seven days (measured
# back from the newest timestamp in the data) form the test set; rows at or
# before the cutoff form the training set.
latest_timestamp = df["timestamp"].max()
cutoff_time = latest_timestamp - pd.Timedelta(days=7)

train_df = df.loc[df["timestamp"].le(cutoff_time)]
test_df = df.loc[df["timestamp"].gt(cutoff_time)]

In [6]:
# Columns removed from both splits before modelling.
columns = [
    "Unnamed: 0",
    "zimske_pocitnice",
    "number",
    "contract_name",
    "name",
    "address",
    "banking",
    "bonus",
    "available_bike_stands",
    "available_bikes",
    "status",
    "last_update",
    "position_lat",
    "position_lon",
    "yesterday",
]

# Apply the same drop to the training and the test frame.
train_df, test_df = (
    split.drop(columns=columns) for split in (train_df, test_df)
)

In [7]:
# Separate the predictors from the label. "target" is the value to predict and
# "timestamp" is kept out of the feature matrix.
non_feature_columns = ["target", "timestamp"]

X_train = train_df.drop(columns=non_feature_columns)
y_train = train_df["target"]

X_test = test_df.drop(columns=non_feature_columns)
y_test = test_df["target"]

In [8]:
# Fit a gradient-boosted regressor with the library's default hyperparameters
# on the training split, then predict the target for the held-out split.
model = xgb.XGBRegressor().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [9]:
# Test-set error. The first printed value is the mean squared error and the
# second is the mean absolute error.
for metric in (mean_squared_error, mean_absolute_error):
    print(metric(y_true=y_test, y_pred=y_pred))

0.5957900004063623
0.2920129726337228


In [16]:
# Pair every feature name with the importance score reported by the fitted
# model, then display the table with the highest-scoring features first.
feature_importance = pd.DataFrame(
    {
        "feature": X_train.columns.tolist(),
        "importance": model.feature_importances_,
    }
)
feature_importance.sort_values(by="importance", ascending=False)

Unnamed: 0,feature,importance
11,lag_1,0.967418
67,station_POVŠETOVA-GRABLOVIČEVA,0.001548
64,station_POLJANSKA-POTOČNIKOVA,0.001475
77,station_SOSESKA NOVO BRDO,0.001307
87,station_TRŽAŠKA C.-ILIRIJA,0.001186
...,...,...
69,station_PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE,0.000000
32,station_CESTA NA ROŽNIK,0.000000
58,station_P + R DOLGI MOST,0.000000
18,is_weekend,0.000000


In [17]:
# Display the held-out frame (pandas renders a truncated preview of its rows
# and columns) to eyeball the features fed to the model.
test_df

Unnamed: 0,timestamp,target,time_of_day_sin,time_of_day_cos,temperature,precipitation,snow_depth,cloud_cover,is_day,praznik,...,station_VOJKOVA - GASILSKA BRIGADA,station_VOKA - SLOVENČEVA,station_ZALOG,station_ZALOŠKA C.-GRABLOVIČEVA C.,station_ČRNUČE,station_ŠMARTINSKI PARK,station_ŠPICA,station_ŠPORTNI CENTER STOŽICE,station_ŠTEPANJSKO NASELJE 1-JAKČEVA ULICA,station_ŽIVALSKI VRT
5809,2022-09-25 02:38:00,0,0.636078,0.771625,14.9,0.0,0.0,31.0,0.0,0,...,False,False,False,False,False,False,False,False,False,False
5810,2022-09-25 02:44:00,0,0.656059,0.754710,14.9,0.0,0.0,31.0,0.0,0,...,False,False,False,False,False,False,False,False,False,False
5811,2022-09-25 02:48:00,0,0.669131,0.743145,14.9,0.0,0.0,31.0,0.0,0,...,False,False,False,False,False,False,False,False,False,False
5812,2022-09-25 02:55:00,0,0.691513,0.722364,14.9,0.0,0.0,31.0,0.0,0,...,False,False,False,False,False,False,False,False,False,False
5813,2022-09-25 02:58:00,0,0.700909,0.713250,14.9,0.0,0.0,31.0,0.0,0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558502,2022-10-01 18:03:00,12,-0.999914,0.013090,14.9,0.0,0.0,31.0,0.0,0,...,False,False,False,False,False,False,False,False,False,True
558503,2022-10-01 18:07:00,12,-0.999534,0.030539,14.9,0.0,0.0,31.0,0.0,0,...,False,False,False,False,False,False,False,False,False,True
558504,2022-10-01 18:13:00,12,-0.998392,0.056693,14.9,0.0,0.0,31.0,0.0,0,...,False,False,False,False,False,False,False,False,False,True
558505,2022-10-01 18:16:00,12,-0.997564,0.069756,14.9,0.0,0.0,31.0,0.0,0,...,False,False,False,False,False,False,False,False,False,True


In [None]:
# TODO: unfinished plotting cell — build a plotnine figure from the test split,
# starting from: gg.ggplot(test_df)