In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Load the filtered data
data_path = os.path.join('..', 'data', 'citibike_columbia_model_ready.csv')
df = pd.read_csv(data_path)

df.head(5)

Unnamed: 0,station,hour,outflow,inflow,hour_of_day,year,month,weekday,is_weekend,is_holiday,inflow_lag_24,outflow_lag_24,inflow_lag_168,outflow_lag_168,station_id,temperature_2m,wind_speed_10m,precipitation,snowfall
0,Amsterdam Ave & W 119 St,2024-01-08 00:00:00,0.0,0.0,0,2024,1,0,0,0,1.0,0.0,0.0,0.0,0,0.031,10.383987,0.0,0.0
1,Amsterdam Ave & W 119 St,2024-01-08 01:00:00,0.0,0.0,1,2024,1,0,0,0,0.0,0.0,0.0,0.0,0,-0.069,13.089354,0.0,0.0
2,Amsterdam Ave & W 119 St,2024-01-08 02:00:00,0.0,0.0,2,2024,1,0,0,0,0.0,0.0,0.0,0.0,0,0.331,18.089775,0.0,0.0
3,Amsterdam Ave & W 119 St,2024-01-08 03:00:00,0.0,0.0,3,2024,1,0,0,0,0.0,0.0,0.0,0.0,0,0.231,17.57744,0.0,0.0
4,Amsterdam Ave & W 119 St,2024-01-08 04:00:00,0.0,0.0,4,2024,1,0,0,0,0.0,0.0,0.0,0.0,0,-0.219,14.4,0.0,0.0


In [3]:
df.shape

(111384, 19)

In [4]:
df["station_hour_mean_inflow"] = df.groupby(
    ["station_id", "hour_of_day"]
)["inflow"].transform("mean")

df["station_hour_mean_outflow"] = df.groupby(
    ["station_id", "hour_of_day"]
)["outflow"].transform("mean")

We dont want to shuffle timeseries so we will do manual split on a cutoff date. Our data start at 2024-01-08 and end at 2025-10-31. 

In [5]:
train_df = df[df["hour"] < "2025-07-01"]
test_df  = df[df["hour"] >= "2025-07-01"]

#Almost 80-20
len(train_df), len(test_df)

(90720, 20664)

In [6]:
features = [
    "station_hour_mean_inflow",
    "station_id",
    "hour_of_day",
    "weekday",
    "month",
    "is_weekend",
    "is_holiday",
    "inflow_lag_24",
    "inflow_lag_168",
    "temperature_2m",
    "wind_speed_10m",
    "precipitation",
    "snowfall"
]

target_1 = "inflow"

Simple baseline Linear Regression using sk-learn

In [7]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(train_df[features], train_df[target_1])

test_df.loc[:,"pred_inflow"] = reg.predict(test_df[features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.loc[:,"pred_inflow"] = reg.predict(test_df[features])


In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_true = test_df[target_1]
y_pred = test_df["pred_inflow"]

rmse_linear = np.sqrt(mean_squared_error(y_true, y_pred))
mae_linear  = mean_absolute_error(y_true, y_pred)
r2_linear   = r2_score(y_true, y_pred)

print("RMSE:", rmse_linear)
print("MAE:", mae_linear)
print("R²:", r2_linear)


RMSE: 2.532550762535471
MAE: 1.7324296547681135
R²: 0.5150185188383354


We are off by approximately 2.6 bikes for inflow, not good enough

We will try XGBoost

In [9]:
from xgboost import XGBRegressor

model = XGBRegressor(
    n_estimators=600,
    learning_rate=0.01,
    max_depth=8,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    tree_method="hist"
)

In [10]:
model.fit(train_df[features], train_df[target_1])
test_df.loc[:, "pred_inflow"] = model.predict(test_df[features])

In [11]:
#Lets evaluate
y_true = test_df[target_1]
y_pred = test_df["pred_inflow"]

rmse_xgboost = np.sqrt(mean_squared_error(y_true, y_pred))
mae_xgbost = mean_absolute_error(y_true, y_pred)
r2_xgboost = r2_score(y_true, y_pred)

print("RMSE:", rmse_xgboost)
print("MAE:", mae_xgbost)
print("R²:", r2_xgboost)

RMSE: 2.405147914525926
MAE: 1.6194417763711115
R²: 0.5625862631423375


Lets try LightGBM

In [12]:
from lightgbm import LGBMRegressor

target_1 = "inflow"   

lgbm = LGBMRegressor(
    n_estimators=650,
    learning_rate=0.01,
    max_depth=-1,         # let LGBM choose depth
    num_leaves=45,        # complexity control
    subsample=0.9,        # same as subsample
    colsample_bytree=0.9, # same as colsample
    random_state=42,
    objective="regression"
)

lgbm.fit(
    train_df[features],
    train_df[target_1],
    eval_set=[(test_df[features], test_df[target_1])]
)


[WinError 2] The system cannot find the file specified
  File "c:\Users\Αφροδίτη Φραγκιαδάκη\da-project-citibike\.venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
  File "c:\Users\Αφροδίτη Φραγκιαδάκη\da-project-citibike\.venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
  File "C:\Python_3.10\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Python_3.10\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Python_3.10\lib\subprocess.py", line 1440, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003753 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 898
[LightGBM] [Info] Number of data points in the train set: 90720, number of used features: 13
[LightGBM] [Info] Start training from score 2.308918


0,1,2
,boosting_type,'gbdt'
,num_leaves,45
,max_depth,-1
,learning_rate,0.01
,n_estimators,650
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [13]:
test_df.loc[:,"pred_inflow"] = lgbm.predict(test_df[features])

In [14]:
y_true = test_df[target_1]
y_pred = test_df["pred_inflow"]

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae  = mean_absolute_error(y_true, y_pred)
r2   = r2_score(y_true, y_pred)

print("LightGBM RMSE:", rmse)
print("LightGBM MAE:", mae)
print("LightGBM R²:", r2)


LightGBM RMSE: 2.416682104032082
LightGBM MAE: 1.6277854251875818
LightGBM R²: 0.5583808582441658


Creating a deep-learning architecture

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = train_df[features]
X_test = test_df[features]

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

y_train = train_df[target_1]
y_test = test_df[target_1]

In [None]:
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense, Dropout # type: ignore

mlp = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])

mlp.compile(optimizer='adam', loss='mse')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
history = mlp.fit(
    X_train_scaled, y_train,
    validation_split=0.1,
    epochs=30,
    batch_size=256,
    verbose=1
)

Epoch 1/30
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 4.8780 - val_loss: 4.4715
Epoch 2/30
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 4.2981 - val_loss: 4.4466
Epoch 3/30
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 4.2290 - val_loss: 4.3982
Epoch 4/30
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 4.1872 - val_loss: 4.4306
Epoch 5/30
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 4.1387 - val_loss: 4.3772
Epoch 6/30
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 4.1269 - val_loss: 4.3292
Epoch 7/30
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 4.0950 - val_loss: 4.3550
Epoch 8/30
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 4.0808 - val_loss: 4.3401
Epoch 9/30
[1m319/319[0m [32m━━━━━━━━

In [19]:
y_pred_mlp = mlp.predict(X_test_scaled).flatten()

rmse = np.sqrt(mean_squared_error(y_test, y_pred_mlp))
mae  = mean_absolute_error(y_test, y_pred_mlp)
r2   = r2_score(y_test, y_pred_mlp)

print("MLP RMSE:", rmse)
print("MLP MAE:", mae)
print("MLP R²:", r2)


[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
MLP RMSE: 2.426704872710137
MLP MAE: 1.6218497724500545
MLP R²: 0.5547101851027043
