In [13]:
import os
import config as config
from pathlib import Path
import hopsworks
from dotenv import load_dotenv

load_dotenv()
project = hopsworks.login(
    project=os.getenv("HOPSWORKS_PROJECT_NAME"),
    api_key_value=os.getenv("HOPSWORKS_API_KEY")
)
feature_store = project.get_feature_store()
feature_group=feature_store.get_or_create_feature_group(
    name=os.getenv("FEATURE_GROUP_NAME"),
    version=os.getenv("FEATURE_GROUP_VERSION"),
    description= "Time-series Data for Bike at six hour frequency",
    primary_key=["location_id","pickup_hour"],
    event_time="pickup_hour"
)

2025-05-10 21:19:41,240 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-10 21:19:41,257 INFO: Initializing external client
2025-05-10 21:19:41,258 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-10 21:19:41,895 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215649


In [14]:
from dotenv import load_dotenv

load_dotenv()
# Create a feature view if it doesn't already exist
try:
    feature_store.create_feature_view(
        name=os.getenv('FEATURE_VIEW_NAME'),
        version=os.getenv('FEATURE_VIEW_VERSION'),
        query=feature_group.select_all(),
    )
    print(f"Feature view '{os.getenv('FEATURE_VIEW_NAME')}' (version {os.getenv('FEATURE_VIEW_VERSION')}) created successfully.")
except Exception as e:
    print(f"Error creating feature view: {e}")

# Retrieve the feature view
try:
    feature_view = feature_store.get_feature_view(
        name=os.getenv('FEATURE_VIEW_NAME'),
        version=os.getenv('FEATURE_VIEW_VERSION'),
    )
    print(f"Feature view '{os.getenv('FEATURE_VIEW_NAME')}' (version {os.getenv('FEATURE_VIEW_VERSION')}) retrieved successfully.")
except Exception as e:
    print(f"Error retrieving feature view: {e}")

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1215649/fs/1203280/fv/time_series_six_hourly_feature_view_bike/version/1
Feature view 'time_series_six_hourly_feature_view_bike' (version 1) created successfully.
Feature view 'time_series_six_hourly_feature_view_bike' (version 1) retrieved successfully.


In [17]:
import joblib
def get_hopsworks_project() -> hopsworks.project.Project:
    return hopsworks.login(
        project=os.getenv('HOPSWORKS_PROJECT_NAME'), api_key_value=os.getenv('HOPSWORKS_API_KEY')
    )
project = get_hopsworks_project()
model_registry = project.get_model_registry()

models = model_registry.get_models(name='Bike_demand_predictor_next_hour')
model = max(models, key=lambda model: model.version)
model_dir = model.download()
model = joblib.load(Path(model_dir) / "lightgbm_bikeride_model.joblib")

2025-05-10 21:21:45,406 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-10 21:21:45,418 INFO: Initializing external client
2025-05-10 21:21:45,418 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-10 21:21:46,090 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215649


Downloading: 100.000%|██████████| 279868/279868 elapsed<00:00 remaining<00:00


Downloading model artifact (0 dirs, 1 files)... DONE

In [20]:
# Check expected vs actual features
expected_features = model.feature_name_
actual_features = X_pred.columns.tolist()

missing = set(expected_features) - set(actual_features)
extra   = set(actual_features) - set(expected_features)

print("❌ Missing features:", missing)
print("🔁 Extra features:", extra)


❌ Missing features: {'location_id'}
🔁 Extra features: set()


In [None]:
from datetime import timedelta
import pandas as pd

# Step 1: Load feature view data from Hopsworks
ts_data, _ = feature_view.training_data(
    description="time_series_six_hourly_bike_ride"
)

# Step 2: Preprocess location_id
ts_data["location_id"] = ts_data["location_id"].astype(str).str.replace('.', '', regex=False)
ts_data["pickup_hour"] = pd.to_datetime(ts_data["pickup_hour"])

# Keep only the 3 exact stations
valid_ids = {"614005", "590514", "532903"}
ts_data = ts_data[ts_data["location_id"].isin(valid_ids)]

# Step 3: Setup for prediction
full_df = ts_data.copy()
predictions = []

# Define prediction timeline and cleaned location IDs
future_dates = pd.date_range("2025-01-01 00:00:00", "2025-12-31 18:00:00", freq="6H", tz="UTC")
location_ids = sorted(valid_ids)  # keep it ordered

# Step 4: LightGBM expects these exact features
reg_features = [f"target_lag_{i+1}" for i in range(112)] + ["hour", "day_of_week", "month", "is_weekend", "location_id"]

print("🔮 Generating predictions for 2025...")

# Step 5: Rolling prediction loop
for ts in future_dates:
    for loc in location_ids:
        # Get latest 112 lag entries for this station
        hist = full_df[full_df["location_id"] == loc].sort_values("pickup_hour").tail(112)
        if len(hist) < 112:
            continue

        # Create lag features
        feature_row = {
            f"target_lag_{i+1}": hist.iloc[-(i+1)]["target"] for i in range(112)
        }

        # Add time-based features
        feature_row["hour"] = ts.hour
        feature_row["day_of_week"] = ts.dayofweek
        feature_row["month"] = ts.month
        feature_row["is_weekend"] = int(ts.dayofweek in [5, 6])
        feature_row["pickup_hour"] = ts
        feature_row["location_id"] = loc

        # Prepare DataFrame for prediction
        X_pred = pd.DataFrame([feature_row])[reg_features]
        X_pred["location_id"] = X_pred["location_id"].astype(float)  # ensure numeric for LGBM

        # Predict
        pred = model.predict(X_pred)[0]

        # Store prediction
        predictions.append({
            "pickup_hour": ts,
            "location_id": loc,
            "predicted_rides": round(pred)
        })

        # Append predicted row to history for future lags
        full_df = pd.concat([
            full_df,
            pd.DataFrame([{
                **feature_row,
                "target": pred
            }])
        ], ignore_index=True)

print("✅ 2025 predictions complete.")

# Step 6: Save predictions
pred_df = pd.DataFrame(predictions)
pred_df.to_csv("bike_predictions_2025_6hr.csv", index=False)
print("📁 Saved as bike_predictions_2025_6hr.csv")


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.95s) 
2025-05-10 21:28:59,348 INFO: Provenance cached data - overwriting last accessed/created training dataset from 3 to 4.




🔮 Generating predictions for 2025...


TypeError: Cannot compare tz-naive and tz-aware timestamps

In [18]:
from datetime import timedelta
import pandas as pd
ts_data, _ = feature_view.training_data(
    description="time_series_six_hourly_bike_ride"
)
# Use last known data as seed for predictions
full_df = ts_data.copy()
predictions = []

# 6-hour intervals in 2025
future_dates = pd.date_range("2025-01-01 00:00:00", "2025-12-31 18:00:00", freq="6H")
location_ids = full_df["location_id"].unique()

print("🔮 Generating predictions for 2025...")

for ts in future_dates:
    for loc in location_ids:
        # Get the most recent lags for this station
        hist = full_df[full_df["location_id"] == loc].sort_values("pickup_hour").tail(112)

        if len(hist) < 112:
            continue

        # Build feature row
        feature_row = {
            f"target_lag_{i+1}": hist.iloc[-(i+1)]["target"] for i in range(112)
        }
        feature_row["hour"] = ts.hour
        feature_row["day_of_week"] = ts.dayofweek
        feature_row["month"] = ts.month
        feature_row["is_weekend"] = int(ts.dayofweek in [5, 6])
        feature_row["pickup_hour"] = ts
        feature_row["location_id"] = loc

        # Define the feature list exactly as used during training
        reg_features = [f"target_lag_{i+1}" for i in range(112)] + ["hour", "day_of_week", "month", "is_weekend"]

        # Create DataFrame and predict
        X_pred = pd.DataFrame([feature_row])[reg_features]
        pred = model.predict(X_pred)[0]

        predictions.append({
            "pickup_hour": ts,
            "location_id": loc,
            "predicted_rides": round(pred)
        })

        # Append prediction as if it were real to enable rolling lags
        full_df = pd.concat([
            full_df,
            pd.DataFrame([{
                **feature_row,
                "target": pred
            }])
        ], ignore_index=True)

print("✅ 2025 predictions complete.")

# Convert to DataFrame and save
pred_df = pd.DataFrame(predictions)
pred_df.to_csv("bike_predictions_2025_6hr.csv", index=False)
print("📁 Saved as bike_predictions_2025_6hr.csv")


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.49s) 
2025-05-10 21:22:11,685 INFO: Provenance cached data - overwriting last accessed/created training dataset from 2 to 3.




🔮 Generating predictions for 2025...


LightGBMError: The number of features in data (116) is not the same as it was in training data (117).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.

In [None]:
ts_data.sort_values(["location_id", "pickup_hour"]).reset_index(drop=True)

Unnamed: 0,pickup_hour,location_id,target,target_lag_1,target_lag_2,target_lag_3,target_lag_4,target_lag_5,target_lag_6,target_lag_7,...,target_lag_107,target_lag_108,target_lag_109,target_lag_110,target_lag_111,target_lag_112,hour,day_of_week,month,is_weekend
