In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Show all output for a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.config as config

In [4]:
import hopsworks

project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME, api_key_value=config.HOPSWORKS_API_KEY
)

feature_store = project.get_feature_store()
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION
)

2025-05-11 08:51:58,626 INFO: Initializing external client
2025-05-11 08:51:58,627 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-11 08:51:59,434 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1231009


In [5]:
# Create a feature view if it doesn't already exist
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all(),
    )
    print(f"Feature view '{config.FEATURE_VIEW_NAME}' (version {config.FEATURE_VIEW_VERSION}) created successfully.")
except Exception as e:
    print(f"Error creating feature view: {e}")

# Retrieve the feature view
try:
    feature_view = feature_store.get_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
    )
    print(f"Feature view '{config.FEATURE_VIEW_NAME}' (version {config.FEATURE_VIEW_VERSION}) retrieved successfully.")
except Exception as e:
    print(f"Error retrieving feature view: {e}")

Error creating feature view: Metadata operation error: (url: https://c.app.hopsworks.ai/hopsworks-api/api/project/1231009/featurestores/1213541/featureview). Server response: 
HTTP code: 400, HTTP reason: Bad Request, body: b'{"errorCode":270179,"usrMsg":"Feature view: citi_bike_time_series_hourly_feature_view, version: 1","errorMsg":"The provided feature view name and version already exists"}', error code: 270179, error msg: The provided feature view name and version already exists, user msg: Feature view: citi_bike_time_series_hourly_feature_view, version: 1
Feature view 'citi_bike_time_series_hourly_feature_view' (version 1) retrieved successfully.


In [6]:
citi_bike_ts_data, _ = feature_view.training_data(description="Time series hourly citi bike rides")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.60s) 




In [7]:
citi_bike_ts_data = citi_bike_ts_data.sort_values(["start_station_id", "hour"]).reset_index(drop=True)

In [8]:
citi_bike_ts_data.head()

Unnamed: 0,hour,start_station_id,ride_count
0,2025-04-12 10:00:00+00:00,5329.03,12
1,2025-04-12 11:00:00+00:00,5329.03,22
2,2025-04-12 12:00:00+00:00,5329.03,18
3,2025-04-12 13:00:00+00:00,5329.03,18
4,2025-04-12 14:00:00+00:00,5329.03,13


In [9]:
citi_bike_ts_data["hour"].min()

'2024-01-01 00:00:00+00:00'

In [10]:
citi_bike_ts_data["hour"].max()

'2025-05-11 11:00:00+00:00'

In [11]:
citi_bike_ts_data.shape

(28446, 3)

In [12]:
citi_bike_ts_data['start_station_id'].value_counts()

start_station_id
5905.14    9482
6140.05    9482
6450.05    8784
5329.03     698
Name: count, dtype: int64

In [13]:
citi_bike_ts_data[citi_bike_ts_data['start_station_id'] == '5329.03']

Unnamed: 0,hour,start_station_id,ride_count
0,2025-04-12 10:00:00+00:00,5329.03,12
1,2025-04-12 11:00:00+00:00,5329.03,22
2,2025-04-12 12:00:00+00:00,5329.03,18
3,2025-04-12 13:00:00+00:00,5329.03,18
4,2025-04-12 14:00:00+00:00,5329.03,13
...,...,...,...
693,2025-05-11 07:00:00+00:00,5329.03,1
694,2025-05-11 08:00:00+00:00,5329.03,2
695,2025-05-11 09:00:00+00:00,5329.03,2
696,2025-05-11 10:00:00+00:00,5329.03,3


In [14]:
import pandas as pd
citi_bike_ts_data["hour"] = pd.to_datetime(citi_bike_ts_data["hour"], errors="coerce")

In [15]:
citi_bike_ts_data["hour"] = citi_bike_ts_data["hour"].dt.tz_localize(None)

In [16]:
citi_bike_ts_data["year_month"] = citi_bike_ts_data["hour"].dt.to_period("M")

In [17]:
gte = citi_bike_ts_data["year_month"] >= pd.Period("2024-01", freq="M")
lte = citi_bike_ts_data["year_month"] <= pd.Period("2024-12", freq="M")
cond = gte & lte
filtered_data = citi_bike_ts_data[cond].reset_index(drop=True)

In [18]:
filtered_data.drop(columns=["year_month"], inplace=True) 

In [19]:
ts_data = filtered_data

In [20]:
ts_data.head()

Unnamed: 0,hour,start_station_id,ride_count
0,2024-01-01 00:00:00,5905.14,1
1,2024-01-01 01:00:00,5905.14,2
2,2024-01-01 02:00:00,5905.14,1
3,2024-01-01 03:00:00,5905.14,2
4,2024-01-01 04:00:00,5905.14,0


In [21]:
from src.data_utils import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(ts_data, window_size=24*28, step_size=23)

In [22]:
features.sort_values(["start_station_id", "hour"])

Unnamed: 0,ride_count_t-672,ride_count_t-671,ride_count_t-670,ride_count_t-669,ride_count_t-668,ride_count_t-667,ride_count_t-666,ride_count_t-665,ride_count_t-664,ride_count_t-663,...,ride_count_t-8,ride_count_t-7,ride_count_t-6,ride_count_t-5,ride_count_t-4,ride_count_t-3,ride_count_t-2,ride_count_t-1,hour,start_station_id
0,1,2,1,2,0,0,1,0,1,2,...,5,4,7,5,4,4,5,1,2024-01-29 00:00:00,5905.14
1,0,1,1,0,0,0,0,0,7,10,...,15,14,16,23,7,13,9,12,2024-01-29 23:00:00,5905.14
2,1,4,0,0,0,0,0,5,7,3,...,12,17,20,31,41,25,16,10,2024-01-30 22:00:00,5905.14
3,10,6,3,1,0,0,1,0,0,5,...,17,13,19,20,21,22,16,11,2024-01-31 21:00:00,5905.14
4,14,6,7,1,2,0,2,0,1,2,...,0,0,0,0,0,0,13,19,2024-02-01 20:00:00,5905.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1054,28,12,19,15,6,13,16,8,10,11,...,0,3,8,5,10,5,6,5,2024-12-27 12:00:00,6450.05
1055,8,18,21,18,17,19,14,15,17,7,...,2,0,0,1,0,1,0,1,2024-12-28 11:00:00,6450.05
1056,7,15,12,9,14,11,26,14,16,10,...,0,1,0,0,1,3,3,3,2024-12-29 10:00:00,6450.05
1057,16,10,12,8,8,16,18,16,21,35,...,3,0,0,0,6,11,15,20,2024-12-30 09:00:00,6450.05


In [23]:
features_copy = features.copy()

In [24]:
features_targets = features.copy()
features_targets["target"] = targets

features_targets.shape

(1059, 675)

In [25]:
from datetime import datetime, timedelta  
import pandas as pd  
from src.data_utils import split_time_series_data  

# Define the cutoff date as 28 days before today  
cutoff_date = pd.Timestamp(datetime.now() - timedelta(days=28))  
# # Convert the cutoff date to a string in a specific format (e.g., 'YYYY-MM-DD')  
# cutoff_date_str = cutoff_date.strftime('%Y-%m-%d')
# # Print the string representation of the cutoff date  
# print(f"Cutoff date as string: {cutoff_date_str}")
cutoff_date

Timestamp('2025-04-13 08:52:06.856873')

In [26]:
X_train, y_train, X_test, y_test = split_time_series_data(
    features_targets,
    cutoff_date=cutoff_date,
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1059, 674)
(1059,)
(0, 674)
(0,)


In [27]:
from src.pipeline_utils import get_pipeline
pipeline = get_pipeline()

In [28]:
pipeline.fit(features, targets)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42241
[LightGBM] [Info] Number of data points in the train set: 1059, number of used features: 675
[LightGBM] [Info] Start training from score 17.186025


In [29]:
from sklearn.metrics import mean_absolute_error
predictions = pipeline.predict(features)

In [30]:
test_mae = mean_absolute_error(targets, predictions)
print(f"{test_mae:.4f}")

0.4309


In [31]:
import joblib  


# Save the pipeline  
joblib.dump(pipeline, config.MODELS_DIR / "lgb_model.pkl")

['/Users/yashmathur/Documents/MS_DS/Python_Spring_25/CDA_500/Test_Final/models/lgb_model.pkl']

In [32]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(features)
output_schema = Schema(targets)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [33]:
model_registry = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name="citi_bike_demand_predictor_next_hour",
    metrics={"test_mae": test_mae},
    description="LightGBM regressor",
    input_example=features.sample(),
    model_schema=model_schema
    #include_files=["src/", "requirements.txt"]  # Include directories and files
)
# https://community.hopsworks.ai/t/attributeerror-windowspath-object-has-no-attribute-startswith/1003
# model.save(config.MODELS_DIR / 'lgb_model.pkl')
model.save('//Users//yashmathur//Documents//MS_DS//Python_Spring_25//CDA_500//Test_Final//models//lgb_model.pkl')

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading //Users//yashmathur//Documents//MS_DS//Python_Spring_25//CDA_500//Test_Final//models//lgb_model.pkl:…

Uploading /Users/yashmathur/Documents/MS_DS/Python_Spring_25/CDA_500/Test_Final/notebooks/input_example.json: …

Uploading /Users/yashmathur/Documents/MS_DS/Python_Spring_25/CDA_500/Test_Final/notebooks/model_schema.json: 0…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1231009/models/citi_bike_demand_predictor_next_hour/4


Model(name: 'citi_bike_demand_predictor_next_hour', version: 4)

In [34]:
from src.inference import load_model_from_registry

model = load_model_from_registry()

2025-05-11 08:52:22,186 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-11 08:52:22,193 INFO: Initializing external client
2025-05-11 08:52:22,194 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-11 08:52:22,688 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1231009


Downloading: 0.000%|          | 0/324120 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... DONE

In [35]:
from src.inference import get_model_predictions

preds = get_model_predictions(model, features_copy)
preds

Unnamed: 0,start_station_id,predicted_demand
0,5905.14,3.0
1,5905.14,3.0
2,5905.14,4.0
3,5905.14,9.0
4,5905.14,17.0
...,...,...
1054,6450.05,11.0
1055,6450.05,4.0
1056,6450.05,4.0
1057,6450.05,18.0


In [36]:
test_mae = mean_absolute_error(targets, preds["predicted_demand"])
print(f"{test_mae:.4f}")

0.3286
