In [1]:
import os
from dotenv import load_dotenv
from taxi_demand_predictor.paths import PARENT_DIR, RAW_DATA_DIR, BRONZE_DATA_DIR, SILVER_DATA_DIR, GOLD_DATA_DIR
from taxi_demand_predictor.data import transform_save_data_into_ts_data, validate_and_save_data
from datetime import datetime
import taxi_demand_predictor.config as cfg
import pandas as pd
from pathlib import Path
import hopsworks

2024-08-16 01:40:12,680 - INFO - Folder "data" ensured at "/Users/borja/Documents/Somniumrema/projects/ml/taxi_demand_predictor/data"
2024-08-16 01:40:12,684 - INFO - Folder "raw" ensured at "/Users/borja/Documents/Somniumrema/projects/ml/taxi_demand_predictor/data/raw"
2024-08-16 01:40:12,689 - INFO - Folder "bronze" ensured at "/Users/borja/Documents/Somniumrema/projects/ml/taxi_demand_predictor/data/bronze"
2024-08-16 01:40:12,692 - INFO - Folder "silver" ensured at "/Users/borja/Documents/Somniumrema/projects/ml/taxi_demand_predictor/data/silver"
2024-08-16 01:40:12,693 - INFO - Folder "gold" ensured at "/Users/borja/Documents/Somniumrema/projects/ml/taxi_demand_predictor/data/gold"
2024-08-16 01:40:12,693 - INFO - Folder "models" ensured at "/Users/borja/Documents/Somniumrema/projects/ml/taxi_demand_predictor/models"


In [2]:
# Name of the Hopsworks project this notebook connects to
HOPSWORKS_PROJECT_NAME = 'demand_predictor_borja'

# Read the project's .env file so its entries become environment variables
load_dotenv(PARENT_DIR / '.env')

# API key used to authenticate against Hopsworks (None when the variable is unset)
HOPSWORKS_API_KEY = os.getenv('HOPSWORKS_API_KEY')

In [3]:

# Per-year time-series files from the silver layer
file_2023 = Path(SILVER_DATA_DIR) / "2023" / "ts_data_2023.parquet"
file_2024 = Path(SILVER_DATA_DIR) / "2024" / "ts_data_2024.parquet"

# Read the parquet files
df_2023 = pd.read_parquet(file_2023)
df_2024 = pd.read_parquet(file_2024)

# Stack both years and drop exact duplicate rows. `ignore_index=True` on
# drop_duplicates keeps a clean 0..n-1 index even when rows are removed, and
# avoiding `inplace=True` keeps the cell idempotent on re-run.
combined_df = pd.concat([df_2023, df_2024], ignore_index=True).drop_duplicates(
    ignore_index=True
)

# Define the output file path
output_file = Path(SILVER_DATA_DIR) / "2023-2024" / "ts_data_2023-2024.parquet"

# The "2023-2024" sub-folder is not created anywhere else in this notebook, so
# make sure it exists before writing (no-op when it is already there).
output_file.parent.mkdir(parents=True, exist_ok=True)

# Save the combined DataFrame to a new parquet file
combined_df.to_parquet(output_file)

print(f"Combined parquet file saved to {output_file}")
# NOTE(review): this prints the number of rows in the frame, not a sum of
# `ride_count` -- confirm the label is what is intended.
print(f"Number of rides: {combined_df.shape[0]}")

Combined parquet file saved to /Users/borja/Documents/Somniumrema/projects/ml/taxi_demand_predictor/data/silver/2023-2024/ts_data_2023-2024.parquet
Number of rides: 3202920


In [4]:
# Peek at the first and last five rows of the combined frame
(combined_df.head(n=5), combined_df.tail(n=5))

(  pickup_time  pickup_location_id  ride_count
 0  2023-01-01                   4        18.0
 1  2023-01-01                   7         3.0
 2  2023-01-01                  12         1.0
 3  2023-01-01                  13        14.0
 4  2023-01-01                  24        20.0,
                 pickup_time  pickup_location_id  ride_count
 3202915 2024-05-31 23:00:00                 176         0.0
 3202916 2024-05-31 23:00:00                   2         0.0
 3202917 2024-05-31 23:00:00                 187         0.0
 3202918 2024-05-31 23:00:00                 206         0.0
 3202919 2024-05-31 23:00:00                  84         0.0)

In [5]:
# Convert 'pickup_time' to a Unix timestamp in whole seconds (BIGINT) so that it can be stored.
# Subtracting the epoch and floor-dividing by one second gives int64 seconds
# regardless of the column's datetime resolution (ns/us/ms/s); casting to int
# and dividing by 10**9 is only correct for nanosecond-resolution data and
# depends on the platform's default int width.
# Assumes 'pickup_time' is timezone-naive (as displayed above); naive values
# are treated as UTC wall-clock time, same as before.
combined_df['pickup_ts'] = (
    pd.to_datetime(combined_df['pickup_time']) - pd.Timestamp("1970-01-01")
) // pd.Timedelta(seconds=1)

In [6]:
# Open a session against the configured Hopsworks project using the API key
# loaded from the environment.
project = hopsworks.login(
    project=HOPSWORKS_PROJECT_NAME,
    api_key_value=HOPSWORKS_API_KEY,
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/933012


In [7]:
# Handle to the project's feature store; used below to get or create the feature group.
feature_store = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.


In [8]:
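# Superseded: the feature group name and version are read from
# cfg.FEATURE_GROUP_METADATA in the next cell; kept here for reference only.
# FEATURE_GROUP_NAME = 'hourly_features_2023_2024'
# FEATURE_GROUP_VERSION = 1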

In [9]:
# Look up the hourly time-series feature group, creating its definition when
# it is not there yet. Name and version come from the package config module.
feature_group = feature_store.get_or_create_feature_group(
    name=cfg.FEATURE_GROUP_METADATA['name'],
    version=cfg.FEATURE_GROUP_METADATA['version'],
    description='time series hourly features for 2023-2024 cab rides',
    # One row per (hour, pickup zone); the epoch-seconds column doubles as event time.
    primary_key=['pickup_ts', 'pickup_location_id'],
    event_time='pickup_ts',
    online_enabled=True,
)

In [10]:
# Upload the combined frame to the feature group. `insert` is the documented
# companion of `get_or_create_feature_group`: it registers the group on first
# use and appends/upserts on later runs, so the cell stays re-runnable once
# the group exists (unlike `save`, which is the creation-time call).
# `wait_for_job=False` returns as soon as the materialization job is launched
# instead of blocking until it finishes.
feature_group.insert(combined_df, write_options={"wait_for_job": False})

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/933012/fs/926787/fg/1104205


Uploading Dataframe: 0.00% |          | Rows 0/3202920 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: hourly_features_2023_2024_3_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/933012/jobs/named/hourly_features_2023_2024_3_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x17410ced0>, None)