In [None]:
%pip install databricks-feature-store && pip install duckdb --pre --upgrade

In [None]:
import os
import time

import duckdb
import numpy as np
import pandas as pd
from databricks.feature_store import FeatureLookup

duckdb.__version__

In [None]:
pd.__version__

In [None]:
from databricks.feature_store import FeatureStoreClient
# Shared Feature Store client: the cells below use it to create and write the
# pickup feature tables and to build the training sets that are benchmarked.
fs = FeatureStoreClient()

In [None]:
# DuckDB memory cap; raise it to roughly the memory available to Python minus 25%.
MAX_MEMORY = '40GB'

# Local scratch directory; the DuckDB database file and the data folder live inside it.
TMP_DIR = 'fg-data-v8'
DUCKDB_FILE = TMP_DIR + '/taxi.duckdb'
DATA_FOLDER = TMP_DIR + '/taxidata'

# S3 settings (credentials, region, bucket) handed to DuckDB's S3 configuration.
# Keep the two credential values blank in version control; fill them in at run time only.
AWS_ACCESS_KEY = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION = "us-east-2"
BUCKET = 'hopsworks-bench-datasets'


In [None]:
# Create the scratch directories from Python rather than shelling out: this is
# portable, raises on failure instead of printing to the cell output, and is
# guaranteed to use the same working directory that duckdb.connect() resolves
# DUCKDB_FILE against. exist_ok=True keeps the cell safe to re-run.
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(DATA_FOLDER, exist_ok=True)


In [None]:
# Open (or create) the on-disk DuckDB database, capping its memory use and
# pointing its temp directory at the scratch folder.
duckdb_settings = {'memory_limit': MAX_MEMORY, 'temp_directory': TMP_DIR}
con = duckdb.connect(DUCKDB_FILE, config=duckdb_settings)

# Install both extensions first, then load both (same order as a manual setup).
for statement in ("INSTALL httpfs;", "INSTALL parquet;", "LOAD httpfs;", "LOAD parquet;"):
    con.execute(statement)

# Configure the S3 region and credentials used for the s3:// parquet reads.
con.execute(f"""
    SET s3_region='{AWS_REGION}';
    SET s3_access_key_id='{AWS_ACCESS_KEY}';
    SET s3_secret_access_key='{AWS_SECRET_ACCESS_KEY}';
    """)

In [None]:
# Show the contents of the scratch directory (the DuckDB file and data folder live here).
!ls -l {TMP_DIR}

In [None]:
def get_raw_data(sf):
    """Load the label-side columns for the first ``sf`` million taxi trips.

    Runs a query over the 2011-2016 parquet files under
    ``s3://<BUCKET>/taxidata_cleaned/`` through the module-level DuckDB
    connection ``con``.

    Args:
        sf: scale factor; the query is limited to ``sf * 1000000`` rows.

    Returns:
        pandas.DataFrame whose first column ``row_id`` numbers the fetched
        rows 0..n-1, followed by ``tpep_pickup_datetime``, ``trip_distance``,
        ``tip_amount`` and ``fare_amount``.
    """
    sql = f'''
      SELECT 
          tpep_pickup_datetime,
          trip_distance,
          tip_amount,
          fare_amount
      FROM 
          read_parquet([
            's3://{BUCKET}/taxidata_cleaned/2011.parquet', 
            's3://{BUCKET}/taxidata_cleaned/2012.parquet', 
            's3://{BUCKET}/taxidata_cleaned/2013.parquet',
            's3://{BUCKET}/taxidata_cleaned/2014.parquet',
            's3://{BUCKET}/taxidata_cleaned/2015.parquet',
            's3://{BUCKET}/taxidata_cleaned/2016.parquet'
          ])
      LIMIT {sf*1000000}
    '''
    frame = con.execute(sql).df()
    # Number the fetched rows 0..n-1 and place that id in the leading column.
    frame.insert(0, 'row_id', np.arange(len(frame), dtype='int64'))
    return frame
  
def read_feature_data(limit, offset):
    """Read one chunk of pickup features from the cleaned taxi parquet files.

    (Re)creates the ``taxidata`` view over the 2011-2016 parquet files under
    ``s3://<BUCKET>/taxidata_cleaned/`` using the module-level DuckDB
    connection ``con``, then fetches ``limit`` rows starting at ``offset``.

    Args:
        limit: maximum number of rows to fetch for this chunk.
        offset: number of view rows to skip before the chunk starts.

    Returns:
        pandas.DataFrame whose first column ``row_id`` is the row's position
        in the whole view (``offset``, ``offset + 1``, ...), followed by
        ``tpep_pickup_datetime``, ``pu_location_id``, ``pu_borough``,
        ``pu_svc_zone`` and ``pu_zone``.
    """
    query = f'''
      CREATE 
      OR REPLACE 
      VIEW taxidata 
      AS
      SELECT 
          tpep_pickup_datetime, 
          pu_location_id, 
          pu_borough,
          pu_svc_zone,
          pu_zone 
      FROM 
          read_parquet([
            's3://{BUCKET}/taxidata_cleaned/2011.parquet', 
            's3://{BUCKET}/taxidata_cleaned/2012.parquet', 
            's3://{BUCKET}/taxidata_cleaned/2013.parquet',
            's3://{BUCKET}/taxidata_cleaned/2014.parquet',
            's3://{BUCKET}/taxidata_cleaned/2015.parquet',
            's3://{BUCKET}/taxidata_cleaned/2016.parquet'
          ])
    '''
    con.execute(query)
    # NOTE(review): there is no ORDER BY, so chunk boundaries (and row_id
    # agreement with get_raw_data) rely on DuckDB returning rows in a stable
    # scan order - confirm this holds for the configured connection.
    raw_data = con.execute(f"SELECT * FROM taxidata LIMIT {limit} OFFSET {offset}").df()
    # Number rows by their position in the whole view, not within the chunk.
    # Restarting at 0 for every chunk made ids from later chunks collide with
    # the first chunk's primary keys and never match get_raw_data's row_id.
    row_id = np.arange(offset, offset + len(raw_data), dtype="int64")
    raw_data.insert(0, 'row_id', row_id)
    return raw_data

In [None]:
%sql CREATE DATABASE IF NOT EXISTS feature_store_taxi_example;

In [None]:
# Use Arrow for pandas <-> Spark conversions (createDataFrame / toPandas).
# "spark.sql.execution.arrow.enabled" is the deprecated pre-3.0 name; the
# pyspark-scoped key matches the fallback setting on the next line.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# Fail loudly rather than silently falling back to the non-Arrow path, so the
# benchmark timings always measure the Arrow code path.
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

In [None]:
# Scale factors to ingest; each one gets its own feature table holding sf million rows.
scale_factor = [5, 10, 20, 50, 100]
limit = 10000000  # Get 10M at a time so it's faster

for sf in scale_factor:
    offset = 0
    total_rows = sf * 1000000  # Millions
    table_name = f"feature_store_taxi_example.pickup_features_{sf}"
    while offset < total_rows:
        # Never read past total_rows: without the clamp, SF=5 pulled a full
        # 10M-row chunk into a table that is meant to hold 5M rows.
        chunk_rows = min(limit, total_rows - offset)
        # Get feature data
        pickup_features = read_feature_data(chunk_rows, offset)
        print(f"Total rows: {total_rows}; Offset: {offset}")
        if pickup_features.empty:
            # The source ran out of rows before total_rows was reached; there
            # is nothing left to write and an empty frame cannot be converted.
            print(f"No more source rows at offset {offset}; stopping SF {sf} early")
            break
        # Convert to Spark DF
        sparkDF = spark.createDataFrame(pickup_features)
        if offset == 0:
            # First chunk: create the table and write the chunk in one call.
            # NOTE(review): this presumably fails if the table already exists,
            # so the cell is not re-runnable as-is - confirm.
            fs.create_table(
                name=table_name,
                primary_keys=["row_id"],
                timestamp_keys=["tpep_pickup_datetime"],
                df=sparkDF,
                description="NYC Taxi data pickup features",
            )
        else:
            # Later chunks: merge-insert into the existing table.
            fs.write_table(
                name=table_name,
                df=sparkDF,
                mode="merge",
            )
        offset += chunk_rows

# Benchmarks FG Reads

# SF = 5

In [None]:
# Benchmark cell for scale factor 5: times (1) building the training set and
# writing it to parquet, and (2) building it again and pulling it into pandas.
sf = 5
# Look up the four pickup features in this scale factor's table, keyed on
# row_id with tpep_pickup_datetime as the timestamp lookup key.
pickup_feature_lookups = [
    FeatureLookup(
        table_name=f"feature_store_taxi_example.pickup_features_{sf}",
        feature_names=[
            "pu_location_id",
            "pu_borough",
            "pu_svc_zone",
            "pu_zone",
        ],
        lookup_key=["row_id"],
        timestamp_lookup_key="tpep_pickup_datetime",
    ),
]
# Label-side data (row_id, pickup time, distance, tip, fare); loaded outside the timed sections.
raw_data = get_raw_data(sf)
spark_raw_data = spark.createDataFrame(raw_data)

# Timed section 1: create the training set and write it out as parquet.
start = time.time()
training_set = fs.create_training_set(
    spark_raw_data,
    feature_lookups=pickup_feature_lookups,
    label="fare_amount",
)
training_df = training_set.load_df()
# NOTE(review): DataFrameWriter.parquet() presumably returns None, so `pdf`
# holds no data here; it is reassigned in the pandas section below.
pdf = training_df.write.mode("overwrite").parquet(f"training-{sf}.parquet")
# Elapsed time is taken right after the parquet write finishes.
print(f"time for SF {sf} write to parquet: {time.time() - start}")
# count() is evaluated after the elapsed time above, so it is not part of the timing.
print(f"Num of rows of training data: {training_df.count()}")

# Timed section 2: rebuild the training set and convert it to pandas.
start = time.time()
training_set = fs.create_training_set(
    spark_raw_data,
    feature_lookups=pickup_feature_lookups,
    label="fare_amount",
)
training_df = training_set.load_df()
# NOTE(review): collect() and toPandas() each bring the full result to the
# driver, so this timing covers two materialisations - confirm that is intended.
training_df.collect()
pdf = training_df.toPandas()
print(f"time for SF {sf} collect pandas dataframe: {time.time() - start}")
print(f"Num of rows of training data: {training_df.count()}")

# SF = 10

In [None]:
# Benchmark cell for scale factor 10: times (1) building the training set and
# writing it to parquet, and (2) building it again and pulling it into pandas.
sf = 10
# Look up the four pickup features in this scale factor's table, keyed on
# row_id with tpep_pickup_datetime as the timestamp lookup key.
pickup_feature_lookups = [
    FeatureLookup(
        table_name=f"feature_store_taxi_example.pickup_features_{sf}",
        feature_names=[
            "pu_location_id",
            "pu_borough",
            "pu_svc_zone",
            "pu_zone",
        ],
        lookup_key=["row_id"],
        timestamp_lookup_key="tpep_pickup_datetime",
    ),
]
# Label-side data (row_id, pickup time, distance, tip, fare); loaded outside the timed sections.
raw_data = get_raw_data(sf)
spark_raw_data = spark.createDataFrame(raw_data)

# Timed section 1: create the training set and write it out as parquet.
start = time.time()
training_set = fs.create_training_set(
    spark_raw_data,
    feature_lookups=pickup_feature_lookups,
    label="fare_amount",
)
training_df = training_set.load_df()
# NOTE(review): DataFrameWriter.parquet() presumably returns None, so `pdf`
# holds no data here; it is reassigned in the pandas section below.
pdf = training_df.write.mode("overwrite").parquet(f"training-{sf}.parquet")
print(f"time for SF {sf} write to parquet: {time.time() - start}")
# count() is evaluated after the elapsed time above, so it is not part of the timing.
print(f"Num of rows of training data: {training_df.count()}")

# Timed section 2: rebuild the training set and convert it to pandas.
start = time.time()
training_set = fs.create_training_set(
    spark_raw_data,
    feature_lookups=pickup_feature_lookups,
    label="fare_amount",
)
training_df = training_set.load_df()
# NOTE(review): collect() and toPandas() each bring the full result to the
# driver, so this timing covers two materialisations - confirm that is intended.
training_df.collect()
pdf = training_df.toPandas()
print(f"time for SF {sf} collect pandas dataframe: {time.time() - start}")
print(f"Num of rows of training data: {training_df.count()}")

# SF = 20

In [None]:
# Benchmark cell for scale factor 20: times (1) building the training set and
# writing it to parquet, and (2) building it again and pulling it into pandas.
sf = 20
# Look up the four pickup features in this scale factor's table, keyed on
# row_id with tpep_pickup_datetime as the timestamp lookup key.
pickup_feature_lookups = [
    FeatureLookup(
        table_name=f"feature_store_taxi_example.pickup_features_{sf}",
        feature_names=[
            "pu_location_id",
            "pu_borough",
            "pu_svc_zone",
            "pu_zone",
        ],
        lookup_key=["row_id"],
        timestamp_lookup_key="tpep_pickup_datetime",
    ),
]
# Label-side data (row_id, pickup time, distance, tip, fare); loaded outside the timed sections.
raw_data = get_raw_data(sf)
spark_raw_data = spark.createDataFrame(raw_data)

# Timed section 1: create the training set and write it out as parquet.
# The timed region contains only the training-set build and the parquet write;
# a toPandas() call here would inflate the "write to parquet" figure and make
# it incomparable with the SF 5 / SF 10 measurements.
start = time.time()
training_set = fs.create_training_set(
    spark_raw_data,
    feature_lookups=pickup_feature_lookups,
    label="fare_amount",
)
training_df = training_set.load_df()
training_df.write.mode("overwrite").parquet(f"training-{sf}.parquet")
print(f"time for SF {sf} write to parquet: {time.time() - start}")
# count() is evaluated after the elapsed time above, so it is not part of the timing.
print(f"Num of rows of training data: {training_df.count()}")

# Timed section 2: rebuild the training set and convert it to pandas.
start = time.time()
training_set = fs.create_training_set(
    spark_raw_data,
    feature_lookups=pickup_feature_lookups,
    label="fare_amount",
)
training_df = training_set.load_df()
# NOTE(review): collect() and toPandas() each bring the full result to the
# driver, so this timing covers two materialisations; kept to stay comparable
# with the other scale-factor cells - confirm that is intended.
training_df.collect()
pdf = training_df.toPandas()
print(f"time for SF {sf} collect pandas dataframe: {time.time() - start}")
print(f"Num of rows of training data: {training_df.count()}")

# SF = 50

In [None]:
# Benchmark cell for scale factor 50: times (1) building the training set and
# writing it to parquet, and (2) building it again and pulling it into pandas.
sf = 50
# Look up the four pickup features in this scale factor's table, keyed on
# row_id with tpep_pickup_datetime as the timestamp lookup key.
pickup_feature_lookups = [
    FeatureLookup(
        table_name=f"feature_store_taxi_example.pickup_features_{sf}",
        feature_names=[
            "pu_location_id",
            "pu_borough",
            "pu_svc_zone",
            "pu_zone",
        ],
        lookup_key=["row_id"],
        timestamp_lookup_key="tpep_pickup_datetime",
    ),
]
# Label-side data (row_id, pickup time, distance, tip, fare); loaded outside the timed sections.
raw_data = get_raw_data(sf)
spark_raw_data = spark.createDataFrame(raw_data)

# Mark the input for caching so both timed sections can reuse it.
# NOTE(review): cache() is presumably lazy, so the first timed section would
# also pay the cost of populating the cache - confirm.
spark_raw_data.cache()

# Timed section 1: create the training set and write it out as parquet.
# The timed region contains only the training-set build and the parquet write;
# a toPandas() call here would inflate the "write to parquet" figure and make
# it incomparable with the SF 5 / SF 10 measurements.
start = time.time()
training_set = fs.create_training_set(
    spark_raw_data,
    feature_lookups=pickup_feature_lookups,
    label="fare_amount",
)
training_df = training_set.load_df()
training_df.write.mode("overwrite").parquet(f"training-{sf}.parquet")
print(f"time for SF {sf} write to parquet: {time.time() - start}")
# count() is evaluated after the elapsed time above, so it is not part of the timing.
print(f"Num of rows of training data: {training_df.count()}")

# Timed section 2: rebuild the training set and convert it to pandas.
start = time.time()
training_set = fs.create_training_set(
    spark_raw_data,
    feature_lookups=pickup_feature_lookups,
    label="fare_amount",
)
training_df = training_set.load_df()
# NOTE(review): collect() and toPandas() each bring the full result to the
# driver, so this timing covers two materialisations; kept to stay comparable
# with the other scale-factor cells - confirm that is intended.
training_df.collect()
pdf = training_df.toPandas()
print(f"time for SF {sf} collect pandas dataframe: {time.time() - start}")
print(f"Num of rows of training data: {training_df.count()}")