In [None]:
!pip install --pre duckdb && pip install --pre pandas==2.*

In [None]:
import importlib
import logging
import os
import subprocess
import sys
import time

import boto3
import duckdb
import numpy as np
import pandas as pd
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.inputs import TableFormatEnum

# Show the DuckDB version that was actually imported (rendered as the cell output).
duckdb.__version__

In [None]:
# Make sure the SageMaker SDK is at least 2.125; install 2.125.0 into the
# kernel's interpreter otherwise.
MIN_SAGEMAKER_VERSION = (2, 125)

sm_version = sagemaker.__version__
# Only the first two components are needed; taking a slice also tolerates
# versions with more than three parts (e.g. "2.125.0.dev0").
major, minor = (int(part) for part in sm_version.split('.')[:2])
# Tuple comparison: a plain "major < 2 or minor < 125" test would wrongly
# downgrade a newer major release such as 3.0.
if (major, minor) < MIN_SAGEMAKER_VERSION:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.125.0'])
    # NOTE(review): reload() re-executes only the top-level package module;
    # restart the kernel if already-imported submodules must be refreshed too.
    importlib.reload(sagemaker)

In [None]:
# Notebook-wide logger. Use the module's real name (__name__) rather than the
# literal string '__name__'.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Attach the stream handler only once: re-running this cell would otherwise add
# another handler and every message would be emitted multiple times.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())
logger.info(f'Using SageMaker version: {sagemaker.__version__}')
logger.info(f'Using Pandas version: {pd.__version__}')

In [None]:
# S3 key prefix used for everything this notebook writes to the default bucket.
prefix = 'sagemaker-feature-store'

# SageMaker session and the execution role passed to feature group creation.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Default bucket of the session; feature group data and query results go here.
default_bucket = sagemaker_session.default_bucket()
logger.info(f'Default S3 bucket = {default_bucket}')

# Region the session's boto client is bound to.
region = sagemaker_session.boto_region_name

In [None]:
# Display the AWS region resolved from the SageMaker session.
region

In [None]:
# --- Local DuckDB working area ---
TMP_DIR = "fg-data-v8"
# DuckDB memory limit; increase up to the memory available to Python minus 25%.
MAX_MEMORY = "45GB"
DUCKDB_FILE = f"{TMP_DIR}/taxi.duckdb"
DATA_FOLDER = f"{TMP_DIR}/taxidata"

# --- S3 access ---
# NOTE(review): fill the keys in locally only; never commit real credentials.
BUCKET = "hopsworks-bench-datasets"
AWS_REGION = 'us-east-2'
AWS_ACCESS_KEY = ''
AWS_SECRET_ACCESS_KEY = ''
session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
s3 = session.resource('s3')

# --- HDFS uploads ---
HDFS_PATH = "/Projects/testproj/Resources/"
HOPS_HOST = ''
HOPS_API_KEY = ''


In [None]:
# Create the working directories with the standard library instead of shelling
# out: this is portable and safe for paths containing spaces or shell
# metacharacters. exist_ok=True keeps the cell re-runnable.
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(DATA_FOLDER, exist_ok=True)


In [None]:
# Open the on-disk DuckDB database with the configured memory limit and
# TMP_DIR as its temp directory.
duckdb_config = {'memory_limit': MAX_MEMORY, 'temp_directory': TMP_DIR}
con = duckdb.connect(DUCKDB_FILE, config=duckdb_config)

# Install both extensions first, then load them (same order as before).
extensions = ("httpfs", "parquet")
for extension in extensions:
    con.execute(f"INSTALL {extension};")
for extension in extensions:
    con.execute(f"LOAD {extension};")

# S3 region and credentials used by DuckDB for s3:// paths.
s3_settings = f"""
    SET s3_region='{AWS_REGION}';
    SET s3_access_key_id='{AWS_ACCESS_KEY}';
    SET s3_secret_access_key='{AWS_SECRET_ACCESS_KEY}';
    """
con.execute(s3_settings)

In [None]:
def read_feature_data(limit, offset):
    """Read one chunk of taxi pickup features from S3 into a pandas DataFrame.

    (Re)creates the DuckDB view ``taxidata`` over the cleaned 2011-2016 parquet
    files in ``BUCKET`` and fetches ``limit`` rows starting at ``offset``.

    Args:
        limit: Maximum number of rows to fetch.
        offset: Number of rows to skip before the first fetched row.

    Returns:
        A DataFrame whose first column is ``row_id``, followed by the selected
        pickup columns. ``row_id`` counts up from ``offset`` so that ids of
        successive chunks do not collide. ``tpep_pickup_datetime`` is returned
        as a string formatted with ``%Y-%m-%dT%H:%M:%S.%fZ``.
    """
    # Coerce to int so only numeric values are ever interpolated into the SQL text.
    lim = int(limit)
    off = int(offset)
    query = f'''
        CREATE 
        OR REPLACE VIEW taxidata 
        AS
        SELECT 
            tpep_pickup_datetime,
            pu_location_id,
            pu_borough,
            pu_svc_zone,
            pu_zone
        FROM 
            read_parquet([
                's3://{BUCKET}/taxidata_cleaned/2011.parquet',
                's3://{BUCKET}/taxidata_cleaned/2012.parquet',
                's3://{BUCKET}/taxidata_cleaned/2013.parquet',
                's3://{BUCKET}/taxidata_cleaned/2014.parquet',
                's3://{BUCKET}/taxidata_cleaned/2015.parquet',
                's3://{BUCKET}/taxidata_cleaned/2016.parquet'
            ])
    '''
    con.execute(query)
    # NOTE(review): there is no ORDER BY, so chunk boundaries rely on DuckDB
    # returning rows in a stable order between calls -- confirm.
    raw_data = con.execute(f"SELECT * FROM taxidata LIMIT {lim} OFFSET {off}").df()
    # row_id is used as the record identifier of the feature group. Number the
    # rows from the chunk's offset: restarting at 0 for every chunk would give
    # rows of different chunks the same identifier.
    raw_data.insert(0, 'row_id', np.arange(off, off + len(raw_data), dtype='int64'))
    # Event time data type required as String by Sagemaker
    raw_data['tpep_pickup_datetime'] = pd.to_datetime(raw_data['tpep_pickup_datetime']).dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
    return raw_data

In [None]:
def wait_for_feature_group_creation_complete(feature_group):
    """Block until ``feature_group`` has left the 'Creating' state.

    The status is read from ``feature_group.describe()`` and polled every
    10 seconds while it equals 'Creating'.

    Args:
        feature_group: Object exposing ``describe()`` and ``name``.

    Raises:
        SystemExit: If the final status is anything other than 'Created'.
    """
    def current_status():
        return feature_group.describe().get('FeatureGroupStatus')

    status = current_status()
    print(f'Initial status: {status}')

    while True:
        if status != 'Creating':
            break
        logger.info(f'Waiting for feature group: {feature_group.name} to be created ...')
        time.sleep(10)
        status = current_status()

    if status == 'Created':
        logger.info(f'FeatureGroup {feature_group.name} was successfully created.')
        return
    raise SystemExit(f'Failed to create feature group {feature_group.name}: {status}')

In [None]:
# From documentation:
# Amazon SageMaker Feature Store supports the AWS Glue and Apache Iceberg table
# formats for the offline store. You can choose the table format when you're
# creating a new feature group.
#
# Using Apache Iceberg for storing features accelerates model development by
# enabling faster query performance when extracting ML training datasets,
# taking advantage of Iceberg table compaction. Depending on the design of your
# feature groups and their scale, you can experience training query performance
# improvements of 10x to 100x by using this new capability.

table_format_param = 'ICEBERG'  # or 'GLUE'

# Anything other than 'ICEBERG' selects the Glue table format.
table_format = (
    TableFormatEnum.ICEBERG
    if table_format_param == 'ICEBERG'
    else TableFormatEnum.GLUE
)

In [None]:
# Create one offline-only feature group per scale factor.
scale_factor = [50]

# Read 10 rows into a DataFrame and use it to derive the feature definitions.
features_schema_df = read_feature_data(10, 0)
features_schema_df = features_schema_df.astype({'tpep_pickup_datetime': 'string'})

# Modeled SageMaker error raised when the feature group name is already in use.
feature_group_exists_error = sagemaker_session.sagemaker_client.exceptions.ResourceInUse

for sf in scale_factor:
    pickup_features_group_name = f'pickup_features_{sf}'
    pickup_features_group = FeatureGroup(
        name=pickup_features_group_name,
        sagemaker_session=sagemaker_session
    )
    pickup_features_group.load_feature_definitions(data_frame=features_schema_df)
    try:
        pickup_features_group.create(
            s3_uri=f's3://{default_bucket}/{prefix}/',
            record_identifier_name='row_id',
            event_time_feature_name='tpep_pickup_datetime',
            role_arn=role,
            enable_online_store=False,
            table_format=table_format
        )
    except feature_group_exists_error:
        # Only the "name already in use" case is benign. Every other failure
        # (permissions, bad schema, interrupted run, ...) must surface instead
        # of being reported as "already exists".
        print(f"Feature group {pickup_features_group_name} already exists...")
    else:
        # Wait only for groups created just now; a failed creation raises here
        # and is no longer swallowed.
        wait_for_feature_group_creation_complete(pickup_features_group)

In [None]:
scale_factor = [50]  # Number of millions of rows to scale
limit = 5000000  # Rows read and ingested per chunk (5M)

for sf in scale_factor:
    total_rows = sf * 1000000  # Millions
    pickup_features_group_name = f'pickup_features_{sf}'
    # The feature group handle is the same for every chunk, so build it once.
    pickup_fg = FeatureGroup(
        name=pickup_features_group_name,
        sagemaker_session=sagemaker_session
    )
    offset = 0
    while offset < total_rows:
        print(f"Total rows: {total_rows}; Offset: {offset}")
        # Never read past total_rows when it is not a multiple of limit.
        chunk_size = min(limit, total_rows - offset)
        pickup_features = read_feature_data(chunk_size, offset)
        if pickup_features.empty:
            # The source has fewer rows than requested; nothing left to ingest.
            print(f'No more source rows at offset {offset}; stopping ingestion for {pickup_fg.name}')
            break
        print(f'Ingesting data into feature group: {pickup_fg.name} ...')
        pickup_fg.ingest(data_frame=pickup_features, max_processes=16, wait=True)
        print(f'{len(pickup_features)} customer records ingested into feature group: {pickup_fg.name}')
        offset += chunk_size

# Benchmark Reads

## In-Memory Training Dataset

In [None]:
# NOTE(review): FeatureStore is imported here instead of in the top import cell
# -- presumably because it needs the SageMaker SDK version enforced by the
# version-check cell; confirm before moving it.
from sagemaker.feature_store.feature_store import FeatureStore

# Entry point used by the read benchmarks below to build training datasets.
feature_store = FeatureStore(sagemaker_session=sagemaker_session)

In [None]:
# In-memory read benchmark for scale factor 5.
# Expects feature group pickup_features_5 to already exist and be populated;
# the create/ingest cells only cover the values listed in scale_factor.
sf = 5
pickup_features_group_name = f'pickup_features_{sf}'
pickup_fg = FeatureGroup(
    name=pickup_features_group_name,
    sagemaker_session=sagemaker_session
)

# perf_counter is monotonic, so the measurement cannot be skewed by
# wall-clock adjustments.
start = time.perf_counter()

result_df, query = feature_store.create_dataset(
    base=pickup_fg,
    output_path=f's3://{default_bucket}/{prefix}/training_{sf}'
).to_dataframe()

print(f"time for SF {sf}: {time.perf_counter() - start}")
# len() is the row count; DataFrame.count() would print per-column non-null counts.
print(f"Num of rows of training data:\n {len(result_df)}")

In [None]:
# In-memory read benchmark for scale factor 10.
# Expects feature group pickup_features_10 to already exist and be populated;
# the create/ingest cells only cover the values listed in scale_factor.
sf = 10
pickup_features_group_name = f'pickup_features_{sf}'
pickup_fg = FeatureGroup(
    name=pickup_features_group_name,
    sagemaker_session=sagemaker_session
)

# perf_counter is monotonic, so the measurement cannot be skewed by
# wall-clock adjustments.
start = time.perf_counter()

result_df, query = feature_store.create_dataset(
    base=pickup_fg,
    output_path=f's3://{default_bucket}/{prefix}/training_{sf}'
).to_dataframe()

print(f"time for SF {sf}: {time.perf_counter() - start}")
# len() is the row count; DataFrame.count() would print per-column non-null counts.
print(f"Num of rows of training data:\n {len(result_df)}")

In [None]:
# In-memory read benchmark for scale factor 20.
# Expects feature group pickup_features_20 to already exist and be populated;
# the create/ingest cells only cover the values listed in scale_factor.
sf = 20
pickup_features_group_name = f'pickup_features_{sf}'
pickup_fg = FeatureGroup(
    name=pickup_features_group_name,
    sagemaker_session=sagemaker_session
)

# perf_counter is monotonic, so the measurement cannot be skewed by
# wall-clock adjustments.
start = time.perf_counter()

result_df, query = feature_store.create_dataset(
    base=pickup_fg,
    output_path=f's3://{default_bucket}/{prefix}/training_{sf}'
).to_dataframe()

print(f"time for SF {sf}: {time.perf_counter() - start}")
# len() is the row count; DataFrame.count() would print per-column non-null counts.
print(f"Num of rows of training data:\n {len(result_df)}")

In [None]:
# In-memory read benchmark for scale factor 50: time how long it takes to
# materialise the feature group as a pandas DataFrame.
sf = 50
pickup_features_group_name = f'pickup_features_{sf}'
pickup_fg = FeatureGroup(
    name=pickup_features_group_name,
    sagemaker_session=sagemaker_session
)

# perf_counter is monotonic, so the measurement cannot be skewed by
# wall-clock adjustments.
start = time.perf_counter()

result_df, query = feature_store.create_dataset(
    base=pickup_fg,
    output_path=f's3://{default_bucket}/{prefix}/training_{sf}'
).to_dataframe()

print(f"time for SF {sf}: {time.perf_counter() - start}")
# len() is the row count; DataFrame.count() would print per-column non-null counts.
print(f"Num of rows of training data:\n {len(result_df)}")

## Training Dataset to File (CSV)

In [None]:
# File-output read benchmark for scale factor 5.
# Expects feature group pickup_features_5 to already exist and be populated;
# the create/ingest cells only cover the values listed in scale_factor.
sf = 5
pickup_features_group_name = f'pickup_features_{sf}'
pickup_fg = FeatureGroup(
    name=pickup_features_group_name,
    sagemaker_session=sagemaker_session
)

# perf_counter is monotonic, so the measurement cannot be skewed by
# wall-clock adjustments.
start = time.perf_counter()

dataset_builder = feature_store.create_dataset(
    base=pickup_fg,
    output_path=f's3://{default_bucket}/{prefix}/training/test_{sf}'
)

# Duplicated records are kept; the result is written as a CSV file and its
# location is returned together with the query string.
result_path, query = dataset_builder.include_duplicated_records().to_csv_file()

# The messages name what is actually produced: a CSV file and its path.
print(f"time for SF {sf} write to CSV: {time.perf_counter() - start}")
print(f"Training data written to:\n {result_path}")

In [None]:
# File-output read benchmark for scale factor 10.
# Expects feature group pickup_features_10 to already exist and be populated;
# the create/ingest cells only cover the values listed in scale_factor.
sf = 10
pickup_features_group_name = f'pickup_features_{sf}'
pickup_fg = FeatureGroup(
    name=pickup_features_group_name,
    sagemaker_session=sagemaker_session
)

# perf_counter is monotonic, so the measurement cannot be skewed by
# wall-clock adjustments.
start = time.perf_counter()

dataset_builder = feature_store.create_dataset(
    base=pickup_fg,
    output_path=f's3://{default_bucket}/{prefix}/training/test_{sf}'
)

# Duplicated records are kept; the result is written as a CSV file and its
# location is returned together with the query string.
result_path, query = dataset_builder.include_duplicated_records().to_csv_file()

# The messages name what is actually produced: a CSV file and its path.
print(f"time for SF {sf} write to CSV: {time.perf_counter() - start}")
print(f"Training data written to:\n {result_path}")

In [None]:
# File-output read benchmark for scale factor 20.
# Expects feature group pickup_features_20 to already exist and be populated;
# the create/ingest cells only cover the values listed in scale_factor.
sf = 20
pickup_features_group_name = f'pickup_features_{sf}'
pickup_fg = FeatureGroup(
    name=pickup_features_group_name,
    sagemaker_session=sagemaker_session
)

# perf_counter is monotonic, so the measurement cannot be skewed by
# wall-clock adjustments.
start = time.perf_counter()

dataset_builder = feature_store.create_dataset(
    base=pickup_fg,
    output_path=f's3://{default_bucket}/{prefix}/training/test_{sf}'
)

# Duplicated records are kept; the result is written as a CSV file and its
# location is returned together with the query string.
result_path, query = dataset_builder.include_duplicated_records().to_csv_file()

# The messages name what is actually produced: a CSV file and its path.
print(f"time for SF {sf} write to CSV: {time.perf_counter() - start}")
print(f"Training data written to:\n {result_path}")

In [None]:
# File-output read benchmark for scale factor 50: time how long it takes to
# export the feature group as a training dataset file.
sf = 50
pickup_features_group_name = f'pickup_features_{sf}'
pickup_fg = FeatureGroup(
    name=pickup_features_group_name,
    sagemaker_session=sagemaker_session
)

# perf_counter is monotonic, so the measurement cannot be skewed by
# wall-clock adjustments.
start = time.perf_counter()

dataset_builder = feature_store.create_dataset(
    base=pickup_fg,
    output_path=f's3://{default_bucket}/{prefix}/training/test_{sf}'
)

# Duplicated records are kept; the result is written as a CSV file and its
# location is returned together with the query string.
result_path, query = dataset_builder.include_duplicated_records().to_csv_file()

# The messages name what is actually produced: a CSV file and its path.
print(f"time for SF {sf} write to CSV: {time.perf_counter() - start}")
print(f"Training data written to:\n {result_path}")