In [None]:
!pip install duckdb --pre --upgrade && pip install --pre pandas==2.*

In [None]:
import importlib
import logging
import os
import subprocess
import sys
import time
from datetime import datetime, timedelta

import boto3
import duckdb
import numpy as np
import pandas as pd
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.inputs import TableFormatEnum

duckdb.__version__

In [None]:
# Make sure the SageMaker SDK is at least 2.125.0 before the notebook uses it.
sm_version = sagemaker.__version__
# Pad/truncate to exactly three components so versions such as "2.125" or
# "2.125.0.dev0" do not break the unpacking.
major, minor, patch = (sm_version.split('.') + ['0', '0'])[:3]
# Compare as a tuple: the previous `major < 2 or minor < 125` test wrongly
# triggered a downgrade for any release whose minor is below 125 (e.g. 3.0.x).
if (int(major), int(minor)) < (2, 125):
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.125.0'])
    # NOTE(review): reload() only re-executes the top-level package module;
    # restarting the kernel after the install is presumably the safer option.
    importlib.reload(sagemaker)

In [None]:
# Notebook-wide logger, named after the running module rather than the
# literal string '__name__'.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Re-running this cell must not stack another handler, otherwise every
# message would be emitted once per execution of the cell.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())
logger.info(f'Using SageMaker version: {sagemaker.__version__}')
logger.info(f'Using Pandas version: {pd.__version__}')

In [None]:
# SageMaker session, execution role and default bucket used by every
# feature-store call further down in the notebook.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()
logger.info(f'Default S3 bucket = {default_bucket}')
# Key prefix under the default bucket for offline-store and training output.
prefix = 'sagemaker-feature-store'
region = sagemaker_session.boto_region_name


In [None]:
# DuckDB settings: memory cap and on-disk locations for the database file and spill directory.
MAX_MEMORY = "45GB" # increase to available python memory -25%
TMP_DIR = "data-v8"
DUCKDB_FILE = f"{TMP_DIR}/taxi.duckdb"
DATA_FOLDER = f"{TMP_DIR}/taxidata" 

# S3 Uploads
# NOTE(review): fill these in locally only; never commit real keys with the notebook.
# Prefer loading them from the environment or a secrets manager.
AWS_ACCESS_KEY=''
AWS_SECRET_ACCESS_KEY=''
AWS_REGION='eu-west-1'
# Bucket holding the cleaned taxi parquet files read by get_raw_data().
BUCKET = "ayushman-hops"
# NOTE(review): with empty key strings boto3 presumably falls back to its default
# credential chain - confirm before relying on it.
session = boto3.Session( aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
s3 = session.resource('s3')

# HDFS Uploads
# NOTE(review): these three values are not referenced by any cell visible in this notebook.
HOPS_HOST=''
HOPS_API_KEY=''
HDFS_PATH = "/Projects/testproj/Resources/"


In [None]:
# Create the scratch directories in pure Python (portable, no shell needed).
# exist_ok=True mirrors `mkdir -p`: missing parents are created and an
# already-existing directory is not an error.
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(DATA_FOLDER, exist_ok=True)


In [None]:
# Open (or create) the on-disk DuckDB database with a memory cap and spill directory.
duckdb_config = {'memory_limit': MAX_MEMORY, 'temp_directory': TMP_DIR}
con = duckdb.connect(DUCKDB_FILE, config=duckdb_config)

# Install, then load, the extensions needed to read parquet files over S3.
for statement in ("INSTALL httpfs;", "INSTALL parquet;", "LOAD httpfs;", "LOAD parquet;"):
    con.execute(statement)

# Hand the S3 region and credentials from the config cell to DuckDB.
s3_settings = f"""
    SET s3_region='{AWS_REGION}';
    SET s3_access_key_id='{AWS_ACCESS_KEY}';
    SET s3_secret_access_key='{AWS_SECRET_ACCESS_KEY}';
    """
con.execute(s3_settings)

In [None]:
def wait_for_feature_group_creation_complete(feature_group, poll_interval=10, timeout=None):
    """Block until `feature_group` leaves the 'Creating' status.

    Args:
        feature_group: Object exposing `describe()` (returning a mapping with a
            'FeatureGroupStatus' entry) and a `name` attribute.
        poll_interval: Seconds to sleep between status checks (default 10,
            the previously hard-coded value).
        timeout: Optional maximum number of seconds to keep polling. `None`
            (default) polls indefinitely, as before.

    Raises:
        TimeoutError: If `timeout` elapses while the status is still 'Creating'.
        SystemExit: If the final status is anything other than 'Created'
            (kept for backward compatibility with existing callers).
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    status = feature_group.describe().get('FeatureGroupStatus')
    logger.info(f'Initial status: {status}')
    while status == 'Creating':
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f'Timed out waiting for feature group {feature_group.name} to be created'
            )
        logger.info(f'Waiting for feature group: {feature_group.name} to be created ...')
        time.sleep(poll_interval)
        status = feature_group.describe().get('FeatureGroupStatus')
    if status != 'Created':
        raise SystemExit(f'Failed to create feature group {feature_group.name}: {status}')
    logger.info(f'FeatureGroup {feature_group.name} was successfully created.')

In [None]:
'''
From Documentation:
Amazon SageMaker Feature Store supports the AWS Glue and Apache Iceberg table formats for the offline store. 
You can choose the table format when you’re creating a new feature group.

Using Apache Iceberg for storing features accelerates model development by enabling faster query performance when extracting ML training datasets,
taking advantage of Iceberg table compaction. Depending on the design of your feature groups and their scale, you can experience training query 
performance improvements of 10x to 100x by using this new capability.
'''

# Offline-store table format: anything other than 'ICEBERG' selects Glue.
table_format_param = 'ICEBERG' # or 'GLUE'
table_format = (
    TableFormatEnum.ICEBERG if table_format_param == 'ICEBERG' else TableFormatEnum.GLUE
)

In [None]:
def get_raw_data(sf):
    """Load `sf` million cleaned taxi trips from S3 into a DataFrame.

    The rows come from the parquet files under `taxidata_cleaned/` in BUCKET,
    read through the notebook's DuckDB connection. A zero-based `row_id`
    column is placed first in the returned frame.
    """
    source_glob = f's3://{BUCKET}/taxidata_cleaned/*.parquet'
    row_limit = sf * 1000000
    query = f"SELECT * FROM read_parquet('{source_glob}') LIMIT {row_limit};"
    trips = con.execute(query).df()
    # A pre-existing row_id column is replaced, not duplicated.
    if 'row_id' in trips.columns:
        del trips['row_id']
    trips.insert(0, 'row_id', pd.RangeIndex(len(trips)))
    return trips

In [None]:
# Scale factor: get_raw_data() loads sf * 1,000,000 rows.
sf = 1

raw_data = get_raw_data(sf)
# Last expression of the cell, so the notebook renders the frame.
raw_data

In [None]:
def filter_df_by_ts(df, ts_column, start_date, end_date):
    """Keep rows whose `ts_column` value lies in [start_date, end_date).

    A falsy `ts_column` disables filtering altogether; a falsy bound leaves
    that side of the interval open. When nothing is filtered, the input frame
    itself is returned.
    """
    if not ts_column:
        return df
    if start_date:
        df = df[df[ts_column] >= start_date]
    if end_date:
        df = df[df[ts_column] < end_date]
    return df

def pickup_features_fn(df, ts_column, start_date, end_date):
    """Aggregate fares per pickup location and 15-minute pickup window.

    Args:
        df: Trip-level frame with `tpep_pickup_datetime`, `pu_location_id`,
            `pu_borough` and `fare_amount` columns. It is never modified.
        ts_column: Column used for the date filter (falsy disables filtering).
        start_date: Inclusive lower bound for `ts_column`, or None.
        end_date: Exclusive upper bound for `ts_column`, or None.

    Returns:
        DataFrame with columns `pu_row_id` (0..n-1), `location_id`, `ts`
        (window start formatted as 'YYYY-MM-DD HH:MM:SS'),
        `mean_fare_window_1h_pickup_zip` and `count_trips_window_1h_pickup_zip`.

    NOTE(review): the feature names say "1h" while the window is 15 minutes;
    the names are kept because downstream cells reference them.
    """
    trips = filter_df_by_ts(df, ts_column, start_date, end_date)
    window = (
        pd.to_datetime(trips['tpep_pickup_datetime'])
        .dt.floor('15min')
        .dt.strftime('%Y-%m-%d %H:%M:%S')
    )
    # Build a fresh, narrow frame instead of writing columns onto `trips`:
    # that avoids SettingWithCopyWarning and never touches the caller's data.
    keyed = trips[['pu_location_id', 'pu_borough', 'fare_amount']].assign(window=window)
    pickup_features = (
        keyed.groupby(['pu_location_id', 'pu_borough', 'window'])
        .agg(
            mean_fare_window_1h_pickup_zip=('fare_amount', 'mean'),
            count_trips_window_1h_pickup_zip=('fare_amount', 'count')
        )
        .reset_index()
        .rename(columns={'pu_location_id': 'location_id', 'pu_borough': 'borough', 'window': 'ts'})
        # Borough only takes part in the grouping; it is not a stored feature.
        .drop(columns='borough')
    )
    pickup_features.insert(0, 'pu_row_id', pd.RangeIndex(len(pickup_features)))

    return pickup_features

def dropoff_features_fn(df, ts_column, start_date, end_date):
    """Count trips per dropoff location and 30-minute dropoff window.

    Args:
        df: Trip-level frame with `tpep_dropoff_datetime`, `do_location_id`
            and `do_borough` columns. It is never modified.
        ts_column: Column used for the date filter (falsy disables filtering).
        start_date: Inclusive lower bound for `ts_column`, or None.
        end_date: Exclusive upper bound for `ts_column`, or None.

    Returns:
        DataFrame with columns `do_row_id` (0..n-1), `location_id`, `ts`
        (window start as a datetime), `count_trips_window_30m_dropoff_zip`
        and the boolean `dropoff_is_weekend` (Saturday or Sunday).
    """
    trips = filter_df_by_ts(df, ts_column, start_date, end_date)
    # Keep the window as a datetime: the result needs a datetime `ts` anyway,
    # so formatting to text and parsing it back is unnecessary work.
    window = pd.to_datetime(trips['tpep_dropoff_datetime']).dt.floor('30min')
    # Build a fresh, narrow frame instead of writing columns onto `trips`:
    # that avoids SettingWithCopyWarning and never touches the caller's data.
    keyed = trips[['do_location_id', 'do_borough']].assign(window=window)
    dropoff_features = (
        keyed.groupby(['do_location_id', 'do_borough', 'window'])
        .agg(count_trips_window_30m_dropoff_zip=('do_borough', 'count'))
        .reset_index()
        .rename(columns={'do_location_id': 'location_id', 'do_borough': 'borough', 'window': 'ts'})
        # Borough only takes part in the grouping; it is not a stored feature.
        .drop(columns='borough')
        # dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
        .assign(dropoff_is_weekend=lambda feats: feats['ts'].dt.dayofweek.isin([5, 6]))
    )
    dropoff_features.insert(0, 'do_row_id', pd.RangeIndex(len(dropoff_features)))

    return dropoff_features

In [None]:
# Pickup features for trips whose pickup time lies in [start_date, end_date).
pickup_features = pickup_features_fn(
    df=raw_data,
    ts_column="tpep_pickup_datetime",
    start_date=datetime(2011, 1, 1),
    end_date=datetime(2023, 1, 31),
)

# Re-render the window start as an ISO-8601-style string with a literal "Z" suffix.
# NOTE(review): %f emits six fractional digits - confirm the feature store accepts that precision.
pickup_features['ts'] = pd.to_datetime(pickup_features['ts']).dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ')

In [None]:
pickup_features

In [None]:
# Dropoff features for trips whose dropoff time lies in [start_date, end_date).
dropoff_features = dropoff_features_fn(
    df=raw_data,
    ts_column="tpep_dropoff_datetime",
    start_date=datetime(2011, 1, 1),
    end_date=datetime(2023, 1, 31),
)

# Re-render the window start as an ISO-8601-style string with a literal "Z" suffix.
# NOTE(review): %f emits six fractional digits - confirm the feature store accepts that precision.
dropoff_features['ts'] = pd.to_datetime(dropoff_features['ts']).dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
# Store the weekend flag as 0/1 instead of a boolean.
dropoff_features["dropoff_is_weekend"] = dropoff_features["dropoff_is_weekend"].astype(int)

In [None]:
dropoff_features

In [None]:
# Create Pickup PIT FG schema definition

# A five-row sample is enough to derive the feature definitions; `ts` is cast
# to the pandas "string" dtype first.
pu_features_schema_df = pickup_features.head(5).astype({'ts': 'string'})
pu_features_schema_df

In [None]:
# Create Dropoff PIT FG schema definition

# A five-row sample is enough to derive the feature definitions; `ts` is cast
# to the pandas "string" dtype first.
do_features_schema_df = dropoff_features.head(5).astype({'ts': 'string'})
do_features_schema_df

In [None]:

# Create the offline-only feature group for pickup features, or reuse it if it already exists.
pickup_features_group_name = f'pit_pickup_features_{sf}'
pickup_features_group = FeatureGroup(
    name=pickup_features_group_name,
    sagemaker_session=sagemaker_session
)
# Feature definitions are derived from the five-row schema sample.
pickup_features_group.load_feature_definitions(data_frame=pu_features_schema_df)
try:
    pickup_features_group.create(
        s3_uri=f's3://{default_bucket}/{prefix}/pit',
        record_identifier_name='pu_row_id',
        event_time_feature_name='ts',
        role_arn=role,
        enable_online_store=False,
        table_format=table_format
    )
except Exception as exc:
    # Only "the group already exists" is benign. A bare `except:` used to
    # report every failure (permissions, bad schema, even a failed creation
    # raised as SystemExit) as "already exists"; anything else now propagates.
    error_code = (getattr(exc, 'response', None) or {}).get('Error', {}).get('Code')
    if error_code != 'ResourceInUse':
        raise
    print(f"Feature group {pickup_features_group_name} already exists...")
else:
    wait_for_feature_group_creation_complete(pickup_features_group)

In [None]:

# Create the offline-only feature group for dropoff features, or reuse it if it already exists.
dropoff_features_group_name = f'pit_dropoff_features_{sf}'
dropoff_features_group = FeatureGroup(
    name=dropoff_features_group_name,
    sagemaker_session=sagemaker_session
)
# Feature definitions are derived from the five-row schema sample.
dropoff_features_group.load_feature_definitions(data_frame=do_features_schema_df)
try:
    dropoff_features_group.create(
        s3_uri=f's3://{default_bucket}/{prefix}/pit',
        record_identifier_name='do_row_id',
        event_time_feature_name='ts',
        role_arn=role,
        enable_online_store=False,
        table_format=table_format
    )
except Exception as exc:
    # Only "the group already exists" is benign. A bare `except:` used to
    # report every failure (permissions, bad schema, even a failed creation
    # raised as SystemExit) as "already exists"; anything else now propagates.
    error_code = (getattr(exc, 'response', None) or {}).get('Error', {}).get('Code')
    if error_code != 'ResourceInUse':
        raise
    print(f"Feature group {dropoff_features_group_name} already exists...")
else:
    wait_for_feature_group_creation_complete(dropoff_features_group)

In [None]:
# Handle to the pickup feature group, looked up by name for the current scale factor.
pickup_features_group_name = f'pit_pickup_features_{sf}'
pickup_fg = FeatureGroup(
    name=pickup_features_group_name,
    sagemaker_session=sagemaker_session
)

print(f'Ingesting data into feature group: {pickup_fg.name} ...')
# wait=True blocks until the ingestion call returns.
pickup_fg.ingest(data_frame=pickup_features, max_processes=16, wait=True)
# The message used to say "customer records"; these rows are pickup features.
print(f'{len(pickup_features)} pickup feature records ingested into feature group: {pickup_fg.name}')

In [None]:

# Handle to the dropoff feature group, looked up by name for the current scale factor.
dropoff_features_group_name = f'pit_dropoff_features_{sf}'
dropoff_fg = FeatureGroup(
    name=dropoff_features_group_name,
    sagemaker_session=sagemaker_session
)

print(f'Ingesting data into feature group: {dropoff_fg.name} ...')
# wait=True blocks until the ingestion call returns.
dropoff_fg.ingest(data_frame=dropoff_features, max_processes=16, wait=True)
# The message used to say "customer records"; these rows are dropoff features.
print(f'{len(dropoff_features)} dropoff feature records ingested into feature group: {dropoff_fg.name}')

In [None]:
## Helper function to create Spine DF

def transform_spine_df(read_inst_df):
    """Reshape the raw trips frame, in place, into the spine used for the join.

    The frame ends up with `row_id`, `pu_row_id` and `do_row_id` (all copies of
    the original `row_id`) as its first three columns, `pu_location_id` renamed
    to an int64 `location_id`, the dropoff/borough/zone columns removed, and the
    pickup time moved to a trailing `ts` column formatted as
    '%Y-%m-%dT%H:%M:%S.%fZ'. The same (mutated) object is returned.
    """
    spine = read_inst_df

    # Replicate the row identifier under the three names, as the leading columns.
    ids = spine.pop('row_id')
    for position, column in enumerate(('row_id', 'pu_row_id', 'do_row_id')):
        spine.insert(position, column, ids)

    # The pickup location becomes the shared `location_id` key.
    spine.rename(columns={'pu_location_id': 'location_id'}, inplace=True)
    spine['location_id'] = spine['location_id'].astype('int64')

    # Columns that play no part in the spine.
    unused_columns = [
        'tpep_dropoff_datetime',
        'do_location_id',
        'pu_borough',
        'do_borough',
        'pu_svc_zone',
        'do_svc_zone',
        'pu_zone',
        'do_zone',
    ]
    spine.drop(columns=unused_columns, inplace=True)

    # The pickup time becomes the event time, appended as the last column.
    pickup_ts = spine.pop('tpep_pickup_datetime')
    print("before: ", pickup_ts.dtype)
    event_time = pd.to_datetime(pickup_ts).dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
    spine.insert(len(spine.columns), 'ts', event_time)
    print("after:  ", spine['ts'].dtype)
    return spine

# Benchmark: Point-in-Time (PIT) Correct Join

## In-Memory Training Dataset

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from sagemaker.feature_store.feature_store import FeatureStore

# Feature-store handle; the benchmark loops call create_dataset() on it.
feature_store = FeatureStore(sagemaker_session=sagemaker_session)

In [None]:
# Scale factors to benchmark (other values tried: [1, 2, 5, 10]).
# NOTE(review): only the feature groups for the `sf` used in the cells above are
# created in this notebook - confirm the others exist before adding more values.
scale_factor = [1]

for sf in scale_factor:
    # Create raw data frame for join
    raw_data = get_raw_data(sf)
    read_inst_df = transform_spine_df(raw_data)

    # Row cap handed to the dataset builder. `limit` only existed as a local
    # inside get_raw_data(), so using it here raised NameError; define it with
    # the same sf * 1,000,000 rule.
    limit = sf * 1000000

    pickup_features_group_name = f'pit_pickup_features_{sf}'
    pickup_fg = FeatureGroup(
        name=pickup_features_group_name,
        sagemaker_session=sagemaker_session
    )

    dropoff_features_group_name = f'pit_dropoff_features_{sf}'
    dropoff_fg = FeatureGroup(
        name=dropoff_features_group_name,
        sagemaker_session=sagemaker_session
    )

    # Timing covers building the dataset definition and materialising it as a DataFrame.
    start = time.time()

    dataset_builder = (
        feature_store.create_dataset(
            base=read_inst_df,
            event_time_identifier_feature_name='ts',
            record_identifier_feature_name='row_id',
            output_path=f's3://{default_bucket}/{prefix}/pit/training_{sf}'
        )
        .with_feature_group(
            feature_group=pickup_fg,
            target_feature_name_in_base='location_id',
            included_feature_names=['mean_fare_window_1h_pickup_zip','count_trips_window_1h_pickup_zip']
        )
        .with_feature_group(
            feature_group=dropoff_fg,
            target_feature_name_in_base='location_id',
            included_feature_names=['count_trips_window_30m_dropoff_zip', 'dropoff_is_weekend']
        )
        .include_duplicated_records()
        .point_in_time_accurate_join()
        .with_number_of_records_from_query_results(number_of_records=limit)
        .with_number_of_recent_records_by_record_identifier(number_of_recent_records=limit)
    )

    result_df, query = dataset_builder.to_dataframe()

    print(f"time for SF {sf}: {time.time() - start}")
    print(f"Num of rows of training data:\n {result_df.count()}")

In [None]:
query

## Training Dataset to CSV File

In [None]:
# Scale factors to benchmark.
# NOTE(review): only the feature groups for the `sf` used in the cells above are
# created in this notebook - confirm the groups for the other values exist.
scale_factor = [1,2,5,10]

for sf in scale_factor:
    # Create raw data frame for join
    raw_data = get_raw_data(sf)
    read_inst_df = transform_spine_df(raw_data)

    # Row cap handed to the dataset builder. `limit` only existed as a local
    # inside get_raw_data(), so using it here raised NameError; define it with
    # the same sf * 1,000,000 rule.
    limit = sf * 1000000

    pickup_features_group_name = f'pit_pickup_features_{sf}'
    pickup_fg = FeatureGroup(
        name=pickup_features_group_name,
        sagemaker_session=sagemaker_session
    )

    dropoff_features_group_name = f'pit_dropoff_features_{sf}'
    dropoff_fg = FeatureGroup(
        name=dropoff_features_group_name,
        sagemaker_session=sagemaker_session
    )

    # Timing covers building the dataset definition and writing it out as CSV.
    start = time.time()

    dataset_builder = (
        feature_store.create_dataset(
            base=read_inst_df,
            event_time_identifier_feature_name='ts',
            record_identifier_feature_name='row_id',
            output_path=f's3://{default_bucket}/{prefix}/pit/training_{sf}'
        )
        .with_feature_group(
            feature_group=pickup_fg,
            target_feature_name_in_base='location_id',
            included_feature_names=['mean_fare_window_1h_pickup_zip','count_trips_window_1h_pickup_zip']
        )
        .with_feature_group(
            feature_group=dropoff_fg,
            target_feature_name_in_base='location_id',
            included_feature_names=['count_trips_window_30m_dropoff_zip', 'dropoff_is_weekend']
        )
        .include_duplicated_records()
        .point_in_time_accurate_join()
        .with_number_of_records_from_query_results(number_of_records=limit)
        .with_number_of_recent_records_by_record_identifier(number_of_recent_records=limit)
    )

    # NOTE(review): to_csv_file() presumably returns the location of the CSV rather
    # than a DataFrame; the `result_df` name is kept in case other cells read it - confirm.
    result_df, query = dataset_builder.to_csv_file()

    print(f"time for SF {sf}: {time.time() - start}")