# Feature Store

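This notebook loads the `train`, `val`, and `test` splits from the Athena table `aai540_eda.split_v2`, creates a SageMaker Feature Store feature group (online and offline stores enabled), and ingests the rows into it. It then exports the `prod` split to S3 as a CSV holdout set for later inference.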

In [1]:
!pip -q install "PyAthena[SQLAlchemy]" sqlalchemy s3fs

In [2]:
import boto3, sagemaker, time
import pandas as pd
from time import gmtime, strftime
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum
from sagemaker import get_execution_role
from sqlalchemy import create_engine, text

# SageMaker session, AWS region, execution role and default bucket that the
# remaining cells rely on.
sess = sagemaker.Session()
region = boto3.Session().region_name
role = get_execution_role()
bucket = sess.default_bucket()

# Feature group name carries a UTC timestamp (second resolution) suffix.
run_stamp = strftime("%Y%m%d-%H%M%S", gmtime())
fg_name = f"aai540-ids-splitfs-v2-{run_stamp}"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## Fetch Sampled Data from Athena

In [3]:
database_name = "aai540_eda"
athena_results_path = f"s3://{bucket}/athena/staging/"

# SQLAlchemy engine that talks to Athena through the PyAthena REST dialect;
# query results are staged under athena_results_path.
athena_url = f"awsathena+rest://@athena.{region}.amazonaws.com:443/{database_name}"
athena_connect_args = {
    "s3_staging_dir": athena_results_path,
    "region_name": region,
}
engine = create_engine(athena_url, connect_args=athena_connect_args)

# Pull the train, val and test splits and add the two columns the feature
# group is keyed on:
#   record_id - row_number() cast to VARCHAR. The window has no ORDER BY, so
#               which row receives which id is not pinned down by the query.
#   EventTime - the value of time.time() when this string is built; every row
#               gets the same timestamp.
query = f"""
SELECT 
    *, 
    CAST(row_number() OVER() AS VARCHAR) AS record_id,
    {time.time()} AS EventTime
FROM {database_name}.split_v2
WHERE data_split IN ('train', 'val', 'test')
"""

df = pd.read_sql(query, engine)
print(f"Loaded {len(df)} rows for Feature Store ingestion.")

Loaded 299782 rows for Feature Store ingestion.


## Define and Create the Feature Group

In [4]:
feature_group = FeatureGroup(name=fg_name, sagemaker_session=sess)

# Derive the feature definitions from the dtypes of df's columns.
feature_group.load_feature_definitions(data_frame=df)

# Offline store location; the online store is enabled alongside it
# (online for inference, offline for training).
offline_store_uri = f"s3://{bucket}/aai540/feature-store-offline/"
create_kwargs = dict(
    s3_uri=offline_store_uri,
    record_identifier_name="record_id",
    event_time_feature_name="EventTime",
    role_arn=role,
    enable_online_store=True,
)
feature_group.create(**create_kwargs)

# wait for completion
def wait_for_fg(fg, poll_seconds=5, timeout_seconds=900):
    """Block until ``fg`` leaves the ``Creating`` state and confirm it was created.

    Args:
        fg: Object whose ``describe()`` returns a dict containing a
            ``FeatureGroupStatus`` key (for example a SageMaker ``FeatureGroup``).
        poll_seconds: Seconds to sleep between ``describe()`` calls.
        timeout_seconds: Upper bound on the total wait in seconds; ``None``
            disables the limit.

    Returns:
        The final status string, which is ``"Created"`` whenever this returns.

    Raises:
        TimeoutError: If the group is still ``Creating`` once the timeout elapses.
        RuntimeError: If the group settles in any status other than ``Created``
            (for example ``CreateFailed``), so later cells do not ingest into a
            group that does not exist.
    """
    deadline = None if timeout_seconds is None else time.monotonic() + timeout_seconds

    description = fg.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating":
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Feature Group still in 'Creating' after {timeout_seconds} seconds"
            )
        print("Waiting for Feature Group...")
        time.sleep(poll_seconds)
        description = fg.describe()
        status = description.get("FeatureGroupStatus")

    print(f"Feature Group Status: {status}")
    if status != "Created":
        # Surface the service-provided reason when the description carries one.
        reason = description.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"Feature Group creation ended with status {status!r}: {reason}"
        )
    return status

# Poll until the feature group is no longer in the "Creating" state before
# moving on to the ingestion cell.
wait_for_fg(feature_group)

Waiting for Feature Group...
Waiting for Feature Group...
Waiting for Feature Group...
Waiting for Feature Group...
Feature Group Status: Created


## Ingest the Data

In [5]:
# Ingest df into the feature group with 5 parallel workers; wait=True keeps
# the cell from returning until ingestion has finished.
feature_group.ingest(data_frame=df, max_workers=5, wait=True)

ingested_rows = len(df)
print(f"Successfully ingested {ingested_rows} rows into {fg_name}")

Successfully ingested 299782 rows into aai540-ids-splitfs-v2-20260208-213935


## Export Production Holdout Set to S3

In [6]:
# define the S3 path for the production holdout set
prod_s3_path = f"s3://{bucket}/aai540/production_holdout/production_data.csv"

# Table that holds the 'prod' split.
# NOTE(review): the ingestion query in this notebook reads split_v2, while the
# holdout is read from split_v1. Confirm the two tables share the same split
# assignment; otherwise holdout rows could overlap with ingested rows.
prod_split_table = "split_v1"

# query to pull only the 'prod' split
query_prod = f"""
SELECT * FROM {database_name}.{prod_split_table}
WHERE data_split = 'prod'
"""

print("Fetching production data from Athena...")
df_prod = pd.read_sql(query_prod, engine)

# Refuse to overwrite the holdout file with a header-only CSV when the query
# unexpectedly returns nothing.
if df_prod.empty:
    raise ValueError(
        f"No 'prod' rows returned from {database_name}.{prod_split_table}; "
        f"not writing {prod_s3_path}"
    )

# export to S3
# index=False to keep the data clean for future inference
df_prod.to_csv(prod_s3_path, index=False)

print(f"Successfully exported {len(df_prod)} production rows to:")
print(prod_s3_path)

Fetching production data from Athena...
Successfully exported 1199728 production rows to:
s3://sagemaker-us-east-1-128131109986/aai540/production_holdout/production_data.csv
