In [33]:
import os
import re
import sys
import math
import json
import time
import warnings
import boto3
import sagemaker
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, timezone, date
from IPython.display import Image
from IPython.display import display
from IPython.display import FileLink, FileLinks
from platformdirs import site_config_dir, user_config_dir
from time import gmtime, strftime

from sagemaker import Session
from sagemaker import get_execution_role
from sagemaker.experiments.run import Run, load_run
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.spark.processing import PySparkProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
from sagemaker.remote_function import remote, RemoteExecutor
from sagemaker.tuner import (
    CategoricalParameter, ContinuousParameter,
    HyperparameterTuner, IntegerParameter,
)
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_store import FeatureStore
from sagemaker.feature_store.inputs import FeatureParameter, TableFormatEnum, DataCatalogConfig
from sagemaker.feature_store.feature_definition import StringFeatureDefinition
# from sagemaker.feature_store.feature_processor import CSVDataSource, feature_processor, to_pipeline

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [34]:
boto_session = boto3.Session()
client_sagemaker = boto_session.client("sagemaker")
client_s3 = boto_session.client("s3")
sagemaker_session = sagemaker.Session()
sagemaker_role = sagemaker.get_execution_role()
bucket_name = sagemaker_session.default_bucket()

In [64]:
class FeatureGroupHandler:
    
    def __init__(self, sm_session, name: str, sm_role):
        self.sagemaker_session = sm_session
        self.client_sm = self.create_client_sm_fs(sm_session)
        self.name = name
        self.sm_role = sm_role
        self.feature_group = None
    
    def create_client_sm_fs(self, sm_session):
        return sm_session.boto_session.client(
            service_name="sagemaker-featurestore-runtime",
            region_name=sm_session.boto_session.region_name
        )

    @property
    def fg(self):
        return self.feature_group

    @property
    def feature_group_exist(self) -> bool:
        feature_groups = client_sagemaker.search(Resource="FeatureGroup")
        fg_names = [ft['FeatureGroup']['FeatureGroupName'] for ft in feature_groups['Results']]
        return self.name in fg_names

    def get_or_create_by_df(self, df: pd.DataFrame, bucket: str, prefix: str, ft_id_name: str, ft_time_name: str = "event_time"):
        self.feature_group = FeatureGroup(name=self.name, sagemaker_session=self.sagemaker_session)
        if not self.feature_group_exist:
            self.feature_group.load_feature_definitions(data_frame=df)
            self.create(bucket, prefix, ft_id_name, ft_time_name)
            self.wait_for_creation_complete()

    def create(self, bucket: str, prefix: str, ft_id_name: str, ft_time_name: str):
        self.feature_group.create(
            s3_uri=f's3://{bucket}/{prefix}/{self.name}', 
            record_identifier_name=ft_id_name, 
            event_time_feature_name=ft_time_name, 
            role_arn=self.sm_role, 
            enable_online_store=False,
            disable_glue_table_creation=False,
            # data_catalog_config=""
        )

    def wait_for_creation_complete(self):
        status = self.feature_group.describe().get('FeatureGroupStatus')
        while status == 'Creating':
            print(f'Waiting for feature group: {self.feature_group.name} to be created ...')
            time.sleep(5)
            status = self.feature_group.describe().get('FeatureGroupStatus')
        if status != 'Created':
            raise SystemExit(f'Failed to create feature group {self.feature_group.name}: {status}')
        print(f'FeatureGroup {self.feature_group.name} was successfully created.')

    def update_feature(self, ft_name: str, ft_desc: str):
        self.feature_group.update_feature_metadata(
            feature_name=ft_name,
            description=ft_desc,
            # feature_additions=[StringFeatureDefinition(ft_name)]
            # parameter_additions=[FeatureParameter(key="idType", value="primarykey")]
        )

    def ingest_data(self, df: pd.DataFrame, max_workers: int = 5, wait: bool = True):
        self.feature_group.ingest(
            data_frame=df,
            max_workers=max_workers,
            wait=wait
        )

    def get_description(self):
        return self.feature_group.describe()

    def get_feature_description(self, ft_name: str):
        return self.feature_group.describe_feature_metadata(ft_name)

    def get_record(self, ft_name):
        return self.sagemaker_session.boto_session.client(
            service_name="sagemaker-featurestore-runtime",
            region_name=self.sagemaker_session.boto_session.region_name
        ).get_record(
            FeatureGroupName=self.name,
            RecordIdentifierValueAsString=str(ft_name)
        )

    def delete(self):
        self.feature_group.delete()


In [54]:
def generate_event_timestamp():
    naive_dt = datetime.now()
    aware_dt = naive_dt.astimezone()
    utc_dt = aware_dt.astimezone(timezone.utc)
    event_time = utc_dt.isoformat(timespec='milliseconds')
    event_time = event_time.replace('+00:00', 'Z')
    return event_time


def convert_col_name(col):
    return col.replace('.', '_').replace('-', '_').rstrip('_')

## Creating Dataframe

In [37]:
records_fake = {
    'PedidoID': [101, 102, 103, 104, 105, 106],
    'ClienteID': [1, 1, 3, 2, 5, 4],
    'Produto': ['Produto A', 'Produto B', 'Produto C', 'Produto D', 'Produto E', 'Produto F'],
    'Valor': [150.00, 200.00, 250.00, 150.00, 300.00, 350.00]
}

In [38]:
df_fake = pd.DataFrame(records_fake)

In [39]:
df_fake['event_time'] = generate_event_timestamp()
df_fake['record_id'] = [f'R{i}' for i in range(len(df_fake))]

df_fake = df_fake.rename(columns=convert_col_name)
df_fake = df_fake.convert_dtypes(infer_objects=True, convert_boolean=False)
df_fake['record_id'] = df_fake['record_id'].astype('string')
df_fake['event_time'] = df_fake['event_time'].astype('string')

In [40]:
df_fake

Unnamed: 0,PedidoID,ClienteID,Produto,Valor,event_time,record_id
0,101,1,Produto A,150,2024-06-23T23:34:23.815Z,R0
1,102,1,Produto B,200,2024-06-23T23:34:23.815Z,R1
2,103,3,Produto C,250,2024-06-23T23:34:23.815Z,R2
3,104,2,Produto D,150,2024-06-23T23:34:23.815Z,R3
4,105,5,Produto E,300,2024-06-23T23:34:23.815Z,R4
5,106,4,Produto F,350,2024-06-23T23:34:23.815Z,R5


In [47]:
feature_group_name = 'teste-feature-group'
prefix_fg = f"workshop/feature-store/{feature_group_name}"
ft_id_name = "record_id"
ft_time_name = "event_time"

In [65]:
feature_group = FeatureGroupHandler(
    sm_session = sagemaker_session,
    name = feature_group_name,
    sm_role = sagemaker_role
)

In [66]:
feature_group.feature_group_exist

True

In [67]:
feature_group.get_or_create_by_df(df_fake, bucket_name, prefix_fg, ft_id_name, ft_time_name)

In [58]:
feature_group.ingest_data(df_fake)

INFO:sagemaker.feature_store.feature_group:Started ingesting index 2 to 4
INFO:sagemaker.feature_store.feature_group:Started ingesting index 6 to 6
INFO:sagemaker.feature_store.feature_group:Started ingesting index 0 to 2
INFO:sagemaker.feature_store.feature_group:Started ingesting index 6 to 6
INFO:sagemaker.feature_store.feature_group:Successfully ingested row 6 to 6
INFO:sagemaker.feature_store.feature_group:Started ingesting index 4 to 6
INFO:sagemaker.feature_store.feature_group:Successfully ingested row 6 to 6
INFO:sagemaker.feature_store.feature_group:Successfully ingested row 2 to 4
INFO:sagemaker.feature_store.feature_group:Successfully ingested row 0 to 2
INFO:sagemaker.feature_store.feature_group:Successfully ingested row 4 to 6


In [68]:
def get_historical_record_count(fg):
    fs_query = feature_group.fg.athena_query()
    query_string = f'SELECT COUNT(*) FROM "' + fs_query.table_name + f'"'
    output_location =  f's3://{bucket_name}/{prefix_fg}/offline-store/query_results/'
    fs_query.run(query_string=query_string, output_location=output_location)
    fs_query.wait()
    fs_df = fs_query.as_dataframe()
    return fs_df.iat[0, 0]

In [70]:
# Before accessing the feature data you need to check if the offline feature store was populated
record_count = len(df_fake)
offline_store_contents = None

while offline_store_contents is None:    
    fs_record_count = get_historical_record_count(feature_group)
    if fs_record_count >= record_count:
        print(f'[{fs_record_count} feature records are available in offline store for {feature_group.name} feature group]')
        offline_store_contents = fs_record_count
    else:
        print('[Waiting for data arrives in offline store ...]')
        time.sleep(20)

INFO:sagemaker:Query 2e2ce75a-48be-4595-8918-0dae31571b53 is being executed.
INFO:sagemaker:Query 2e2ce75a-48be-4595-8918-0dae31571b53 successfully executed.


[6 feature records are available in offline store for teste-feature-group feature group]


In [None]:
# data_config = DataCatalogConfig(
#     table_name = "",
#     catalog = "",
#     database = ""
# )

## Read csv from S3

In [7]:
input_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop/data/raw/bank-additional-full.csv'

In [12]:
df = pd.read_csv("data/raw/bank-additional-full.csv", sep=";")

In [13]:
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [18]:
def process_data(df_data: pd.DataFrame, target_col: str):
    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_data["not_working"] = np.where(
        np.in1d(df_data["job"], ["student", "retired", "unemployed"]), 1, 0
    )

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    bins = [18, 30, 40, 50, 60, 70, 90]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data.age, bins, labels=labels, include_lowest=True)
    df_model_data = pd.concat([df_model_data, pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)], axis=1)
    df_model_data.drop('age', axis=1, inplace=True)
    df_model_data.drop('age_range', axis=1, inplace=True)

    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat(
        [
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )
    
    return df_model_data

In [19]:
target_col = "y"

df_model = process_data(df, target_col)

In [20]:
df_model

Unnamed: 0,y,campaign,pdays,previous,no_previous_contact,not_working,age_18-29,age_30-39,age_40-49,age_50-59,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,0,0.000000,1.0,0.000000,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
1,0,0.000000,1.0,0.000000,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
2,0,0.000000,1.0,0.000000,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
3,0,0.000000,1.0,0.000000,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
4,0,0.000000,1.0,0.000000,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,1,0.000000,1.0,0.000000,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
41184,0,0.000000,1.0,0.000000,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
41185,0,0.018182,1.0,0.000000,1,1,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
41186,1,0.000000,1.0,0.000000,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0


In [23]:
df_model['event_time'] = generate_event_timestamp()
df_model['record_id'] = [f'R{i}' for i in range(len(df_model))]
df_model = df_model.rename(columns=convert_col_name)
df_model = df_model.convert_dtypes(infer_objects=True, convert_boolean=False)
df_model['record_id'] = df_model['record_id'].astype('string')
df_model['event_time'] = df_model['event_time'].astype('string')
df_model[target_col] = df_model[target_col].astype('string')

In [29]:
feature_group_name = 'teste-feature-group-2'
prefix_fg = f"workshop/feature-store/{feature_group_name}"
ft_id_name = "record_id"
ft_time_name = "event_time"

In [30]:
feature_group = FeatureGroupHandler(
    sm_session = sagemaker_session,
    name = feature_group_name,
    sm_role = sagemaker_role
)

In [31]:
feature_group.create_by_df(
    df=df_model,
    bucket=bucket_name,
    prefix=prefix_fg,
    ft_id_name=ft_id_name,
    ft_time_name=ft_time_name
)

Waiting for feature group: teste-feature-group-2 to be created ...
Waiting for feature group: teste-feature-group-2 to be created ...
Waiting for feature group: teste-feature-group-2 to be created ...
Waiting for feature group: teste-feature-group-2 to be created ...
FeatureGroup teste-feature-group-2 was successfully created.


In [None]:
# for col in df:
#     if pd.api.types.is_object_dtype(df[col].dtype):
#         df[col] = df[col].astype(pd.StringDtype())

## Outros

In [None]:
def get_historical_record_count(fg):
    fs_query = feature_group.athena_query()
    query_string = f'SELECT COUNT(*) FROM "' + fs_query.table_name + f'"'
    output_location =  f's3://{bucket_name}/{feature_store_bucket_prefix}/offline-store/query_results/'

    fs_query.run(query_string=query_string, output_location=output_location)
    fs_query.wait()
    fs_df = fs_query.as_dataframe()
    
    return fs_df.iat[0, 0]

In [None]:
# Before accessing the feature data you need to check if the offline feature store was populated
offline_store_contents = None

while offline_store_contents is None:    
    fs_record_count = get_historical_record_count(dataset_feature_group)
    if fs_record_count >= record_count:
        print(f'[{fs_record_count} feature records are available in offline store for {dataset_feature_group.name} feature group]')
        offline_store_contents = fs_record_count
    else:
        print('[Waiting for data arrives in offline store ...]')
        time.sleep(60)

In [None]:
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

s3_client = boto3.client('s3', region_name=region)
sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(service_name="sagemaker-featurestore-runtime",region_name=region)

In [None]:
# Create FeatureStore session object
feature_store_session = sagemaker.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

feature_store = FeatureStore(sagemaker_session=feature_store_session)

In [None]:
included_feature_names = [f.feature_name for f in dataset_feature_group.feature_definitions]

In [None]:
# Create dataset builder to retrieve the most recent version of each record
builder = feature_store.create_dataset(
    base=dataset_feature_group,
    # included_feature_names=included_feature_names,
    output_path=output_location,
).with_number_of_recent_records_by_record_identifier(1)

In [None]:
df_dataset, query = builder.to_dataframe()

In [None]:
df_dataset