In [1]:
!pip install Faker
!pip install --upgrade sagemaker

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [5]:
from botocore.client import ClientError
from collections import defaultdict
from faker import Faker
import pandas as pd
import numpy as np
import sagemaker
import datetime
import hashlib
import random
import boto3
import math
import os
import logging
import subprocess, sys
import importlib

In [4]:
logger = logging.getLogger('__name__')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

In [6]:
logger.info(f'Using SageMaker version: {sagemaker.__version__}')
logger.info(f'Using Pandas version: {pd.__version__}')

Using SageMaker version: 2.145.0
Using Pandas version: 1.5.2


In [7]:
faker = Faker()
faker.seed_locale('en_US', 0)

In [8]:
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
faker.seed_instance(SEED)

In [9]:
TOTAL_UNIQUE_TRANSACTIONS = 5400000 # 5.4 Million
TOTAL_UNIQUE_USERS = 10000

BUCKET = 'sm-fs-demo'

In [10]:
def generate_fake_name(n: int) -> list:
    loc_ids = set()
    for _ in range(n):
        loc_id = faker.name()
        loc_ids.add(loc_id)
    return list(loc_ids) 

In [11]:
names = generate_fake_name(TOTAL_UNIQUE_USERS)

In [12]:
len(names[0:9000])

9000

In [13]:
name_cut_list = names[0:9000]

In [14]:
assert len(name_cut_list) == 9000 

In [15]:
# inspect random sample of credit card numbers 
random.sample(names, 5)

['Kevin Richard',
 'Audrey Kennedy',
 'Jill Reeves',
 'Janet Williams',
 'Peter King']

In [16]:
from sagemaker.feature_store.feature_group import FeatureGroup
LOCAL_DIR = './data'
BUCKET = 'chime-fs-demo'
PREFIX = 'training'

sagemaker_role = sagemaker.get_execution_role()
s3_client = boto3.Session().client('s3')
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
feature_group_name = 'cc-agg-batch-fg'

In [17]:
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)
sagemaker_runtime = boto_session.client(service_name='sagemaker', region_name=region)
feature_store_session = sagemaker.Session(boto_session=boto_session, 
                                          sagemaker_client=sagemaker_client, 
                                          sagemaker_featurestore_runtime_client=featurestore_runtime)


In [18]:
fg = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)  

In [19]:
query = fg.athena_query()
table = query.table_name

In [20]:
query_string = f'SELECT * FROM "{table}"'
%store query_string
query_string

Stored 'query_string' (str)


'SELECT * FROM "cc_agg_batch_fg_1681064532"'

In [21]:
query_results= 'sagemaker-fs-demo'
output_location = f's3://{BUCKET}/{query_results}/query_results/'
print(f'Athena query output location: \n{output_location}')

Athena query output location: 
s3://chime-fs-demo/sagemaker-fs-demo/query_results/


In [22]:
query.run(query_string=query_string, output_location=output_location)
query.wait()
df = query.as_dataframe()
df.head(5)

Unnamed: 0,cc_num,num_trans_last_1w,avg_amt_last_1w,trans_time,write_time,api_invocation_time,is_deleted
0,4076943030561056,31,546.29,1681066000.0,2023-04-09 18:46:39.140,2023-04-09 18:41:40.000,False
1,4105214026004286,19,532.36,1681066000.0,2023-04-09 18:46:39.140,2023-04-09 18:41:40.000,False
2,4147786805560927,36,904.5,1681066000.0,2023-04-09 18:46:39.140,2023-04-09 18:41:40.000,False
3,4340796634658649,28,1842.17,1681066000.0,2023-04-09 18:46:39.140,2023-04-09 18:41:41.000,False
4,4994058547181220,21,1133.7,1681066000.0,2023-04-09 18:46:39.140,2023-04-09 18:41:44.000,False


In [23]:
new_df = df.drop(['trans_time','write_time','api_invocation_time','is_deleted'],axis=1).head(9000)

In [24]:
import time
current_time_sec = int(round(time.time()))
new_df['trans_time'] = pd.Series([current_time_sec] * len(new_df), dtype="float64")

In [25]:
new_df['name'] = name_cut_list

In [26]:
new_df.head(5)

Unnamed: 0,cc_num,num_trans_last_1w,avg_amt_last_1w,trans_time,name
0,4076943030561056,31,546.29,1681145000.0,John Salas
1,4105214026004286,19,532.36,1681145000.0,William Thompson
2,4147786805560927,36,904.5,1681145000.0,Edward Morris
3,4340796634658649,28,1842.17,1681145000.0,Sara Hughes
4,4994058547181220,21,1133.7,1681145000.0,William Smith


In [27]:
len(new_df)

9000

In [28]:
new_df['name'] = new_df['name'].astype("str").astype("string")

In [29]:
new_df.dtypes

cc_num                 int64
num_trans_last_1w      int64
avg_amt_last_1w      float64
trans_time           float64
name                  string
dtype: object

In [30]:
from datetime import datetime, timezone, date

def generate_event_timestamp():
    # naive datetime representing local time
    naive_dt = datetime.now()
    # take timezone into account
    aware_dt = naive_dt.astimezone()
    # time in UTC
    utc_dt = aware_dt.astimezone(timezone.utc)
    # transform to ISO-8601 format
    event_time = utc_dt.isoformat(timespec='milliseconds')
    event_time = event_time.replace('+00:00', 'Z')
    return event_time

In [31]:
feature_group =FeatureGroup('cc-agg-batch-chime-fg')

In [32]:
logger.info(f'Updating feature group: {feature_group.name} at {generate_event_timestamp()}...')

sagemaker_runtime.update_feature_group(
    FeatureGroupName=feature_group_name,
    FeatureAdditions=[
        {"FeatureName": "name", "FeatureType": "String"}
    ])

Updating feature group: cc-agg-batch-chime-fg at 2023-04-10T16:46:23.911Z...


{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:461312420708:feature-group/cc-agg-batch-fg',
 'ResponseMetadata': {'RequestId': '996c7840-ee7c-4d31-a1d5-6ffe6f0cba9b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '996c7840-ee7c-4d31-a1d5-6ffe6f0cba9b',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '92',
   'date': 'Mon, 10 Apr 2023 16:46:23 GMT'},
  'RetryAttempts': 0}}

In [35]:
logger.info(f'Ingesting data into feature group: {feature_group.name} at {generate_event_timestamp()}...')
feature_group.ingest(data_frame=new_df, max_processes=16, wait=True)
logger.info(f'{len(new_df)} sample records ingested into feature group: {feature_group.name} at {generate_event_timestamp()}')

Ingesting data into feature group: cc-agg-batch-chime-fg at 2023-04-10T16:46:37.551Z...
9000 sample records ingested into feature group: cc-agg-batch-chime-fg at 2023-04-10T16:46:45.010Z


In [39]:
cc_num= '4079582416101330'
logger.info(f'ccnum={cc_num}') 

featurestore_runtime_client = sagemaker_session.boto_session.client('sagemaker-featurestore-runtime', region_name=region)

feature_record = featurestore_runtime_client.get_record(FeatureGroupName='cc-agg-batch-chime-fg', 
                                                        RecordIdentifierValueAsString=cc_num)
feature_record

{'ResponseMetadata': {'RequestId': 'fbe622b8-bd13-4ecf-b0fd-a88033622ab2',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'fbe622b8-bd13-4ecf-b0fd-a88033622ab2',
   'content-type': 'application/json',
   'content-length': '306',
   'date': 'Mon, 10 Apr 2023 16:47:22 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'cc_num', 'ValueAsString': '4079582416101330'},
  {'FeatureName': 'num_trans_last_1w', 'ValueAsString': '24'},
  {'FeatureName': 'avg_amt_last_1w', 'ValueAsString': '577.03'},
  {'FeatureName': 'trans_time', 'ValueAsString': '1681144925.0'},
  {'FeatureName': 'name', 'ValueAsString': 'Clarence Schultz'}]}

In [133]:
# feature_group.delete()