In [1]:
from botocore.client import ClientError
from collections import defaultdict
from faker import Faker
import pandas as pd
import numpy as np
import sagemaker
import datetime
import hashlib
import random
import boto3
import math
import os
import logging
import subprocess, sys
import importlib

In [6]:
# sm_version = sagemaker.__version__
# major, minor, patch = sm_version.split('.')
# if int(major) < 2 or int(minor) < 144:
#     subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.144.0'])
#     importlib.reload(sagemaker)

In [3]:
logger = logging.getLogger('__name__')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

In [7]:
faker = Faker()
faker.seed_locale('en_US', 0)

In [8]:
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
faker.seed_instance(SEED)

In [9]:
TOTAL_UNIQUE_TRANSACTIONS = 5400000 # 5.4 Million
TOTAL_UNIQUE_USERS = 10000
TOTAL_UNIQUE_LOCATION=20000

BUCKET = 'chime-fs-demo'

In [39]:
loc_id_1 = faker.location_on_land()
print(loc_id_1[2])

Edremit


In [10]:
def generate_fake_coordinate(n: int) -> list:
    loc_ids = set()
    for _ in range(n):
        loc_id = faker.latlng()
        loc_ids.add(loc_id)
    return list(loc_ids) 
def generate_fake_name(n: int) -> list:
    loc_ids = set()
    for _ in range(n):
        loc_id = faker.name()
        loc_ids.add(loc_id)
    return list(loc_ids) 

In [44]:
# locations = generate_fake_coordinate(TOTAL_UNIQUE_LOCATION)

In [11]:
names = generate_fake_name(TOTAL_UNIQUE_USERS)

In [12]:
len(names[0:9000])

9000

In [13]:
name_cut_list = names[0:9000]

In [14]:
assert len(names) == TOTAL_UNIQUE_USERS 

AssertionError: 

In [15]:
# inspect random sample of credit card numbers 
random.sample(names, 5)

['Thomas Burton', 'Angela Adams', 'Amy Chan', 'Angela Nolan', 'Terry Wright']

In [16]:
type(names[0])

str

In [17]:
from sagemaker.feature_store.feature_group import FeatureGroup
LOCAL_DIR = './data'
BUCKET = 'chime-fs-demo'
PREFIX = 'training'

sagemaker_role = sagemaker.get_execution_role()
s3_client = boto3.Session().client('s3')
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
feature_group_name = 'cc-agg-batch-chime-fg'

In [18]:
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)
sagemaker_runtime = boto_session.client(service_name='sagemaker', region_name=region)
feature_store_session = sagemaker.Session(boto_session=boto_session, 
                                          sagemaker_client=sagemaker_client, 
                                          sagemaker_featurestore_runtime_client=featurestore_runtime)


In [19]:
fg = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)  

In [20]:
query = fg.athena_query()
table = query.table_name

In [21]:
query_string = f'SELECT * FROM "{table}"'
%store query_string
query_string

Stored 'query_string' (str)


'SELECT * FROM "cc_agg_batch_chime_fg_1680474349"'

In [22]:
query_results= 'sagemaker-fs-demo'
output_location = f's3://{BUCKET}/{query_results}/query_results/'
print(f'Athena query output location: \n{output_location}')

Athena query output location: 
s3://chime-fs-demo/sagemaker-fs-demo/query_results/


In [23]:
query.run(query_string=query_string, output_location=output_location)
query.wait()
df = query.as_dataframe()
df.head(5)

Unnamed: 0,cc_num,num_trans_last_1w,avg_amt_last_1w,trans_time,name,write_time,api_invocation_time,is_deleted
0,4079582416101330,26,129.4,1680479000.0,,2023-04-02 23:46:36.597,2023-04-02 23:41:38.000,False
1,4087415155834227,20,421.96,1680479000.0,,2023-04-02 23:46:36.597,2023-04-02 23:41:38.000,False
2,4119791018014823,28,1230.24,1680479000.0,,2023-04-02 23:46:36.597,2023-04-02 23:41:38.000,False
3,4145540560149281,29,460.23,1680479000.0,,2023-04-02 23:46:36.597,2023-04-02 23:41:38.000,False
4,4449167553784891,30,990.43,1680479000.0,,2023-04-02 23:46:36.597,2023-04-02 23:41:39.000,False


In [24]:
new_df = df.drop(['trans_time','write_time','api_invocation_time','is_deleted'],axis=1).head(9000)

In [25]:
import time
current_time_sec = int(round(time.time()))
new_df['trans_time'] = pd.Series([current_time_sec] * len(new_df), dtype="float64")

In [26]:
new_df['name'] = name_cut_list

In [27]:
new_df.head(5)

Unnamed: 0,cc_num,num_trans_last_1w,avg_amt_last_1w,name,trans_time
0,4079582416101330,26,129.4,Scott Fuller,1680792000.0
1,4087415155834227,20,421.96,Danny Mason,1680792000.0
2,4119791018014823,28,1230.24,Joshua Wong,1680792000.0
3,4145540560149281,29,460.23,Gerald Miller,1680792000.0
4,4449167553784891,30,990.43,Michele Crosby,1680792000.0


In [28]:
len(new_df)

9000

In [29]:
new_df['name'] = new_df['name'].astype("str").astype("string")

In [30]:
new_df.dtypes

cc_num                 int64
num_trans_last_1w      int64
avg_amt_last_1w      float64
name                  string
trans_time           float64
dtype: object

In [59]:
# %time
# sagemaker_runtime.update_feature_group(
#     FeatureGroupName=feature_group_name,
#     FeatureAdditions=[
#         {"FeatureName": "name", "FeatureType": "String"}
#     ])

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.48 µs


{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:461312420708:feature-group/cc-agg-batch-chime-fg',
 'ResponseMetadata': {'RequestId': '47c763fa-bcc1-4340-9417-9fc60cad777c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '47c763fa-bcc1-4340-9417-9fc60cad777c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '98',
   'date': 'Thu, 06 Apr 2023 06:49:16 GMT'},
  'RetryAttempts': 0}}

In [119]:
# new_df['trans_time'] = new_df['trans_time'].astype('string')
# new_df['cc_num'] = new_df['cc_num'].astype('string')
# new_df['num_trans_last_1w'] = new_df['num_trans_last_1w'].astype('string')
# new_df['avg_amt_last_1w'] = new_df['avg_amt_last_1w'].astype('string')

In [120]:
new_df.dtypes

cc_num               string
num_trans_last_1w    string
avg_amt_last_1w      string
trans_time           string
name                 string
dtype: object

In [84]:
new_df.isnull().sum()

cc_num               0
num_trans_last_1w    0
avg_amt_last_1w      0
trans_time           0
name                 0
dtype: int64

In [105]:
%%time
# logger.info(f'Ingesting data into feature group: {fg.name} ...')
# fg.ingest(data_frame=new_df, max_workers=3, wait=True)
# logger.info(f'{len(new_df)} customer records ingested into feature group: {fg.name}')

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 9.3 µs


In [106]:
sample_feature_group_name ="cc-agg-update-test-1"
sagemaker_session = sagemaker.Session()
sample_feature_group = FeatureGroup(name=sample_feature_group_name, sagemaker_session=sagemaker_session)

In [132]:
sample_df = pd.DataFrame([['4016674905670309', 1.680478898E9, "29", "603.74","Savannah Willis"]], 
                  columns=['cc_num', 'trans_time', 'num_trans_last_1w', 'avg_amt_last_1w', 'name'])


def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == 'object':
            data_frame[label] = data_frame[label].astype("str").astype("string")

cast_object_to_string(sample_df) 

# sample_df['amount'] = sample_df['amount'].astype('string')
# sample_df['amt_ratio1'] = sample_df['amt_ratio1'].astype('string')
# sample_df['amt_ratio2'] = sample_df['amt_ratio2'].astype('string')
# sample_df['count_ratio'] = sample_df['count_ratio'].astype('string')

In [134]:
sample_feature_group.load_feature_definitions(data_frame=sample_df)

[FeatureDefinition(feature_name='cc_num', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='trans_time', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='num_trans_last_1w', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='avg_amt_last_1w', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='name', feature_type=<FeatureTypeEnum.STRING: 'String'>)]

In [133]:
sample_feature_group.delete()

In [135]:
role = sagemaker.get_execution_role()
default_bucket='chime-fs-demo'
prefix = 'chime-fs'
sample_feature_group.create(s3_uri=f's3://{default_bucket}/{prefix}', 
                               record_identifier_name='cc_num', 
                               event_time_feature_name='trans_time', 
                               role_arn=role, 
                               enable_online_store=False)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:461312420708:feature-group/cc-agg-update-test-1',
 'ResponseMetadata': {'RequestId': '0d936cf7-3851-491a-bdab-20dae9d4c5a8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '0d936cf7-3851-491a-bdab-20dae9d4c5a8',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '97',
   'date': 'Thu, 06 Apr 2023 08:09:22 GMT'},
  'RetryAttempts': 0}}

In [128]:
# def wait_for_feature_group_creation_complete(feature_group):
#     status = feature_group.describe().get('FeatureGroupStatus')
#     print(f'Initial status: {status}')
#     while status == 'Creating':
#         logger.info(f'Waiting for feature group: {feature_group.name} to be created ...')
#         time.sleep(5)
#         status = feature_group.describe().get('FeatureGroupStatus')
#     if status != 'Created':
#         raise SystemExit(f'Failed to create feature group {feature_group.name}: {status}')
#     logger.info(f'FeatureGroup {feature_group.name} was successfully created.')

In [136]:
wait_for_feature_group_creation_complete(sample_feature_group)

Waiting for feature group: cc-agg-update-test-1 to be created ...


Initial status: Creating


Waiting for feature group: cc-agg-update-test-1 to be created ...
Waiting for feature group: cc-agg-update-test-1 to be created ...
FeatureGroup cc-agg-update-test-1 was successfully created.


In [25]:
if sagemaker.__version__ < '2.144.':
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker==2.144.0'])
    importlib.reload(sagemaker)

In [31]:
# !pip install --upgrade sagemaker

In [4]:
logger.info(f'Using SageMaker version: {sagemaker.__version__}')
logger.info(f'Using Pandas version: {pd.__version__}')

Using SageMaker version: 2.144.0
Using Pandas version: 1.5.2


In [5]:
!python --version

Python 3.10.8


In [43]:
sample_feature_group =FeatureGroup('cc-agg-batch-chime-fg')

In [33]:
%%time

logger.info(f'Ingesting data into feature group: {sample_feature_group.name} ...')
sample_feature_group.ingest(data_frame=new_df, max_processes=16, wait=True)
logger.info(f'{len(new_df)} sample records ingested into feature group: {sample_feature_group.name}')

Ingesting data into feature group: cc-agg-batch-chime-fg ...


NameError: name 'sample_df' is not defined

In [34]:
logger.info(f'{len(new_df)} sample records ingested into feature group: {sample_feature_group.name}')

9000 sample records ingested into feature group: cc-agg-batch-chime-fg


In [44]:
featurestore_runtime_client = sagemaker_session.boto_session.client('sagemaker-featurestore-runtime', region_name=region)

In [50]:
cc_num= '4079582416101330'
logger.info(f'ccnum={cc_num}') 

ccnum=4079582416101330


In [57]:
feature_record = featurestore_runtime_client.get_record(FeatureGroupName='cc-agg-batch-chime-fg', 
                                                        RecordIdentifierValueAsString=cc_num)
feature_record

{'ResponseMetadata': {'RequestId': '26246d63-48ce-489e-8200-0ac5235e6bfd',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '26246d63-48ce-489e-8200-0ac5235e6bfd',
   'content-type': 'application/json',
   'content-length': '301',
   'date': 'Thu, 06 Apr 2023 22:42:47 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'cc_num', 'ValueAsString': '4079582416101330'},
  {'FeatureName': 'num_trans_last_1w', 'ValueAsString': '26'},
  {'FeatureName': 'avg_amt_last_1w', 'ValueAsString': '129.4'},
  {'FeatureName': 'trans_time', 'ValueAsString': '1680792064.0'},
  {'FeatureName': 'name', 'ValueAsString': 'Scott Fuller'}]}