# Update FG and Streaming Ingestion

All prior notebooks set up our end-to-end solution. Now that those steps are complete, we will update the feature group `cc-agg-fg` to add a new feature called `name`. In this notebook, we will send credit card transactions to our new input Kinesis stream and show that we can detect fraud. 
<img src="./images/streaming_prediction.png" />

### Recap of what is in place

Here is a recap of what we have done so far:

1. In [notebook 0](./0_prepare_transactions_dataset.ipynb), we generated a synthetic dataset of transactions, including simulated fraud attacks.
2. In [notebook 1](./1_setup.ipynb), we created our two feature groups. In that same notebook, we also created a Kinesis data stream and a Kinesis Data Analytics SQL application that consumes the transaction stream and produces aggregate features. These features are provided in near real time to Lambda, and they look back over a 10 minute window.
3. In [notebook 2](./2_batch_ingestion-chime.ipynb), we used a SageMaker Processing Job to create aggregated features and used them to feed both the training dataset and an online feature group. We used a Glue interactive session to ingest transaction data into the offline feature store.
4. In [notebook 3](./3_train_and_deploy_model-chime.ipynb), we used the offline feature store to train and deploy an XGBoost model to detect fraud.
5. In [notebook 4](./4_streaming_predictions-chime.ipynb), we sent transactions to the feature store in near real time and predicted whether each one was fraud or non-fraud.

## Imports and overall setup

### Imports and initialization

In [None]:
from datetime import datetime
import numpy as np
import pandas as pd
import sagemaker
import boto3
import json
import time
from sagemaker import get_execution_role
import logging
from datetime import datetime, timezone, date

In [None]:
# Static names used throughout the notebook.
LOCAL_DIR = './data'
BUCKET = 'sm-fs-demo'
PREFIX = 'testing'
STREAM_NAME = 'cc-fs-stream'

s3_client = boto3.Session().client('s3')
# Provisional region; it is replaced below by the region the SageMaker
# session reports.
region='us-east-1'
kinesis_client = boto3.client('kinesis')
feature_group_name = 'cc-agg-fg'
# The role ARN below (and the Lambda ARN built later in this notebook) is
# formatted with `account_id`, which was never assigned anywhere. Resolve it
# from the credentials in use so the f-string does not raise NameError.
account_id = boto3.client('sts').get_caller_identity()['Account']
role = f'arn:aws:iam::{account_id}:role/sm-fs-streaming-agg-stack-SageMakerRole-WU81JV183YQ2'
sm = boto3.Session().client(service_name='sagemaker')
boto_session = boto3.Session(region_name=region)
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
smfs_runtime = boto3.Session().client(service_name='sagemaker-featurestore-runtime')
# NOTE: despite its name this is a client for the 'sagemaker' service, not
# for 'sagemaker-runtime'.
sagemaker_runtime = boto_session.client(service_name='sagemaker', region_name=region)

In [None]:
# Notebook-wide logger. Use the module's real __name__ rather than the
# literal string '__name__', which created a logger called "__name__".
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Attach the stream handler only once: re-running this cell would otherwise
# stack another handler and emit every message multiple times.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [None]:
def generate_event_timestamp():
    """Return the current UTC time as an ISO-8601 string.

    The value has millisecond precision and a trailing 'Z', for example
    '2023-01-31T12:34:56.789Z'.

    The current time is taken directly in UTC. Building a naive local
    datetime first and converting it afterwards is ambiguous during a
    daylight-saving fold, where the same wall-clock hour occurs twice.

    Returns:
        str: the formatted timestamp.
    """
    utc_now = datetime.now(timezone.utc)
    # isoformat() renders the UTC offset as '+00:00'; swap it for 'Z'.
    return utc_now.isoformat(timespec='milliseconds').replace('+00:00', 'Z')

### Access the transaction test dataset

In [None]:
# Read the test transactions CSV into a DataFrame and preview the first rows.
test_file_path = 'data/test.csv'
test_df = pd.read_csv(test_file_path)
test_df.head()

In [None]:
# NOTE: this head() call is not the last expression in the cell, so its
# result is not displayed.
test_df.head()
# Print the (rows, columns) shape of the test set.
print(test_df.shape)

In [None]:
# One row per distinct card number, in order of first appearance, with a
# fresh 0..n-1 index.
cc_map_df = test_df[['cc_num']].drop_duplicates().reset_index(drop=True)

In [None]:
# Preview the first five distinct card numbers.
cc_map_df.head(5)

In [None]:
# Number of distinct cards; the list of names assigned to cc_map_df['name']
# later must have this length.
len(cc_map_df)

In [None]:
from faker import Faker
import random

# Seed every random source used below so name generation is repeatable.
faker = Faker()
faker.seed_locale('en_US', 0)
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
faker.seed_instance(SEED)
TOTAL_UNIQUE_USERS = 10000

# NOTE(review): this overrides the BUCKET value assigned in the setup cell —
# confirm which bucket is intended.
BUCKET = 'chime-fs-demo'


def generate_fake_name(n: int) -> list:
    """Draw ``n`` fake names and return the distinct ones.

    Duplicates are dropped, so the result may hold fewer than ``n`` names.
    Names are returned in the order they were first generated. The earlier
    ``list(set(...))`` returned them in an order that depends on the
    per-process string hash seed, which made the output differ between
    kernel restarts even though the generators are seeded.

    Args:
        n: number of names to draw from the module-level ``faker``.

    Returns:
        list: unique names in first-seen order.
    """
    # dict keys keep insertion order, giving an order-preserving dedupe.
    return list(dict.fromkeys(faker.name() for _ in range(n)))


names1 = generate_fake_name(TOTAL_UNIQUE_USERS)

In [None]:
from itertools import cycle, islice

# Build exactly one name per row of cc_map_df by cycling through the unique
# names. The previous hard-coded padding of 627 only produced the right
# length for one specific combination of dataset size and generated names;
# for that combination this yields the same list (names1 + names1[:627]).
names = list(islice(cycle(names1), len(cc_map_df)))

# Names reused beyond the first pass over names1 (empty if none were needed).
names2 = names[len(names1):]

In [None]:
# Attach one name per distinct card; `names` must match cc_map_df in length.
cc_map_df['name'] = np.array(names)

# Cast the join key to str in both frames so the later merge on 'cc_num'
# compares like with like.
for frame in (test_df, cc_map_df):
    frame['cc_num'] = frame['cc_num'].astype(str)

In [None]:
# Show the column dtypes of the card-to-name mapping.
cc_map_df.dtypes

In [None]:
# Left-join the card-to-name mapping onto the transactions so every row
# carries the 'name' attribute of its card.
test_df = pd.merge(test_df, cc_map_df, how='left', on='cc_num')
test_df.head(5)

In [None]:
kinesis_client = boto3.client('kinesis')
stream='cc-fs-upd-stream'
try:
    kinesis_client.create_stream(StreamName=stream, ShardCount=1)
except kinesis_client.exceptions.ResourceInUseException:
    # Re-running the cell after the stream was created should not fail.
    print(f'Stream {stream} already exists; reusing it.')

# create_stream returns before the stream is usable; block until it is
# ACTIVE so the cells that consume and write to it do not race creation.
kinesis_client.get_waiter('stream_exists').wait(StreamName=stream)

In [None]:
# Look up the ARN of the stream; it is used as the input resource of the
# Kinesis Data Analytics application configured below.
stream_arn = kinesis_client.describe_stream(StreamName=stream)['StreamDescription']['StreamARN']

In [None]:
# Client for the Kinesis Data Analytics (SQL applications) API.
kda_client = boto3.client('kinesisanalytics')

In [None]:
# SQL for the analytics application: declare the destination stream, then a
# pump that emits, per (cc_num, name), the count and average amount of
# transactions over the preceding 10 minutes.
# NOTE(review): "name" is declared VARCHAR(16); confirm that is wide enough
# for the generated names.
sql_code = ''.join((
    'CREATE OR REPLACE STREAM "DESTINATION_SQL_STREAM" (\n',
    '"cc_num"              BIGINT,\n',
    '"name"              VARCHAR(16),\n',
    '"num_trans_last_10m"  SMALLINT,\n',
    '"avg_amt_last_10m"    REAL\n);\n\n',
    'CREATE OR REPLACE PUMP "STREAM_PUMP" AS\n',
    'INSERT INTO "DESTINATION_SQL_STREAM"\n',
    'SELECT STREAM "cc_num",  \n',
    ' "name", \n',
    'COUNT(*) OVER LAST_10_MINUTES, \n',
    'AVG("amount") OVER LAST_10_MINUTES\n',
    'FROM "SOURCE_SQL_STREAM_001"\n',
    'WINDOW LAST_10_MINUTES AS (\n',
    'PARTITION BY "cc_num","name" \n',
    "RANGE INTERVAL '10' MINUTE PRECEDING);\n",
))

In [None]:
# Columns read from each JSON record on the input stream.
record_columns = [
    {'Name': 'cc_num',  'Mapping': '$.cc_num',   'SqlType': 'DECIMAL(1,1)'},
    {'Name': 'name', 'Mapping': '$.name', 'SqlType': 'VARCHAR(16)'},
    {'Name': 'merchant','Mapping': '$.merchant', 'SqlType': 'VARCHAR(64)'},
    {'Name': 'amount', 'Mapping': '$.amount', 'SqlType': 'REAL'},
    {'Name': 'zip_code', 'Mapping': '$.zip_code', 'SqlType': 'INTEGER'},
]

# Records are JSON; each row is the document root.
record_format = {
    'RecordFormatType': 'JSON',
    'MappingParameters': {
        'JSONMappingParameters': {'RecordRowPath': '$'},
    },
}

# Input configuration for the analytics application: read the Kinesis stream
# with the given role and map it onto the schema above.
kda_inputs = [{
    'NamePrefix': 'SOURCE_SQL_STREAM',
    'KinesisStreamsInput': {
        'ResourceARN': stream_arn,
        'RoleARN': role,
    },
    'InputSchema': {
        'RecordFormat': record_format,
        'RecordEncoding': 'UTF-8',
        'RecordColumns': record_columns,
    },
}]

In [None]:
# ARN of the Lambda function that receives the application's output rows.
lambda_to_fs_arn = f'arn:aws:lambda:us-east-1:{account_id}:function:StreamingIngestAggNewFeatures'

# Output configuration: deliver DESTINATION_SQL_STREAM rows to that Lambda
# as JSON.
kda_outputs = [
    {
        'LambdaOutput': {
            'ResourceARN': lambda_to_fs_arn,
            'RoleARN': role,
        },
        'Name': 'DESTINATION_SQL_STREAM',
        'DestinationSchema': {'RecordFormatType': 'JSON'},
    },
]

In [None]:
# Create the SQL analytics application from the input, output and code
# definitions built in the cells above.
kda_client.create_application(
    ApplicationName='cc-agg-fs-upd-app',
    Inputs=kda_inputs,
    Outputs=kda_outputs,
    ApplicationCode=sql_code,
)

In [None]:
# Start the application, reading input '1.1' from the current position of
# the stream (records sent from now on).
input_configurations = [
    {
        'Id': '1.1',
        'InputStartingPositionConfiguration': {'InputStartingPosition': 'NOW'},
    },
]
kda_client.start_application(
    ApplicationName='cc-agg-fs-upd-app',
    InputConfigurations=input_configurations,
)

## Test out the solution, end to end

### First, a few utility functions

In [None]:
def get_cloudwatch_logs_url(start_time, end_time, lambda_name=None):
    """Return a CloudWatch console URL for the Lambda's most recent log stream.

    Polls the Lambda's log group every 10 seconds until its most recently
    active log stream has ingested events at or after ``start_time``, then
    builds a console URL filtered on the pattern "Prediction" and on a time
    window around the test.

    The function now uses its own ``start_time``/``end_time`` arguments
    instead of the notebook globals ``start_test_time``/``end_test_time``,
    and no longer fails with IndexError while the log group has no streams.

    Args:
        start_time: test start, in epoch seconds.
        end_time: test end, in epoch seconds.
        lambda_name: name of the Lambda function whose logs to open. When
            omitted, the notebook-level ``predict_lambda_name`` is used.

    Returns:
        str: the console URL.
    """
    if lambda_name is None:
        lambda_name = predict_lambda_name
    log_group_name = '/aws/lambda/' + lambda_name
    start_ms = int(start_time * 1000)
    end_ms = int(end_time * 1000)

    # get the latest log stream for our Lambda that makes fraud predictions
    cw_client = boto3.client('logs')
    latest_stream = None
    while latest_stream is None:
        streams = cw_client.describe_log_streams(logGroupName=log_group_name,
                                                 orderBy='LastEventTime',
                                                 descending=True)['logStreams']
        if streams and streams[0].get('lastIngestionTime', 0) >= start_ms:
            # Escape the stream name the way the console URL fragment expects.
            latest_stream = str(streams[0]['logStreamName']).replace('/', '$252F').replace('[$LATEST]', '$255B$2524LATEST$255D')
        else:
            print('waiting for updated log stream...')
            time.sleep(10)

    # produce a valid URL to get to that log stream
    region = boto3.session.Session().region_name
    log_group_escaped = log_group_name.replace('/', '$252F')
    cw_url = f'https://console.aws.amazon.com/cloudwatch/home?region={region}#logsV2:log-groups/log-group/{log_group_escaped}'
    # Widen the window: 10 s before the start and 40 s after the end.
    time_filter = f'$26start$3D{start_ms - 10000}$26end$3D{end_ms + 40000}'
    full_cw_url = f'{cw_url}/log-events/{latest_stream}$3FfilterPattern$3DPrediction+{time_filter}'
    print('Updated log stream is ready.')
    return full_cw_url

In [None]:
def put_to_stream(stream_name, cc_num, name, merchant, amount, zip_code, timestamp):
    """Send one transaction to a Kinesis stream as a JSON record.

    Uses the module-level ``kinesis_client`` and the fixed partition key
    'shard1'.

    Args:
        stream_name: name of the target stream.
        cc_num: card number; converted to int for the payload.
        name: cardholder name; converted to str for the payload.
        merchant: merchant value, sent as given.
        amount: transaction amount, sent as given.
        zip_code: zip code value, sent as given.
        timestamp: transaction time, sent under the key 'trans_ts'.

    Returns:
        bool: True when the response's HTTP status code is 200, else False.
    """
    record = json.dumps({
        'cc_num': int(cc_num),
        'name': str(name),
        'merchant': merchant,
        'amount': amount,
        'zip_code': zip_code,
        'trans_ts': timestamp,
    })
    print(f'Sending transaction on card: {cc_num}...')
    response = kinesis_client.put_record(
        StreamName=stream_name,
        Data=record,
        PartitionKey='shard1',
    )

    if response['ResponseMetadata']['HTTPStatusCode'] == 200:
        return True

    print(f"ERROR: Kinesis put_record failed: \n{json.dumps(response)}")
    return False

In [None]:
def simulate_fraud(stream_name, cc_num, name):
    """Send ten small transactions on one card in quick succession.

    Each transaction has a random amount between 1.00 and 50.00, rounded to
    two decimals, and is sent after a random pause of 1 to 2 seconds.

    Args:
        stream_name: name of the target Kinesis stream.
        cc_num: card number to send the transactions on.
        name: cardholder name sent with each transaction.
    """
    min_wait, max_wait = 1, 2
    for trans_idx in range(10):
        random_amt = round(np.random.uniform(1.00, 50.00), 2)
        pause = np.random.uniform(min_wait, max_wait)
        print(f'waiting {pause:.1f} seconds to send trans {trans_idx}...')
        time.sleep(pause)
        put_to_stream(
            stream_name, int(cc_num), name, 'Random Corp', random_amt, '03099', time.time()
        )

## Update the Feature Store


In [None]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Pass the session explicitly so the handle does not rely on the class's
# default value for `sagemaker_session`.
feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sagemaker_session)

# Runtime client used below for get_record / delete_record calls.
featurestore_runtime_client = sagemaker_session.boto_session.client('sagemaker-featurestore-runtime', region_name='us-east-1')

In [None]:
# The feature-group update itself is left disabled, as in the original cell.
# logger.info(f'Update feature group: {feature_group.name} at {generate_event_timestamp()}...')

# sagemaker_runtime.update_feature_group(
#     FeatureGroupName=feature_group_name,
#     FeatureAdditions=[
#         {"FeatureName": "name", "FeatureType": "String"}
#     ])

#INSERT TO UPDATED STREAM
cc_nums = test_df.cc_num.unique()[10:14]
# Look names up by card number. `names` is positionally aligned with
# cc_map_df, so names[0], names[2] and names[3] belong to the first cards in
# the mapping, not to the cards selected above (unique cards 10..13).
name_by_card = cc_map_df.set_index('cc_num')['name']
start_test_time = time.time() 
STREAM_NAME = stream

logger.info(f'Put records to stream: {STREAM_NAME} at {generate_event_timestamp()}...')
put_to_stream(STREAM_NAME, cc_nums[0], name_by_card[cc_nums[0]], 'Merchant-0', round(np.random.uniform(100, 5000), 2), 'zip-0', time.time())
put_to_stream(STREAM_NAME, cc_nums[0], name_by_card[cc_nums[0]], 'Merchant-1', round(np.random.uniform(100, 5000), 2), 'zip-1', time.time())
put_to_stream(STREAM_NAME, cc_nums[2], name_by_card[cc_nums[2]], 'Merchant-2', round(np.random.uniform(100, 5000), 2), 'zip-2', time.time())

logger.info(f'Endtime put NON FRAUD record to: {STREAM_NAME} at {generate_event_timestamp()}...')

print('\nNow simulate a fraud attack...')
fraud_cc_num = cc_nums[3]
simulate_fraud(STREAM_NAME, fraud_cc_num, name_by_card[fraud_cc_num])

# Mark the end of the test window. The CloudWatch link cell further down
# reads `end_test_time`, which was never assigned anywhere.
end_test_time = time.time()

logger.info(f'Endtime put FRAUD record to: {STREAM_NAME} at {generate_event_timestamp()}...')

#VALIDATE RECORD
cc_num= '4997379740995969'
logger.info(f'GET RECORD FOR ccnum={cc_num} at {generate_event_timestamp()}') 

feature_record = featurestore_runtime_client.get_record(FeatureGroupName=feature_group_name, 
                                                        RecordIdentifierValueAsString=cc_num)
print(feature_record)

logger.info(f'GET RECORD OUTPUT FOR ccnum={cc_num} at {generate_event_timestamp()}') 

<img src="./images/update_online_feature_group.png" />

In [None]:
#VALIDATE RECORD
# Fetch the online-store record for one card and log the time before and
# after the lookup.
cc_num = '4997379740995969'
logger.info(f'GET RECORD FOR ccnum={cc_num} at {generate_event_timestamp()}')

record_lookup = {
    'FeatureGroupName': feature_group_name,
    'RecordIdentifierValueAsString': cc_num,
}
feature_record = featurestore_runtime_client.get_record(**record_lookup)
print(feature_record)

logger.info(f'GET RECORD OUTPUT FOR ccnum={cc_num} at {generate_event_timestamp()}')

### Results can be seen in the CloudWatch log stream of our Lambda function
The following cell dynamically creates a link to view the results. It waits for the CloudWatch log stream to have the output events from the transactions we just sent. The URL also homes in on the output from the specific timeframe of the transactions.

In [None]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Wait for the Lambda's log stream to catch up with the test window, then
# render a clickable link to it.
full_cw_url = get_cloudwatch_logs_url(start_test_time, end_test_time)
display(HTML(f'<b>Review results in this log stream <a target="blank" href="{full_cw_url}">Lambda fraud detection results</a></b>'))

### Feed a stream of transactions [optional]
If you would like to send additional credit card transactions to simulate more input traffic to the feature pipeline, you can pull from the test dataset as shown below. Just pass in how many transactions you want to send, and the max wait time between transactions (in seconds).

In [None]:
import time

def simulate_traffic(df, max_wait, num_trans):
    """Replay the first rows of ``df`` as transactions on STREAM_NAME.

    For each row the card number, name and amount are read from the columns
    'cc_num', 'name' and 'amount'; merchant and zip code are fixed
    placeholders ('A' and '0'). Sending stops at the first failed write.

    The rows now come from the ``df`` argument rather than the global
    ``test_df``, and at most ``len(df)`` rows are sent, so asking for more
    transactions than there are rows no longer raises IndexError.

    Args:
        df: DataFrame with 'cc_num', 'name' and 'amount' columns.
        max_wait: upper bound, in seconds, for the random pause before each
            send. The drawn value is truncated to whole seconds.
        num_trans: maximum number of transactions to send.
    """
    for i in range(min(num_trans, len(df))):
        row = df.iloc[i]
        cc_num = row['cc_num']
        name = row['name']
        zip_code = '0'
        merchant = 'A'
        amt = row['amount']
        print(f'cc_num: {cc_num}, amt: {amt}')
        seconds_to_wait = int(np.random.uniform(0.1, max_wait))
        print(f'waiting {seconds_to_wait} seconds to send trans {i}...')
        time.sleep(seconds_to_wait)
        print(f' putting trans with card: {cc_num}, name: {name}, amt: {amt}, zip: {zip_code}, merchant: {merchant}')
        status = put_to_stream(STREAM_NAME,cc_num, name, merchant, amt, zip_code, time.time())
        if not status:
            print('Error found during write to Kinesis Stream')
            break

In [None]:
# Send 2 transactions from the test set, with a max wait argument of 2 seconds.
simulate_traffic(test_df, 2, 2)

In [None]:
#VALIDATE RECORD
# Fetch the online-store record for one card and log the time before and
# after the lookup.
cc_num = '4006080197832643'
logger.info(f'GET RECORD FOR ccnum={cc_num} at {generate_event_timestamp()}')

record_lookup = {
    'FeatureGroupName': feature_group_name,
    'RecordIdentifierValueAsString': cc_num,
}
feature_record = featurestore_runtime_client.get_record(**record_lookup)
print(feature_record)

logger.info(f'GET RECORD OUTPUT FOR ccnum={cc_num} at {generate_event_timestamp()}')

In [None]:
# Timing walk-through: delete the record for `cc_num` (set in the previous
# cell), send two transactions again, then read the record back. A timestamp
# is printed around each step.
print('***Delete Iniatiation Time***',generate_event_timestamp())

# NOTE(review): EventTime is passed as epoch seconds rendered as a string —
# confirm this matches the event-time format the feature group expects.
feature_record = featurestore_runtime_client.delete_record(FeatureGroupName=feature_group_name, 
                                                        RecordIdentifierValueAsString=cc_num,
                                                        EventTime= str(time.time()) )
print('***Insert Start Time***',generate_event_timestamp())

simulate_traffic(test_df, 2, 2)


print('***Insert End Time***',generate_event_timestamp())

# Pause for one second before reading the record back.
time.sleep(1)

feature_record = featurestore_runtime_client.get_record(FeatureGroupName=feature_group_name, 
                                                        RecordIdentifierValueAsString=cc_num)
print('***Read Time***',generate_event_timestamp())

# Display the get_record response as the cell output.
feature_record