# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


In [9]:
%session_id_prefix fg-offline-scale
%glue_version 3.0
%idle_timeout 480
%number_of_workers 10
%worker_type G.2X 
%additional_python_modules 'sagemaker,sagemaker-feature-store-pyspark-3.1'
%extra_jars 's3://sm-fs-demo/spark-connectors-jars/sagemaker-feature-store-spark-sdk.jar'
%%configure
{
    "--enable-spark-ui": "true",
    "--spark-event-logs-path": "s3://sm-fs-demo/gis-spark-logs/",
    "--enable-auto-scaling": "true",
    "--enable-metrics": "true",
    "--enable-continuous-cloudwatch-log": "true",
    "--enable-continuous-log-filter": "true",
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.0 
Setting session ID prefix to fg-offline-scale
Setting Glue version to: 3.0
Current idle_timeout is 2800 minutes.
idle_timeout has been set to 480 minutes.
Previous number of workers: 5
Setting new number of workers to: 10
Previous worker type: G.1X
Setting new worker type to: G.2X
Additional python modules to be included:
sagemaker
sagemaker-feature-store-pyspark-3.1
Extra jars to be included:
s3://sm-fs-demo/spark-connectors-jars/sagemaker-feature-store-spark-sdk.jar
The following configurations have been updated: {'--enable-spark-ui': 'true', '--spark-event-logs-path': 's3://chime-fs-demo/gis-spark-logs/', '--enable-auto-scaling': 'true',

In [1]:
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import boto3
import json
import time
import pandas as pd
import sagemaker
from sagemaker.feature_store.feature_group import FeatureDefinition, FeatureGroup, FeatureTypeEnum, DataCatalogConfig
from pyspark.sql import SparkSession
from feature_store_pyspark.FeatureStoreManager import FeatureStoreManager
import feature_store_pyspark
from sagemaker.feature_store.inputs import TableFormatEnum

Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::461312420708:role/Admin
Trying to create a Glue session for the kernel.
Worker Type: G.2X
Number of Workers: 10
Session ID: 2ddfb6fa-44e1-40e1-8c01-f9c0fd499404
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.37.0
--enable-glue-datacatalog true
--additional-python-modules sagemaker,sagemaker-feature-store-pyspark-3.1
--extra-jars s3://sm-fs-demo/spark-connectors-jars/sagemaker-feature-store-spark-sdk.jar
--enable-spark-ui true
--spark-event-logs-path s3://chime-fs-demo/gis-spark-logs/
--enable-auto-scaling true
--enable-metrics true
--enable-continuous-cloudwatch-log true
--enable-continuous-log-filter true
Waiting for session 2ddfb6fa-44e1-40e1-8c01-f9c0fd499404 to get into ready status...
Session 2ddfb6fa-44e1-40e1-8c01-f9c0fd499404 has been created.



In [29]:
# Offline-store table format handed to fg.create(table_format=...).
TABLE_FORMAT = TableFormatEnum.ICEBERG

# Feature group name. A plain literal: there is nothing to interpolate, so no
# f-string prefix is needed.
FG_NAME = 'cc-transactions-fg'

# Column names used as the event-time feature and the record identifier
# when the feature group is created.
EVENT_TIME_NAME = 'datetime'
RECORD_ID_NAME = 'tid'




In [3]:
def get_table_name(feature_group_name):
    """Return the Data Catalog table name of a feature group's offline store.

    Describes the feature group through the notebook-level
    ``sagemaker_session`` and reads
    ``OfflineStoreConfig -> DataCatalogConfig -> TableName`` from the response.
    """
    description = sagemaker_session.describe_feature_group(feature_group_name)
    catalog_config = description['OfflineStoreConfig']['DataCatalogConfig']
    return catalog_config['TableName']

def get_offline_store_s3_uri(feature_group_name):
    """Return the resolved S3 output URI of a feature group's offline store.

    Describes the feature group through the notebook-level
    ``sagemaker_session`` and reads
    ``OfflineStoreConfig -> S3StorageConfig -> ResolvedOutputS3Uri`` from the
    response.
    """
    description = sagemaker_session.describe_feature_group(feature_group_name)
    storage_config = description['OfflineStoreConfig']['S3StorageConfig']
    return storage_config['ResolvedOutputS3Uri']




In [22]:
# Session object the helper functions call describe_feature_group() on.
sagemaker_session = sagemaker.Session()
# Spark connector entry point; its ingest_data() method performs the ingestion.
feature_store_manager= FeatureStoreManager()
# IAM role ARN passed to fg.create(role_arn=...).
# NOTE(review): account-specific value is hard-coded; consider reading it from configuration.
role = 'arn:aws:iam::461312420708:role/Admin'
# Bucket used to build the offline store URI and the Athena output location.
default_bucket = 'sm-fs-demo'




In [5]:
def ingest_data_to_feature_store(dataframe, feature_group_name, sagemaker_session, target_stores):
    """Ingest a Spark DataFrame into the named feature group.

    Looks up the feature group's ARN via ``sagemaker_session`` and hands the
    DataFrame to the notebook-level ``feature_store_manager`` for ingestion
    into ``target_stores``. Prints a completion message afterwards.
    """
    description = sagemaker_session.describe_feature_group(feature_group_name)
    feature_store_manager.ingest_data(
        input_data_frame=dataframe,
        feature_group_arn=description['FeatureGroupArn'],
        target_stores=target_stores,
    )
    print(f'Process - ingest_to_feature_store - {feature_group_name} : Completed')




In [6]:
# S3 prefix passed to fg.create(s3_uri=...) as the offline store location.
offline_feature_store_uri = f's3://{default_bucket}/sagemaker-feature-store'

print(f'Location of offline store: {offline_feature_store_uri}')

Location of offline store: s3://sm-fs-demo/sagemaker-feature-store


In [7]:
# Spark session used below to read the aggregated CSV input.
spark = SparkSession.builder.getOrCreate()




In [8]:
# Load every part file of the aggregated transactions CSV. The first line of
# each file is treated as a header. No schema is supplied or inferred, so all
# columns come back as strings.
full_df = spark.read.csv("s3://sm-fs-demo/aggregated/part-*.csv", header=True)




In [9]:
full_df.show(n=5)

+--------------------+--------------------+----------------+------+-----------+------------------+----------------+-----------------+------------------+------------------+------------------+------------------+
|                 tid|            datetime|          cc_num|amount|fraud_label|num_trans_last_10m|avg_amt_last_10m|num_trans_last_1w|   avg_amt_last_1w|        amt_ratio1|        amt_ratio2|       count_ratio|
+--------------------+--------------------+----------------+------+-----------+------------------+----------------+-----------------+------------------+------------------+------------------+------------------+
|9865906a3fc8ffb36...|2020-01-01T08:03:...|4006080197832643| 89.69|          0|                 1|           89.69|                1|             89.69|               1.0|               1.0|               1.0|
|b18b52528c812800f...|2020-01-01T11:23:...|4006080197832643| 57.98|          0|                 1|           57.98|                2|            73.835|0.785264

In [10]:
# Keep the record identifier ('tid'), the event time ('datetime'), the label
# and the feature columns that make up the feature group.
train_df = full_df.select('tid','datetime','fraud_label', 'amount', 'amt_ratio1','amt_ratio2','count_ratio')




In [10]:
# train_df.na.drop().show(truncate=False)

In [12]:
# For every column, count the rows whose value looks missing: the text contains
# 'None' or 'NULL', is an empty string, is null, or is NaN.
train_df.select(
    [
        count(
            when(
                col(column_name).contains('None')
                | col(column_name).contains('NULL')
                | (col(column_name) == '')
                | col(column_name).isNull()
                | isnan(column_name),
                column_name,
            )
        ).alias(column_name)
        for column_name in train_df.columns
    ]
).show()

+---+--------+-----------+------+----------+----------+-----------+
|tid|datetime|fraud_label|amount|amt_ratio1|amt_ratio2|count_ratio|
+---+--------+-----------+------+----------+----------+-----------+
|  1|       1|          1|     1|         1|         1|          0|
+---+--------+-----------+------+----------+----------+-----------+


In [13]:
# Drop the rows that have no record identifier.
train_filtered_df = train_df.where(train_df['tid'].isNotNull())




In [14]:
# Re-count NaN / null values per column on the filtered frame.
train_filtered_df.select(
    [
        count(
            when(isnan(column_name) | col(column_name).isNull(), column_name)
        ).alias(column_name)
        for column_name in train_df.columns
    ]
).show()

+---+--------+-----------+------+----------+----------+-----------+
|tid|datetime|fraud_label|amount|amt_ratio1|amt_ratio2|count_ratio|
+---+--------+-----------+------+----------+----------+-----------+
|  0|       0|          0|     0|         0|         0|          0|
+---+--------+-----------+------+----------+----------+-----------+


In [15]:
train_filtered_df.dtypes

[('tid', 'string'), ('datetime', 'string'), ('fraud_label', 'string'), ('amount', 'string'), ('amt_ratio1', 'string'), ('amt_ratio2', 'string'), ('count_ratio', 'string')]


In [17]:
# One-row pandas frame with the same column names as train_df; it is passed to
# fg.load_feature_definitions() so the feature definitions can be derived from it.
sample_df = pd.DataFrame([['d621c8d794262ad5e8ad804cb4517395', '2023-04-02T19:53:45.483Z', 1,8911.09, 1.0,1.0,1.0]], 
                  columns=['tid', 'datetime', 'fraud_label', 'amount', 'amt_ratio1','amt_ratio2','count_ratio'])


def cast_object_to_string(data_frame):
    """Convert every ``object``-dtype column of ``data_frame`` to ``string``.

    The frame is modified in place and nothing is returned. Columns with any
    other dtype are left untouched.
    """
    for column_name in data_frame.columns:
        is_object_column = data_frame.dtypes[column_name] == 'object'
        if not is_object_column:
            continue
        # Go through Python str first, then to the pandas "string" dtype.
        as_python_str = data_frame[column_name].astype("str")
        data_frame[column_name] = as_python_str.astype("string")

# Convert the object-typed columns first, then cast the remaining columns to
# the pandas "string" dtype so that every column of the sample frame is a string.
cast_object_to_string(sample_df)
for numeric_column in ('fraud_label', 'amount', 'amt_ratio1', 'amt_ratio2', 'count_ratio'):
    sample_df[numeric_column] = sample_df[numeric_column].astype('string')




In [45]:
# Handle for the feature group named FG_NAME, bound to the SageMaker session.
fg = FeatureGroup(name=FG_NAME, sagemaker_session=sagemaker_session)
# Derive the feature definitions from the sample frame's columns.
fg.load_feature_definitions(data_frame=sample_df)

[FeatureDefinition(feature_name='tid', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='datetime', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='fraud_label', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='amount', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='amt_ratio1', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='amt_ratio2', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='count_ratio', feature_type=<FeatureTypeEnum.STRING: 'String'>)]


In [46]:
# In case we run this example multiple times, delete the prior FG since we are
# recreating it. Deletion stays best-effort (for example, the group may not
# exist yet), but the reason is reported instead of being silently discarded,
# and KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    fg.delete()
except Exception as delete_error:
    print(f'Skipping feature group deletion: {delete_error}')




In [31]:
# Create the offline-only feature group in the configured table format.
create_response = fg.create(
    record_identifier_name=RECORD_ID_NAME,
    event_time_feature_name=EVENT_TIME_NAME,
    role_arn=role,
    s3_uri=offline_feature_store_uri,
    enable_online_store=False,
    table_format=TABLE_FORMAT,
)

# Creation is asynchronous: the call above returns while the group is still
# being provisioned. Poll until it leaves the 'Creating' state so that the
# cells that query or ingest into the group also work on a Run All.
creation_status = fg.describe()['FeatureGroupStatus']
while creation_status == 'Creating':
    time.sleep(5)
    creation_status = fg.describe()['FeatureGroupStatus']
if creation_status != 'Created':
    raise RuntimeError(f'Feature group {FG_NAME} ended in status {creation_status}')

# Keep the create response as the cell's displayed value.
create_response

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:461312420708:feature-group/cc-transactions-fg', 'ResponseMetadata': {'RequestId': 'b4159903-afb4-48d6-a4a9-27fb84846d27', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'b4159903-afb4-48d6-a4a9-27fb84846d27', 'content-type': 'application/x-amz-json-1.1', 'content-length': '95', 'date': 'Sun, 09 Apr 2023 20:01:47 GMT'}, 'RetryAttempts': 0}}


In [32]:
from datetime import datetime, timezone, date

def generate_event_timestamp():
    """Return the current UTC time as an ISO-8601 string with a ``Z`` suffix.

    The timestamp has millisecond precision, e.g. ``2023-04-09T20:02:36.236Z``.
    """
    # Ask for an aware UTC datetime directly instead of taking naive local time
    # and converting it, so the result does not depend on the host's local
    # timezone configuration.
    utc_now = datetime.now(timezone.utc)
    # isoformat() renders the UTC offset as '+00:00'; use the 'Z' designator.
    return utc_now.isoformat(timespec='milliseconds').replace('+00:00', 'Z')




In [39]:
# Athena query handle for the feature group and the table it reads from.
fs_query = fg.athena_query()
fs_table = fs_query.table_name
query_results = 'athena-results'

# Preview query and the S3 location where Athena writes its results.
query_string = f'SELECT * FROM "{fs_table}" limit 10'
output_location = f's3://{default_bucket}/{query_results}/query_results/'

print(query_string)
print(f'Athena query output location: \n{output_location}')

SELECT * FROM "cc_transactions_fg_1681070507" limit 10
Athena query output location: 
s3://sm-fs-demo/athena-results/query_results/


In [43]:
# Run the preview query before any data has been ingested.
fs_query.run(query_string=query_string, output_location=output_location)
fs_query.wait()
# Load the result set into a DataFrame and show its first rows.
query_df = fs_query.as_dataframe()
query_df.head(5)

Empty DataFrame
Columns: [write_time, api_invocation_time, is_deleted, tid, datetime, fraud_label, amount, amt_ratio1, amt_ratio2, count_ratio]
Index: []


In [33]:
generate_event_timestamp()

'2023-04-09T20:02:36.236Z'


In [44]:
# Timestamps are printed around each step, so the statement order below is
# what is being measured.
print('Spark Ingestion Starttime', generate_event_timestamp())

# Write the filtered training frame to the offline store only.
ingest_data_to_feature_store(train_filtered_df, FG_NAME, sagemaker_session, target_stores=['OfflineStore'])

print('Spark Ingestion Endtime', generate_event_timestamp())

# Re-run the same preview query now that data has been ingested.
fs_query.run(query_string=query_string, output_location=output_location)
fs_query.wait()

print('Athena query completion time', generate_event_timestamp())

query_df = fs_query.as_dataframe()
query_df.head(5)

Spark Ingestion Starttime 2023-04-09T20:15:24.212Z
Process - ingest_to_feature_store - cc-transactions-fg : Completed
Spark Ingestion Endtime 2023-04-09T20:16:43.124Z
Athena query completion time 2023-04-09T20:16:53.308Z
                    write_time  ... count_ratio
0  2023-04-09 20:15:40.501 UTC  ...    0.038462
1  2023-04-09 20:15:40.501 UTC  ...    0.030303
2  2023-04-09 20:15:40.501 UTC  ...    0.043478
3  2023-04-09 20:15:40.501 UTC  ...    0.040000
4  2023-04-09 20:15:40.501 UTC  ...    0.047619

[5 rows x 10 columns]
