# Spark Job to load Offline Feature Store
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


In [None]:
%session_id_prefix fg-offline-scale
%glue_version 3.0
%idle_timeout 480
%number_of_workers 10
%worker_type G.2X 
%additional_python_modules 'sagemaker,sagemaker-feature-store-pyspark-3.1'
%extra_jars 's3://sm-fs-demo/spark-connectors-jars/sagemaker-feature-store-spark-sdk.jar'
%%configure
{
    "--enable-spark-ui": "true",
    "--spark-event-logs-path": "s3://sm-fs-demo/gis-spark-logs/",
    "--enable-auto-scaling": "true",
    "--enable-metrics": "true",
    "--enable-continuous-cloudwatch-log": "true",
    "--enable-continuous-log-filter": "true",
}

In [None]:
%stop_session

In [None]:
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import boto3
import json
import time
import pandas as pd
import sagemaker
from sagemaker.feature_store.feature_group import FeatureDefinition, FeatureGroup, FeatureTypeEnum, DataCatalogConfig
from pyspark.sql import SparkSession
from feature_store_pyspark.FeatureStoreManager import FeatureStoreManager
import feature_store_pyspark
from sagemaker.feature_store.inputs import TableFormatEnum

In [None]:
# Offline-store table format requested when the feature group is created.
TABLE_FORMAT = TableFormatEnum.ICEBERG

# Name of the feature group this notebook creates and loads.
FG_NAME = 'cc-transaction-fg'

# Columns used as the record identifier and the event time of each record.
RECORD_ID_NAME = 'tid'
EVENT_TIME_NAME = 'datetime'

In [None]:
def get_table_name(feature_group_name):
    """Return the Data Catalog table name of a feature group's offline store.

    The feature group is described through the notebook-level
    ``sagemaker_session`` and the value is read from
    ``OfflineStoreConfig -> DataCatalogConfig -> TableName``.

    Args:
        feature_group_name: Name of the feature group to describe.

    Returns:
        The ``TableName`` value from the describe response.

    Raises:
        KeyError: If the describe response lacks any of the keys above.
    """
    description = sagemaker_session.describe_feature_group(feature_group_name)
    catalog_config = description['OfflineStoreConfig']['DataCatalogConfig']
    return catalog_config['TableName']

def get_offline_store_s3_uri(feature_group_name):
    """Return the resolved S3 output URI of a feature group's offline store.

    The feature group is described through the notebook-level
    ``sagemaker_session`` and the value is read from
    ``OfflineStoreConfig -> S3StorageConfig -> ResolvedOutputS3Uri``.

    Args:
        feature_group_name: Name of the feature group to describe.

    Returns:
        The ``ResolvedOutputS3Uri`` value from the describe response.

    Raises:
        KeyError: If the describe response lacks any of the keys above.
    """
    description = sagemaker_session.describe_feature_group(feature_group_name)
    storage_config = description['OfflineStoreConfig']['S3StorageConfig']
    return storage_config['ResolvedOutputS3Uri']

In [None]:
# SageMaker session used for the describe/create calls in this notebook.
sagemaker_session = sagemaker.Session()

# Helper whose ingest_data() writes a Spark DataFrame into a feature group.
feature_store_manager = FeatureStoreManager()

# TODO: replace this placeholder with the ARN of the IAM role for executing
# the Glue job. The value is passed as `role_arn` when the feature group is
# created, so creation fails until a real ARN is supplied.
role = 'arn:aws:iam::<account-id>:role/<role-name>'

# Bucket that holds the offline store prefix and the Athena query results.
default_bucket = 'sm-fs-demo'

In [None]:
def ingest_data_to_feature_store(dataframe, feature_group_name, sagemaker_session, target_stores):
    """Ingest a DataFrame into the named feature group and print a completion line.

    The feature group's ARN is looked up with ``sagemaker_session`` and the
    ingestion itself is delegated to the notebook-level ``feature_store_manager``.

    Args:
        dataframe: DataFrame handed to ``ingest_data`` as ``input_data_frame``.
        feature_group_name: Name of the target feature group.
        sagemaker_session: Session used to describe the feature group.
        target_stores: Passed through unchanged to ``ingest_data``.
    """
    description = sagemaker_session.describe_feature_group(feature_group_name)
    feature_store_manager.ingest_data(
        input_data_frame=dataframe,
        feature_group_arn=description['FeatureGroupArn'],
        target_stores=target_stores,
    )
    print(f'Process - ingest_to_feature_store - {feature_group_name} : Completed')

In [None]:
# S3 location under the default bucket that is later passed to fg.create()
# as the offline store's `s3_uri`.
offline_feature_store_uri = f's3://{default_bucket}/sagemaker-feature-store'

print(f'Location of offline store: {offline_feature_store_uri}')

In [None]:
# Obtain the Spark session used for all DataFrame reads in this notebook.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load every aggregated CSV part file, using the first row as column names.
aggregated_csv_path = "s3://sm-fs-demo/aggregated/part-*.csv"
full_df = spark.read.csv(aggregated_csv_path, header="True")

In [None]:
# Preview the first five rows of the loaded data.
full_df.show(n=5)

In [None]:
# Keep only the columns that are ingested into the feature group.
train_columns = [
    'tid', 'cc_num', 'datetime', 'fraud_label',
    'amount', 'amt_ratio1', 'amt_ratio2', 'count_ratio',
]
train_df = full_df.select(*train_columns)

In [None]:
# train_df.na.drop().show(truncate=False)

In [None]:
# For each column, count the rows whose value looks missing: contains the
# text 'None' or 'NULL', is an empty string, is null, or is NaN.
missing_value_counts = []
for column_name in train_df.columns:
    looks_missing = (
        col(column_name).contains('None')
        | col(column_name).contains('NULL')
        | (col(column_name) == '')
        | col(column_name).isNull()
        | isnan(column_name)
    )
    missing_value_counts.append(
        count(when(looks_missing, column_name)).alias(column_name)
    )

train_df.select(missing_value_counts).show()

In [None]:
# Drop rows that have no `tid` value.
has_tid = train_df['tid'].isNotNull()
train_filtered_df = train_df.filter(has_tid)

In [None]:
# Re-check the filtered frame: per column, count values that are NaN or null.
nan_or_null_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in train_df.columns
]
train_filtered_df.select(nan_or_null_counts).show()

In [None]:
# Display the (column name, type) pairs of the filtered frame.
train_filtered_df.dtypes

In [None]:
# One hand-written example record; the feature definitions of the feature
# group are later loaded from this pandas frame.
sample_columns = [
    'tid', 'cc_num', 'datetime', 'fraud_label',
    'amount', 'amt_ratio1', 'amt_ratio2', 'count_ratio',
]
sample_row = [
    'd621c8d794262ad5e8ad804cb4517395', '4006080197832643',
    '2023-04-02T19:53:45.483Z', 1, 8911.09, 1.0, 1.0, 1.0,
]
sample_df = pd.DataFrame([sample_row], columns=sample_columns)


def cast_object_to_string(data_frame):
    """Convert every ``object``-dtype column of a pandas frame to ``string`` dtype.

    The frame is modified in place and nothing is returned. Columns whose
    dtype is not ``object`` are left untouched.

    Args:
        data_frame: pandas DataFrame to convert.
    """
    object_columns = [
        name for name, dtype in data_frame.dtypes.items() if dtype == 'object'
    ]
    for name in object_columns:
        # Go through plain `str` first so every value is text before the
        # column is given the pandas `string` dtype.
        as_text = data_frame[name].astype("str")
        data_frame[name] = as_text.astype("string")

# Convert the object columns first, then give the remaining numeric columns
# the pandas `string` dtype as well, so every sample column except those
# already handled ends up as `string`.
cast_object_to_string(sample_df)
for numeric_column in ('fraud_label', 'amount', 'amt_ratio1', 'amt_ratio2', 'count_ratio'):
    sample_df[numeric_column] = sample_df[numeric_column].astype('string')

In [None]:
# Build the feature group handle, then load its feature definitions from the
# one-row sample frame.
fg = FeatureGroup(name=FG_NAME, sagemaker_session=sagemaker_session)
fg.load_feature_definitions(data_frame=sample_df)

In [None]:
# In case this example is run multiple times, delete the prior feature group
# since it is recreated afterwards. Deletion stays best-effort: a failure is
# reported instead of raised so the notebook can continue.
# NOTE(review): deletion may not be complete when delete() returns; confirm
# the subsequent create does not race with it.
try:
    fg.delete()
except Exception as exc:  # not a bare `except:`, so KeyboardInterrupt/SystemExit still propagate
    print(f'Skipping deletion of feature group {FG_NAME}: {exc}')

In [None]:
# Create the feature group with the online store disabled; records are
# written under `offline_feature_store_uri` in the configured table format.
create_options = {
    'record_identifier_name': RECORD_ID_NAME,
    'event_time_feature_name': EVENT_TIME_NAME,
    'role_arn': role,
    's3_uri': offline_feature_store_uri,
    'enable_online_store': False,
    'table_format': TABLE_FORMAT,
}
fg.create(**create_options)

In [None]:
from datetime import datetime, timezone, date

def generate_event_timestamp():
    """Return the current time as an ISO-8601 UTC string with millisecond precision.

    The local wall-clock time is made timezone-aware, converted to UTC and
    formatted; the '+00:00' offset is then written as 'Z', giving a value
    such as '2023-04-02T19:53:45.483Z'.
    """
    local_now = datetime.now().astimezone()
    iso_utc = local_now.astimezone(timezone.utc).isoformat(timespec='milliseconds')
    return iso_utc.replace('+00:00', 'Z')

In [None]:
def wait_for_feature_group_created(feature_group, poll_seconds=15):
    """Block until the feature group is no longer in the 'Creating' state.

    A fixed sleep cannot guarantee that creation has finished, so the status
    is polled instead.

    Args:
        feature_group: Object whose ``describe()`` returns a dict containing
            ``FeatureGroupStatus``.
        poll_seconds: Seconds to sleep between two status checks.

    Returns:
        The final status string, which is always 'Created'.

    Raises:
        RuntimeError: If the feature group ends in any state other than
            'Created'.
    """
    status = feature_group.describe().get('FeatureGroupStatus')
    while status == 'Creating':
        print(f'Waiting for feature group creation (status: {status})')
        time.sleep(poll_seconds)
        status = feature_group.describe().get('FeatureGroupStatus')
    if status != 'Created':
        raise RuntimeError(
            f'Feature group {FG_NAME} is in state {status!r}, expected Created'
        )
    return status


wait_for_feature_group_created(fg)

In [None]:
# Athena query handle for the feature group and the table it reads from.
fs_query = fg.athena_query()
fs_table = fs_query.table_name

# Query results are written under the default bucket.
query_results = 'athena-results'
output_location = f's3://{default_bucket}/{query_results}/query_results/'

# Sample query returning at most ten rows of the offline-store table.
query_string = f'SELECT * FROM "{fs_table}" limit 10'

print(query_string)
print(f'Athena query output location: \n{output_location}')

In [None]:
# Run the sample query, wait for it to finish, then load the result into a
# DataFrame and show its first five rows.
fs_query.run(query_string=query_string, output_location=output_location)
fs_query.wait()
query_df = fs_query.as_dataframe()
query_df.head(5)

In [None]:
# Display a sample timestamp produced by the helper.
generate_event_timestamp()

In [None]:
# Print a timestamp immediately before and after the Spark ingestion so the
# two can be compared.
print('Spark Ingestion Starttime', generate_event_timestamp())

# Write the filtered training frame to the offline store only.
ingest_data_to_feature_store(train_filtered_df, FG_NAME, sagemaker_session, target_stores=['OfflineStore'])

print('Spark Ingestion Endtime', generate_event_timestamp())

# Re-run the sample Athena query against the feature group's table.
fs_query.run(query_string=query_string, output_location=output_location)
fs_query.wait()

print('Athena query completion time', generate_event_timestamp())

# Load the query result and show its first five rows.
query_df = fs_query.as_dataframe()
query_df.head(5)