# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [None]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the active SparkContext if one exists, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext; later cells call create_dynamic_frame and getSink on it.
glueContext = GlueContext(sc)
# Spark session taken from the GlueContext.
spark = glueContext.spark_session
# Job handle built on the GlueContext; this cell does not call job.init().
job = Job(glueContext)

#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [None]:
# Example values: replace the database/table names with real Data Catalog entries before running.
dyf = glueContext.create_dynamic_frame.from_catalog(
    database='database_name',
    table_name='table_name',
)
dyf.printSchema()

#### Example: Convert the DynamicFrame to a Spark DataFrame and display a sample of the data


In [None]:
# Convert the DynamicFrame `dyf` into a Spark DataFrame.
df = dyf.toDF()
# Print a sample of its rows to the cell output.
df.show()

#### Example: Write the data in the DynamicFrame to a location in Amazon S3 and a table for it in the AWS Glue Data Catalog


In [None]:
# Build an S3 sink that also creates/updates the matching Data Catalog table
# (enableUpdateCatalog=True with updateBehavior="UPDATE_IN_DATABASE").
# The path, database and table names are example values; replace them before running.
s3output = glueContext.getSink(
  path="s3://bucket_name/folder_name",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=[],
  compression="snappy",
  enableUpdateCatalog=True,
  transformation_ctx="s3output",
)
s3output.setCatalogInfo(
  catalogDatabase="demo", catalogTableName="populations"
)
s3output.setFormat("glueparquet")
# Write the DynamicFrame created earlier as `dyf`. The previous `DyF` spelling is not
# defined anywhere in the notebook (Python names are case-sensitive) and raised NameError.
s3output.writeFrame(dyf)

In [2]:
%pip install --upgrade aws-glue-sessions%session_id_prefix fg-offline-scale
%glue_version 3.0
%idle_timeout 480
%number_of_workers 10
%worker_type G.2X 
%additional_python_modules 'sagemaker,sagemaker-feature-store-pyspark-3.1'
%extra_jars 's3://roymark-aws-ml/spark-connector-jars/sagemaker-feature-store-spark-sdk.jar'
%%configure
{
    "--enable-spark-ui": "true",
    "--spark-event-logs-path": "s3://roymark-aws-ml/gis-spark-logs/",
    "--enable-auto-scaling": "true",
    "--enable-metrics": "true",
    "--enable-continuous-cloudwatch-log": "true",
    "--enable-continuous-log-filter": "true",
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.0 
Defaulting to user installation because normal site-packages is not writeable
Collecting aws-glue-sessions
  Downloading aws_glue_sessions-0.37.2-py2.py3-none-any.whl (171 kB)
[K     |████████████████████████████████| 171 kB 30.9 MB/s eta 0:00:01
Installing collected packages: aws-glue-sessions
Successfully installed aws-glue-sessions-0.37.2
Note: you may need to restart the kernel to use updated packages.


In [11]:
%session_id_prefix fg-offline-scale
%glue_version 3.0
%idle_timeout 480
%number_of_workers 10
%worker_type G.2X 
%additional_python_modules 'sagemaker,sagemaker-feature-store-pyspark-3.1'
%extra_jars 's3://chime-fs-demo/spark-connector-jars/sagemaker-feature-store-spark-sdk.jar'
%%configure
{
    "--enable-spark-ui": "true",
    "--spark-event-logs-path": "s3://chime-fs-demo/gis-spark-logs/",
    "--enable-auto-scaling": "true",
    "--enable-metrics": "true",
    "--enable-continuous-cloudwatch-log": "true",
    "--enable-continuous-log-filter": "true",
}

Setting session ID prefix to fg-offline-scale
Setting Glue version to: 3.0
Current idle_timeout is 2800 minutes.
idle_timeout has been set to 480 minutes.
Previous number of workers: 5
Setting new number of workers to: 10
Previous worker type: G.1X
Setting new worker type to: G.2X
Additional python modules to be included:
sagemaker
sagemaker-feature-store-pyspark-3.1
Extra jars to be included:
s3://chime-fs-demo/spark-connector-jars/sagemaker-feature-store-spark-sdk.jar
The following configurations have been updated: {'--enable-spark-ui': 'true', '--spark-event-logs-path': 's3://roymark-aws-ml/gis-spark-logs/', '--enable-auto-scaling': 'true', '--enable-metrics': 'true', '--enable-continuous-cloudwatch-log': 'true', '--enable-continuous-log-filter': 'true'}
s3://chime-fs-demo/spark-connector-jars/sagemaker-feature-store-spark-sdk.jar


In [1]:
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import boto3
import json
import time
import pandas as pd
import sagemaker
from sagemaker.feature_store.feature_group import FeatureDefinition, FeatureGroup, FeatureTypeEnum, DataCatalogConfig
from pyspark.sql import SparkSession
from feature_store_pyspark.FeatureStoreManager import FeatureStoreManager
import feature_store_pyspark
from sagemaker.feature_store.inputs import TableFormatEnum

Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::461312420708:role/Admin
Trying to create a Glue session for the kernel.
Worker Type: G.2X
Number of Workers: 10
Session ID: 195c277a-141c-421d-afa4-d1c26098a5d0
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.37.0
--enable-glue-datacatalog true
--additional-python-modules sagemaker,sagemaker-feature-store-pyspark-3.1
--extra-jars s3://chime-fs-demo/spark-connector-jars/sagemaker-feature-store-spark-sdk.jar
--enable-spark-ui true
--spark-event-logs-path s3://roymark-aws-ml/gis-spark-logs/
--enable-auto-scaling true
--enable-metrics true
--enable-continuous-cloudwatch-log true
--enable-continuous-log-filter true
Waiting for session 195c277a-141c-421d-afa4-d1c26098a5d0 to get into ready status...
Session 195c277a-141c-421d-afa4-d1c26098a5d0 has been created.



In [2]:
# Table format requested for the feature group's offline store (passed to fg.create).
TABLE_FORMAT = TableFormatEnum.ICEBERG

# Plain string literal: the previous f-string had no placeholders.
FG_NAME = 'cc_train_fg'

# The two names below were previously swapped. In the source data `datetime` holds
# ISO-8601 timestamps (e.g. 2020-01-01T20:52:20.000Z), so it is the event-time feature;
# `tid` is the per-transaction hash id, so it is the record identifier.
# NOTE(review): a feature group already created with the swapped names presumably has to
# be deleted and re-created for this to take effect — confirm before re-running fg.create.
EVENT_TIME_NAME = 'datetime'
RECORD_ID_NAME = 'tid'

# SRC_PREFIX = 'smfs-tmp-source-data'




In [3]:
def get_table_name(feature_group_name):
    """Return the ``TableName`` recorded in a feature group's offline-store catalog config.

    Looks the group up through the notebook-level ``sagemaker_session`` and reads
    ``OfflineStoreConfig -> DataCatalogConfig -> TableName`` from the description.

    Args:
        feature_group_name: name passed to ``describe_feature_group``.

    Raises:
        KeyError: if the description lacks any of the expected keys.
    """
    description = sagemaker_session.describe_feature_group(feature_group_name)
    catalog_config = description['OfflineStoreConfig']['DataCatalogConfig']
    return catalog_config['TableName']

def get_offline_store_s3_uri(feature_group_name):
    """Return the ``ResolvedOutputS3Uri`` of a feature group's offline store.

    Looks the group up through the notebook-level ``sagemaker_session`` and reads
    ``OfflineStoreConfig -> S3StorageConfig -> ResolvedOutputS3Uri`` from the description.

    Args:
        feature_group_name: name passed to ``describe_feature_group``.

    Raises:
        KeyError: if the description lacks any of the expected keys.
    """
    description = sagemaker_session.describe_feature_group(feature_group_name)
    storage_config = description['OfflineStoreConfig']['S3StorageConfig']
    return storage_config['ResolvedOutputS3Uri']




In [5]:
# SageMaker session used by the describe_feature_group helpers and by the FeatureGroup handle.
sagemaker_session = sagemaker.Session()
# IAM role passed to fg.create(role_arn=...).
# NOTE(review): hard-coded, account-specific ARN — consider reading it from configuration.
role = 'arn:aws:iam::461312420708:role/Admin'
# Bucket used to build the offline feature store S3 URI.
default_bucket = 'chime-fs-demo'
# feature_store_manager= FeatureStoreManager()




In [6]:
def ingest_data_to_feature_store(dataframe, feature_group_name, sagemaker_session, target_stores,
                                 feature_store_manager=None):
    """Ingest a data frame into a SageMaker feature group and print a completion message.

    Args:
        dataframe: passed to ``ingest_data`` as ``input_data_frame``.
        feature_group_name: feature group name; its ARN is read from
            ``sagemaker_session.describe_feature_group(name)['FeatureGroupArn']``.
        sagemaker_session: object exposing ``describe_feature_group``.
        target_stores: passed through to ``ingest_data`` unchanged.
        feature_store_manager: optional object exposing ``ingest_data``. When omitted,
            a new ``FeatureStoreManager()`` is created. Previously the function read a
            notebook global of this name that is only assigned in the final cell, so
            calling it in notebook order raised NameError.

    Returns:
        None.
    """
    if feature_store_manager is None:
        # Imported locally so the fallback does not depend on cell execution order.
        from feature_store_pyspark.FeatureStoreManager import FeatureStoreManager
        feature_store_manager = FeatureStoreManager()

    feature_group_arn = sagemaker_session.describe_feature_group(feature_group_name)['FeatureGroupArn']
    feature_store_manager.ingest_data(
        input_data_frame=dataframe,
        feature_group_arn=feature_group_arn,
        target_stores=target_stores,
    )
    print(f'Process - ingest_to_feature_store - {feature_group_name} : Completed')




In [7]:
# S3 location for the offline store, built from default_bucket; passed to fg.create(s3_uri=...).
offline_feature_store_uri = f's3://{default_bucket}/sagemaker-feature-store'

print(f'Location of offline store: {offline_feature_store_uri}')

Location of offline store: s3://chime-fs-demo/sagemaker-feature-store


In [8]:
# Get (or create) the SparkSession used for the CSV read below.
# This rebinds `spark`, which the template setup cell also assigns from glueContext.spark_session.
spark = SparkSession.builder.getOrCreate()




In [9]:
# Load the aggregated transactions CSV, using its first line as the header.
# No schema is supplied here; the dtypes cell further down reports every selected column as string.
# NOTE(review): the path pins one specific part file in a specific bucket — confirm it still exists before re-running.
full_df = spark.read.csv("s3://chime-fs-demo/aggregated/part-00000-1bc78c44-62fe-4425-8ac9-9b9a367da961-c000.csv",header ="True")




In [10]:
# Preview the first five rows of the loaded data.
full_df.show(n=5)

+--------------------+--------------------+----------------+-------+-----------+------------------+----------------+-----------------+------------------+--------------------+--------------------+------------------+
|                 tid|            datetime|          cc_num| amount|fraud_label|num_trans_last_10m|avg_amt_last_10m|num_trans_last_1w|   avg_amt_last_1w|          amt_ratio1|          amt_ratio2|       count_ratio|
+--------------------+--------------------+----------------+-------+-----------+------------------+----------------+-----------------+------------------+--------------------+--------------------+------------------+
|d621c8d794262ad5e...|2020-01-01T20:52:...|4006080197832643|8911.09|          0|                 1|         8911.09|                1|           8911.09|                 1.0|                 1.0|               1.0|
|daa28b6f0e729f485...|2020-01-02T00:51:...|4006080197832643|  65.15|          0|                 1|           65.15|                2|      

In [14]:
# Columns kept for the training feature group: the id, the timestamp, the label and the ratio features.
train_columns = [
    'tid',
    'datetime',
    'fraud_label',
    'amount',
    'amt_ratio1',
    'amt_ratio2',
    'count_ratio',
]
train_df = full_df.select(*train_columns)




In [40]:
# Count, per column, the rows of the filtered frame whose value is NaN or null.
# train_filtered_df is assigned in a later cell of this notebook, so on a fresh
# kernel that cell has to run before this one.
nan_or_null_counts = [
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in train_df.columns
]
train_filtered_df.select(nan_or_null_counts).show()

+---+--------+-----------+------+----------+----------+-----------+
|tid|datetime|fraud_label|amount|amt_ratio1|amt_ratio2|count_ratio|
+---+--------+-----------+------+----------+----------+-----------+
|  0|       0|          0|     0|         0|         0|          0|
+---+--------+-----------+------+----------+----------+-----------+


In [28]:
# Display train_df with null-containing rows dropped, printing cell values untruncated.
# The result is only displayed; train_df itself is not reassigned.
train_df.na.drop().show(truncate=False)

+--------------------------------+------------------------+-----------+-------+---------------------+---------------------+--------------------+
|tid                             |datetime                |fraud_label|amount |amt_ratio1           |amt_ratio2           |count_ratio         |
+--------------------------------+------------------------+-----------+-------+---------------------+---------------------+--------------------+
|d621c8d794262ad5e8ad804cb4517395|2020-01-01T20:52:20.000Z|0          |8911.09|1.0                  |1.0                  |1.0                 |
|daa28b6f0e729f48563ee7ea945f910c|2020-01-02T00:51:07.000Z|0          |65.15  |0.014516100282523642 |0.014516100282523642 |0.5                 |
|c4f86514d36cf92be555c122f2faae57|2020-01-02T01:08:17.000Z|0          |4865.93|1.0545882618115514   |1.0545882618115514   |0.3333333333333333  |
|0758642b10c11900f42b880c9cb1527b|2020-01-02T09:50:42.000Z|0          |11.92  |0.0034415829549252243|0.0034415829549252243|0.25   

In [29]:
# Count, per column, the rows whose value looks missing: contains the text 'None' or
# 'NULL', is an empty string, is null, or is NaN.
blank_or_null_counts = [
    count(
        when(
            col(c).contains('None')
            | col(c).contains('NULL')
            | (col(c) == '')
            | col(c).isNull()
            | isnan(c),
            c,
        )
    ).alias(c)
    for c in train_df.columns
]
train_df.select(blank_or_null_counts).show()

+---+--------+-----------+------+----------+----------+-----------+
|tid|datetime|fraud_label|amount|amt_ratio1|amt_ratio2|count_ratio|
+---+--------+-----------+------+----------+----------+-----------+
|  1|       1|          1|     1|         1|         1|          0|
+---+--------+-----------+------+----------+----------+-----------+


In [37]:
# Keep only the rows that have a transaction id; rows with a null `tid` are dropped.
train_filtered_df = train_df.where(col('tid').isNotNull())




In [38]:
# Inspect the column types of the frame that will be ingested (displayed as the cell result).
train_filtered_df.dtypes

[('tid', 'string'), ('datetime', 'string'), ('fraud_label', 'string'), ('amount', 'string'), ('amt_ratio1', 'string'), ('amt_ratio2', 'string'), ('count_ratio', 'string')]


In [16]:
# Handle for the feature group named FG_NAME, bound to sagemaker_session.
# The group itself is created by the separate fg.create(...) cell below.
fg = FeatureGroup(name=FG_NAME, sagemaker_session=sagemaker_session)




In [43]:
# One-row pandas frame with the same columns as train_df; it is only used to derive
# the feature definitions for the feature group.
sample_columns = ['tid', 'datetime', 'fraud_label', 'amount', 'amt_ratio1','amt_ratio2','count_ratio']
sample_row = ['d621c8d794262ad5e8ad804cb4517395', '2023-04-02T19:53:45.483Z', 1,8911.09, 1.0,1.0,1.0]
sample_df = pd.DataFrame([sample_row], columns=sample_columns)


def cast_object_to_string(data_frame):
    """Convert every ``object``-dtype column of a pandas frame to the ``string`` dtype.

    The frame is modified in place and nothing is returned. Columns with any other
    dtype are left untouched.

    Args:
        data_frame: pandas DataFrame to convert.
    """
    object_columns = [
        name for name in data_frame.columns
        if data_frame.dtypes[name] == 'object'
    ]
    for name in object_columns:
        as_text = data_frame[name].astype("str")
        data_frame[name] = as_text.astype("string")

# Object columns first, then the numeric columns: every column of sample_df ends up
# with the pandas `string` dtype.
cast_object_to_string(sample_df)
for column_name in ('fraud_label', 'amount', 'amt_ratio1', 'amt_ratio2', 'count_ratio'):
    sample_df[column_name] = sample_df[column_name].astype('string')




In [44]:
# Derive the feature definitions for fg from sample_df's columns; the returned list is the cell result.
fg.load_feature_definitions(data_frame=sample_df)

[FeatureDefinition(feature_name='tid', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='datetime', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='fraud_label', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='amount', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='amt_ratio1', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='amt_ratio2', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='count_ratio', feature_type=<FeatureTypeEnum.STRING: 'String'>)]


In [45]:
# Create the feature group with an online store enabled and an offline store under
# offline_feature_store_uri, using the table format chosen in TABLE_FORMAT.
fg.create(
    s3_uri=offline_feature_store_uri,
    record_identifier_name=RECORD_ID_NAME,
    event_time_feature_name=EVENT_TIME_NAME,
    role_arn=role,
    enable_online_store=True,
    table_format=TABLE_FORMAT,
)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:461312420708:feature-group/cc_train_fg', 'ResponseMetadata': {'RequestId': '45a81490-7386-4250-a6ab-887e9e293d41', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '45a81490-7386-4250-a6ab-887e9e293d41', 'content-type': 'application/x-amz-json-1.1', 'content-length': '88', 'date': 'Wed, 05 Apr 2023 15:19:39 GMT'}, 'RetryAttempts': 0}}


In [None]:
# ARN of the cc_train_fg feature group (the same value fg.create returned above).
feature_group_arn = 'arn:aws:sagemaker:us-east-1:461312420708:feature-group/cc_train_fg'
feature_store_manager = FeatureStoreManager()
# Ingest the filtered training rows, targeting the offline store only.
feature_store_manager.ingest_data(
    input_data_frame=train_filtered_df,
    feature_group_arn=feature_group_arn,
    target_stores=["OfflineStore"],
)