## Ingest Orchestration via SageMaker Processing using Spark Connector

This notebook uses the [Amazon SageMaker Feature Store Spark connector](https://aws.amazon.com/about-aws/whats-new/2022/01/amazon-sagemaker-feature-store-connector-apache-spark-batch-data-ingestion/) to batch-ingest data into a feature group.



In [1]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.spark.processing import PySparkProcessor

In [2]:
# Look up the AWS account ID of the current credentials.
# Asking STS through boto3 (already imported above) avoids depending on the
# AWS CLI being installed and on parsing captured shell output.
account_id = boto3.client('sts').get_caller_identity()['Account']

In [3]:
# --- Notebook-wide settings -------------------------------------------------

# Region of the default boto3 session; used to build the ECR image URI and
# the feature group ARN below.
region = boto3.Session().region_name

# Label selecting which entry of `instance_configs` the processing job uses.
run_config = '1M'

# Processing cluster shape, keyed by run label.
instance_configs = {
    '1M': dict(instance_type='ml.m5.4xlarge', instance_count=8),
}

# Input data location and the feature group that receives it.
s3_uri_prefix = 's3://fs-ingest/data/1M/'
feature_group_name = 'ingest-fg-02-08-2022-19-17-20'
feature_group_arn = f"arn:aws:sagemaker:{region}:{account_id}:feature-group/{feature_group_name}"

# Container image for the processing job -- replace with your own ECR image.
image_uri = f'{account_id}.dkr.ecr.{region}.amazonaws.com/sagemaker-spark-main-fs-dbg:3.0-cpu-py37-v1.0'

#### Application code that uses PySpark to ingest data into Feature Store. It runs inside the SageMaker Processing container.

In [4]:
%%writefile fs_spark_connector_ingest.py
import os
import sys
import argparse
import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from feature_store_pyspark.FeatureStoreManager import FeatureStoreManager
import feature_store_pyspark

def run_pyspark_job(args):
    """Load CSV data with Spark and ingest it into a feature group.

    Args:
        args: Parsed command-line namespace. ``args.s3_uri_prefix`` is the
            location handed to the Spark CSV reader and
            ``args.feature_group_arn`` is the target passed to the
            Feature Store connector.
    """
    # Register the connector's jars with the Spark session.
    connector_jars = ",".join(feature_store_pyspark.classpath_jars())
    session_builder = SparkSession.builder.config("spark.jars", connector_jars)
    session = session_builder.getOrCreate()

    # Read the CSV input with the header option enabled.
    csv_reader = session.read.options(Header=True)
    source_df = csv_reader.csv(args.s3_uri_prefix)

    manager = FeatureStoreManager()
    manager.ingest_data(
        input_data_frame=source_df,
        feature_group_arn=args.feature_group_arn,
    )

    print('done ingesting')
    

def parse_args():
    """Parse the job's command-line options.

    Both ``--s3_uri_prefix`` and ``--feature_group_arn`` are required string
    options. Arguments the parser does not recognise are ignored.

    Returns:
        The ``argparse.Namespace`` holding the recognised options.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--s3_uri_prefix", "--feature_group_arn"):
        cli.add_argument(flag, type=str, required=True)

    # parse_known_args returns (namespace, leftovers); the leftovers are dropped.
    known, _leftovers = cli.parse_known_args()
    return known

# Script entry point: parse the CLI options and run the ingestion job.
if __name__ == '__main__':
    run_pyspark_job(parse_args())

Overwriting fs_spark_connector_ingest.py


### Orchestrate ingestion using SageMaker Processing Job

Logs for SageMaker Processing jobs are available in the console (replace `us-east-1` with your region in the URL): https://us-east-1.console.aws.amazon.com/sagemaker/home?region=us-east-1#/processing-jobs

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Cluster shape for the selected run label.
cluster_shape = instance_configs[run_config]

# Processor that runs the job in the custom Spark image; the session's region
# is passed to the container through AWS_DEFAULT_REGION.
pyspark_processor = PySparkProcessor(
    image_uri=image_uri,
    role=get_execution_role(),
    instance_type=cluster_shape['instance_type'],
    instance_count=cluster_shape['instance_count'],
    env={'AWS_DEFAULT_REGION': boto3.Session().region_name},
)

# Command-line options consumed by fs_spark_connector_ingest.py.
job_arguments = [
    '--s3_uri_prefix', s3_uri_prefix,
    '--feature_group_arn', feature_group_arn,
]

# Submit the ingestion script with log display turned off (logs=False).
pyspark_processor.run(
    submit_app='fs_spark_connector_ingest.py',
    arguments=job_arguments,
    logs=False,
)