## Ingest Orchestration via SageMaker Processing

In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor

#### Application code that uses the SageMaker Python SDK to ingest a pandas DataFrame into the Feature Group. It runs inside the SageMaker Processing container.

In [None]:
%%writefile fs_batch_ingest.py
import pandas as pd
import os
import glob
import subprocess
import sys
import argparse

# Install the SageMaker Python SDK before importing it below.
# NOTE(review): presumably the processing image does not bundle the SDK — confirm.
subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker"])
import sagemaker as sm

from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.session import Session

# Module-level SageMaker session; ingest_data passes it to FeatureGroup.
sm_session = Session()       

def cast_object_to_string(df):
    """Convert every object-dtype column of *df* to ``str``, in place.

    Args:
        df: pandas DataFrame to modify; columns of other dtypes are left untouched.

    Returns:
        None. The frame is mutated directly.
    """
    # Pick the affected columns first, then rewrite them one by one.
    object_columns = [name for name in df.columns if df.dtypes[name] == 'object']
    for name in object_columns:
        df[name] = df[name].astype('str')


def ingest_data(args):
    """Load the CSV files staged for this instance and ingest them into the Feature Group.

    Every ``*.csv`` file directly under ``/opt/ml/processing/input`` is read,
    the frames are concatenated, object columns are cast to strings, and the
    result is ingested into the Feature Group named by ``args``.

    Args:
        args: Parsed command-line namespace providing ``feature_group_name``,
            ``num_processes`` and ``num_workers``.

    Returns:
        None.
    """
    input_pattern = '/opt/ml/processing/input/*.csv'

    # glob makes no ordering guarantee; sort so rows are concatenated reproducibly.
    file_list = sorted(glob.glob(input_pattern))
    if not file_list:
        # pd.concat raises ValueError on an empty sequence. An instance may
        # legitimately have no files (e.g. fewer input objects than instances
        # when the input is sharded), so skip instead of crashing.
        print('No CSV files matched {}; nothing to ingest.'.format(input_pattern))
        return

    df = pd.concat([pd.read_csv(f) for f in file_list], ignore_index=True)
    cast_object_to_string(df)

    fg = FeatureGroup(name=args.feature_group_name, sagemaker_session=sm_session)
    # wait=True is passed so the call does not return before ingestion finishes.
    fg.ingest(
        data_frame=df,
        max_processes=args.num_processes,
        max_workers=args.num_workers,
        wait=True,
    )
    
def parse_args():
    """Parse the command-line options of the ingest script.

    Recognised options:
        --num_processes       int, default 1
        --num_workers         int, default 1
        --feature_group_name  str, required

    Returns:
        argparse.Namespace holding the recognised options. Unrecognised
        arguments are ignored rather than treated as errors.
    """
    cli = argparse.ArgumentParser()
    # Both parallelism knobs share the same type and default.
    for flag in ("--num_processes", "--num_workers"):
        cli.add_argument(flag, type=int, default=1)
    cli.add_argument("--feature_group_name", type=str, required=True)

    known, _unknown = cli.parse_known_args()
    return known

# Script entry point: parse the CLI options and run the ingestion.
if __name__ == "__main__":
    ingest_data(parse_args())

#### Provide the S3 location of the chunked files and the name of the Feature Group that was provisioned earlier

In [None]:
# S3 prefix under which the chunked input files are stored.
s3_uri_prefix = "s3://fs-ingest-2/data/chunks-1M/"
# Name of the Feature Group that receives the ingested records.
feature_group_name = "ingest-fg-06-17-2021-14-46-44"

## Sample Config for SageMaker Processing

In [None]:
# Sizing of the processing job plus the parallelism handed to the ingest script.
# num_processes / num_workers are kept as strings because they are forwarded
# as command-line arguments.
instance_config = dict(
    instance_type='ml.m5.4xlarge',
    instance_count=5,
    num_processes='16',
    num_workers='4',
)

### Orchestrate ingestion using SageMaker Processing Job

See the logs for SageMaker Processing jobs (update the region in the URL to match yours): https://us-west-2.console.aws.amazon.com/sagemaker/home?region=us-west-2#/processing-jobs

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Processor that runs the ingest script on the configured instance fleet.
sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    role=get_execution_role(),
    instance_type=instance_config['instance_type'],
    instance_count=instance_config['instance_count'],
    env={'AWS_DEFAULT_REGION': boto3.Session().region_name},
)

# Command-line arguments consumed by fs_batch_ingest.py.
script_arguments = [
    '--num_processes', instance_config['num_processes'],
    '--num_workers', instance_config['num_workers'],
    '--feature_group_name', feature_group_name,
]

# Input files under the S3 prefix are sharded by key and placed in the
# directory the ingest script reads from.
chunk_input = ProcessingInput(
    s3_data_type='S3Prefix',
    source=s3_uri_prefix,
    s3_data_distribution_type='ShardedByS3Key',
    destination='/opt/ml/processing/input',
)

sklearn_processor.run(
    code='fs_batch_ingest.py',
    arguments=script_arguments,
    inputs=[chunk_input],
    logs=False,
)