In [None]:
import sagemaker
import pandas as pd
import numpy as np
from ag_model import AutoGluonInferenceModel, AutoGluonTabularPredictor
from sagemaker import utils
import os

# Batch Transform

In [None]:
# Session wrapper used for the S3 uploads and the default-bucket lookup below.
sagemaker_session = sagemaker.Session()
# Execution role that is handed to the model.
role = sagemaker.get_execution_role()

In [None]:
# Sanity check: list the model archive in the working directory before it is uploaded.
!ls -alF model.tar.gz

In [None]:
# Local directory that holds model.tar.gz.
model_dir = '.'

# Unique name derived from a fixed base; used as the S3 key prefix for the model upload.
endpoint_name = utils.unique_name_from_base(
    "sagemaker-autogluon-serving-trained-model"
)
endpoint_name

In [None]:
# Upload the local model archive and keep the value returned by upload_data
# so it can be passed to the model as `model_data`.
model_data = sagemaker_session.upload_data(
    path=os.path.join(model_dir, 'model.tar.gz'),
    key_prefix=f'{endpoint_name}/models',
)
model_data

In [None]:
# Batch transform output goes to a timestamped prefix in the session's default bucket.
bucket = sagemaker_session.default_bucket()
s3_prefix = f'autogluon_sm/{utils.sagemaker_timestamp()}'
output_path = 's3://{}/{}/output/'.format(bucket, s3_prefix)

In [None]:
# Combine the uploaded model archive with the batch-serving script from ./scripts.
model = AutoGluonInferenceModel(
    source_dir="scripts",
    entry_point="tabular_serve-batch.py",
    model_data=model_data,
    role=role,
    predictor_cls=AutoGluonTabularPredictor,
)

In [None]:
# Build the batch transformer: a single ml.m5.2xlarge instance, several records
# per request, JSON responses assembled line by line under `output_path`.
transformer = model.transformer(
    instance_type="ml.m5.2xlarge",
    instance_count=1,
    output_path=output_path,
    strategy="MultiRecord",
    assemble_with='Line',
    accept='application/json',
    max_payload=6,
    max_concurrent_transforms=1,
)

Download the test set, keep the first 100 rows, and write them without the header row or index column

In [None]:
!wget https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv -O data/test.csv

In [None]:
# Take the first 100 rows of the downloaded test set and write them back out
# as bare CSV records (no header row, no index column).
pd.read_csv('data/test.csv').head(100).to_csv(
    'data/test_no_header.csv',
    index=False,
    header=False,
)

Upload the prepared test data to S3 through the SageMaker session

In [None]:
# Upload the header-less test file under the same timestamped S3 prefix
# that is used for the transform output.
data_path = 'data'
test_input = transformer.sagemaker_session.upload_data(
    path=os.path.join(data_path, 'test_no_header.csv'),
    key_prefix=s3_prefix,
)
test_input

In [None]:
# Start the batch transform job: the uploaded CSV is split into one record per line.
transformer.transform(
    test_input,
    content_type='text/csv',
    split_type='Line',
    # NOTE(review): this slice selects which CSV columns reach the model; confirm
    # it keeps exactly the feature columns the model expects and drops the label.
    input_filter='$[:13]',
    # Keep only the 'class' field of each response in the output file.
    output_filter="$['class']",
)

In [None]:
# Wait for the transform job started above before trying to download its output.
transformer.wait()

### Download batch transform outputs

In [None]:
!aws s3 cp {transformer.output_path[:-1]}/test_no_header.csv.out .

In [None]:
# Put predictions and ground-truth labels side by side.
# The output file is parsed with orient='index' and sorted by index; this assumes
# the index corresponds to the input row order — TODO confirm.
p = pd.concat(
    [
        pd.read_json('test_no_header.csv.out', orient='index')
        .sort_index()
        .rename(columns={0: 'preds'}),
        # Same first 100 rows that were sent to the transform job.
        pd.read_csv('data/test.csv')
        .loc[:, ['class']]
        .head(100)
        .rename(columns={'class': 'actual'}),
    ],
    axis=1,
)
p.head()

In [None]:
# Count the rows where the predicted class equals the actual class.
print('{}/{} are correct'.format(p['preds'].eq(p['actual']).sum(), len(p)))