# SageMaker Studio

## Bring your own container

In [None]:
# !pip install -q \
#     sagemaker \
#     sagemaker-studio-image-build \
#     sagemaker-core

In [None]:
# !sh build_and_push_studio.sh

In [2]:
import sys
import IPython
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris
from sagemaker.estimator import Estimator
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput, ScriptProcessor
from sagemaker.workflow.parameters import Parameter
from sagemaker.workflow.execution_variables import ExecutionVariable

In [None]:
# !{sys.executable} -m pip install -U sagemaker smdebug sagemaker-studio-image-build
# IPython.Application.instance().kernel.do_shutdown(True)

In [6]:
# Account and region of the current AWS session; both are part of the ECR registry host name.
sts_client = boto3.client("sts")
ACCOUNT_ID = sts_client.get_caller_identity().get("Account")
REGION = boto3.session.Session().region_name

# Repository coordinates of the custom (bring-your-own-container) image.
URI_SUFFIX = "amazonaws.com"
ECR_REPOSITORY = "custom-image-teste1"
TAG = "latest"

# Full image URI: <account>.dkr.ecr.<region>.<suffix>/<repository>:<tag>
uri_parts = (ACCOUNT_ID, REGION, URI_SUFFIX, ECR_REPOSITORY, TAG)
byoc_image_uri = "{}.dkr.ecr.{}.{}/{}:{}".format(*uri_parts)
print(f"IMAGE: {byoc_image_uri}")

IMAGE: 891377318910.dkr.ecr.us-east-1.amazonaws.com/custom-image-teste1:latest


In [None]:
# IAM role the jobs run under (taken from the current SageMaker execution context).
role = get_execution_role()

# Compute resources requested for each job.
instance_count, instance_type = 1, "ml.c5.xlarge"

# Job naming and a placeholder S3 location for training data.
base_job_name = "custom-container-test-job"
training_input_path = "s3://bucket/prefix/"

### Use your own Processing Code

In [None]:
# Processor that executes a user-supplied script with `python3` inside the custom image.
script_processor_settings = {
    "command": ["python3"],
    "image_uri": byoc_image_uri,
    "role": role,
    "instance_count": instance_count,
    "instance_type": instance_type,
}
script_processor = ScriptProcessor(**script_processor_settings)

In [None]:
%%writefile ./preprocessing.py

import os
import logging
import pandas as pd
import numpy as np
import argparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Root logging configuration: timestamp, level, logger name and call site precede each message.
LOG_FORMAT = "[%(asctime)s] %(levelname)s %(name)s %(filename)s %(funcName)s %(lineno)d: %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)

def _parse_args(argv=None):
    """Parse the command-line options of the preprocessing script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            the process arguments (``sys.argv[1:]``) are parsed.

    Returns:
        The ``(namespace, unknown_args)`` tuple returned by
        ``ArgumentParser.parse_known_args``: options that are not declared
        here are collected in ``unknown_args`` instead of causing an error.
    """
    parser = argparse.ArgumentParser()
    # Location of the input file and of the output directory inside the container.
    parser.add_argument('--filepath', type=str, default='/opt/ml/processing/input/',
                        help='Directory that contains the input CSV file.')
    parser.add_argument('--filename', type=str, default='bank-additional-full.csv',
                        help='Name of the input CSV file inside --filepath.')
    parser.add_argument('--outputpath', type=str, default='/opt/ml/processing/output/',
                        help='Directory under which the processed dataset is written.')
    # Upper-cased so that e.g. "--logger_level debug" is a valid name for Logger.setLevel.
    parser.add_argument('--logger_level', type=str.upper, default='INFO',
                        help='Logging level name, e.g. DEBUG, INFO, WARNING.')
    return parser.parse_known_args(argv)


def process_data(df_data, target_col="y"):
    """Build the model dataset from the raw input frame.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df_data: DataFrame with at least the columns ``pdays``, ``job``,
            ``age``, ``previous``, ``campaign``, ``y`` and the columns dropped
            below (``duration``, ``emp.var.rate``, ``cons.price.idx``,
            ``cons.conf.idx``, ``euribor3m``, ``nr.employed``). ``y`` is
            expected to hold the values "yes"/"no", since the indicator
            columns ``y_yes`` and ``y_no`` are read after one-hot encoding.
        target_col: Name given to the label column in the result. Defaults to
            "y", the value the script previously supplied through a global.

    Returns:
        A new DataFrame whose first column is the 0/1 label (``target_col``)
        followed by the engineered, numerically encoded features.
    """
    # Work on a copy so the caller's frame does not gain the helper columns.
    df_data = df_data.copy()

    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    # (Series.isin replaces np.in1d, which NumPy has deprecated.)
    df_data["not_working"] = np.where(
        df_data["job"].isin(["student", "retired", "unemployed"]), 1, 0
    )

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    # Left-closed bins so that each label matches its interval ([18, 30) -> '18-29', ...),
    # with an open upper end so that '70-plus' really covers every age from 70 upwards.
    # Ages below 18 fall outside all bins and therefore get all-zero age indicators.
    bins = [18, 30, 40, 50, 60, 70, np.inf]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data['age'], bins, labels=labels, right=False)
    age_indicators = pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)
    df_model_data = pd.concat([df_model_data, age_indicators], axis=1)
    df_model_data = df_model_data.drop(['age', 'age_range'], axis=1)

    # Rescale the count-like features with MinMaxScaler.
    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat(
        [
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )

    return df_model_data


if __name__=="__main__":
    # Process arguments
    args, _ = _parse_args()
    logger = logging.getLogger(__name__)
    logger.setLevel(args.logger_level)
    # Name of the label column in the processed dataset.
    target_col = "y"
    logger.info("TESTANDO")

    # The input file is read as a ';'-separated CSV.
    df_input = pd.read_csv(os.path.join(args.filepath, args.filename), sep=";")

    # process data
    df_model_data = process_data(df_input)

    print(f"Data:{df_model_data.shape}")

    # Save the dataset locally (no index, no header row). The target directory is
    # created first so the script also works when <outputpath>/base does not exist
    # yet, e.g. when run outside a processing job or with a custom --outputpath.
    output_file = os.path.join(args.outputpath, 'base/dataset.csv')
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    df_model_data.to_csv(output_file, index=False, header=False)

    print("## Processing complete. Exiting.")

In [None]:
# Bucket, key prefix and file name that locate the raw dataset.
bucket_name = "sagemaker-us-east-1-891377318910"
s3_prefix = "workshop_v2"
dataset_raw = "bank-additional-full.csv"

# URI of the raw input object.
s3_input_path = f"s3://{bucket_name}/{s3_prefix}/data/raw/{dataset_raw}"
print(f"Input file:  {s3_input_path}")

# URI for the transformed dataset.
s3_output_path = f"s3://{bucket_name}/{s3_prefix}/data/transformed/baseline/dataset.csv"
print(f"Output file: {s3_output_path}")

In [None]:
# Shell out to the AWS CLI and list what is stored under the input URI.
!aws s3 ls {s3_input_path} --recursive

In [None]:
# Run preprocessing.py in the custom image: the raw file is copied into the container's
# input directory and everything the script writes to .../output/base is uploaded to S3.
script_processor.run(
    code="preprocessing.py",
    inputs=[
        ProcessingInput(
            source=s3_input_path,
            destination="/opt/ml/processing/input",
            input_name="input_data",
            s3_input_mode="File",
            s3_data_distribution_type="FullyReplicated",
        )
    ],
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/output/base",
            # `destination` is an S3 prefix under which the files of `source` are stored.
            # Passing the file URI itself would upload to ".../dataset.csv/dataset.csv",
            # so the file name is stripped and dataset.csv lands exactly at s3_output_path.
            destination=s3_output_path.rsplit("/", 1)[0],
            output_name="processed_dataset",
        ),
    ],
)

### Build your own Processing

In [None]:
# Generic processor: runs the custom image as-is, without an externally supplied script.
processor_settings = {
    "image_uri": byoc_image_uri,
    "role": role,
    "instance_count": instance_count,
    "instance_type": instance_type,
}
processor = Processor(**processor_settings)

In [None]:
from time import gmtime, strftime

# Processing job names must be unique within an account and region, so a UTC timestamp
# is appended; with a fixed name every re-run of this cell would be rejected.
# NOTE: the S3 source and destination below are placeholders and must be replaced
# with real URIs before running.
processor.run(
    job_name="job-run-teste-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()),
    arguments=["arg"],
    wait=True,
    inputs=[
        ProcessingInput(
            source="s3://path/to/my/input-data.csv",
            destination="/opt/ml/processing/input_data",
            input_name="input_data",
            s3_data_type="S3Prefix",                     # Valid options: "ManifestFile" or "S3Prefix"
            s3_input_mode="File",                        # Valid options: "Pipe", "File" or "FastFile"
            s3_data_distribution_type="FullyReplicated", # Valid options: "FullyReplicated" or "ShardedByS3Key"
            # s3_compression_type="snappy",
        )
    ],
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/processed_data",
            destination="<s3_uri>",
            output_name="processed_dataset",
            # s3_upload_mode = "",
            # feature_store_output = "",
        )
    ],
)