In [25]:
# !pip install pyspark

In [186]:
import os
import re
import sys
import math
import json
import time
import warnings
import boto3
import botocore
import sagemaker
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, timezone, date
from IPython.display import Image
from IPython.display import display
from IPython.display import FileLink, FileLinks
from platformdirs import site_config_dir, user_config_dir
from time import gmtime, strftime

from sagemaker import Session
from sagemaker import get_execution_role
from sagemaker.experiments.run import Run, load_run
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.spark.processing import PySparkProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
from sagemaker.remote_function import remote, RemoteExecutor
from sagemaker.tuner import (
    CategoricalParameter, ContinuousParameter,
    HyperparameterTuner, IntegerParameter,
)
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_store import FeatureStore
from sagemaker.feature_store.inputs import FeatureParameter, TableFormatEnum
from sagemaker.feature_store.feature_definition import StringFeatureDefinition
# from sagemaker.feature_store.feature_processor import CSVDataSource, feature_processor, to_pipeline

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [187]:
print(pd.__version__)
print(sklearn.__version__)
print(sagemaker.__version__)
print(boto3.__version__)

2.2.2
1.4.2
2.215.0
1.34.84


In [3]:
print(os.environ)
# os.environ['SAGEMAKER_JOB_CONDA_ENV']

environ({'REGION_NAME': 'us-east-1', 'HOSTNAME': 'sagemaker-data-scienc-ml-t3-medium-ccb588b5efaf671be41927273f0c', 'HOME': '/root', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/_sagemaker-instance-credentials/7ea481b200984547cac93df38123bfc460ae454c09b1b2a1a09b94b251ef8978', 'PYTHONNOUSERSITE': '0', 'AWS_DEFAULT_REGION': 'us-east-1', 'PATH': '/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/tmp/miniconda3/condabin:/tmp/anaconda3/condabin:/tmp/miniconda2/condabin:/tmp/anaconda2/condabin:/tmp/mambaforge/condabin', 'AWS_ACCOUNT_ID': '891377318910', 'DEBIAN_FRONTEND': 'noninteractive', 'SHELL': '/bin/bash', 'AWS_REGION': 'us-east-1', 'AWS_INTERNAL_IMAGE_OWNER': 'Studio', 'CONDA_DIR': '/opt/.sagemakerinternal/conda', 'PWD': '/root', 'AWS_SAGEMAKER_PYTHONNOUSERSITE': '0', 'SAGEMAKER_LOG_FILE': '/var/log/studio/kernel_gateway.log', 'SAGEMAKER_JOB_CONDA_ENV': 'base', 'SAGEMAKER_INTERNAL_IMAGE_URI': '081325390199.dkr.ecr.us-east-1.amazonaws.com/sagema

In [4]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 30)

warnings.filterwarnings("ignore")

In [68]:
boto_session = boto3.Session()
client_sagemaker = boto_session.client("sagemaker")
client_s3 = boto_session.client("s3")
sagemaker_session = sagemaker.Session()
sagemaker_role = sagemaker.get_execution_role()
bucket_name = sagemaker_session.default_bucket()

In [69]:
%store -r

%store

Stored variables and their in-db values:
bucket_name                       -> 'sagemaker-us-east-1-891377318910'
dataset_raw                       -> 'bank-additional-full.csv'
experiment_name                   -> 'itau-experiment-23-16-11-28'
initialized                       -> True
input_s3_url                      -> 's3://sagemaker-us-east-1-891377318910/workshop/da
loca_transformed_path             -> './data/transformed'
local_prefix                      -> './data/raw'
region_name                       -> 'us-east-1'
s3_data_raw_prefix                -> 'data/raw'
s3_prefix                         -> 'workshop'
sagemaker_role                    -> 'arn:aws:iam::891377318910:role/service-role/SageM


In [12]:
loca_transformed_path = "./data/transformed"
%store loca_transformed_path

Stored 'loca_transformed_path' (str)


# Create an experiment

In [14]:
experiment_name = f"itau-experiment-{strftime('%d-%H-%M-%S', gmtime())}"
print(experiment_name)

itau-experiment-23-16-11-28


In [15]:
%store experiment_name

Stored 'experiment_name' (str)


# Feature engineering

- selecao de atributos
- criacao e extracao de atributos
- transformacao de atributos
- codificacao de atributos categoricos
- tratamento de valores ausentes
- deteccao e tratamento de outliers
- normalizacao ou padronizacao
- tratamento de dados desbalanceados

## Option 1:

### Load dataset

In [19]:
local_dataset_path = f"{local_prefix}/{dataset_raw}"

df_raw = pd.read_csv(local_dataset_path, sep=";")

In [20]:
df_raw

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,334,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,383,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,189,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,442,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


### Option 1.1

In [17]:
df = df_raw.copy()

# remove unnecessary data
df = df.drop(
    ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
    axis=1,
)

# Indicator variable to capture when pdays takes a value of 999
df["no_previous_contact"] = np.where(df["pdays"] == 999, 1, 0)

# Indicator for individuals not actively employed
df["not_working"] = np.where(np.in1d(df["job"], ["student", "retired", "unemployed"]), 1, 0)

bins = [18, 30, 40, 50, 60, 70, 90]
labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

df['age_range'] = pd.cut(df.age, bins, labels=labels, include_lowest=True)
df = pd.concat([df, pd.get_dummies(df['age_range'], prefix='age', dtype=int)], axis=1)
df.drop('age', axis=1, inplace=True)
df.drop('age_range', axis=1, inplace=True)

scaled_features = ['pdays', 'previous', 'campaign']
df[scaled_features] = MinMaxScaler().fit_transform(df[scaled_features])

# Convert categorical variables to sets of indicators
df = pd.get_dummies(df, dtype=int)

# Replace "y_no" and "y_yes" with a single label column, and bring it to the front
df = pd.concat(
    [
        df["y_yes"].rename(target_col),
        df.drop(["y_no", "y_yes"], axis=1),
    ],
    axis=1,
)

### Split data

In [None]:
# Shuffle and splitting dataset
train_data, validation_data, test_data = np.split(
    df.sample(frac=1, random_state=1729),
    [int(0.7 * len(df)), int(0.9 * len(df))],
)

print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")

### Option 1.2

In [None]:
def preprocess(df_data):
    target_col = "y"
    
    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_data["not_working"] = np.where(np.in1d(df_data["job"], ["student", "retired", "unemployed"]), 1, 0)

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    bins = [18, 30, 40, 50, 60, 70, 90]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data.age, bins, labels=labels, include_lowest=True)
    df_model_data = pd.concat([df_model_data, pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)], axis=1)
    df_model_data.drop('age', axis=1, inplace=True)
    df_model_data.drop('age_range', axis=1, inplace=True)

    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat(
        [
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )

    # Shuffle and splitting dataset
    train_data, validation_data, test_data = np.split(
        df_model_data.sample(frac=1, random_state=1729),
        [int(0.7 * len(df_model_data)), int(0.9 * len(df_model_data))],
    )

    print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")
    
    baseline_data = df_model_data.drop([target_col], axis=1)
    
    print("## Processing complete. Exiting.")
        
    return train_data, validation_data, test_data, baseline_data

In [None]:
%%time

df = df_raw.copy()

# Call the function locally
train_data, validation_data, test_data, baseline_data = preprocess(df)

### Save result

In [26]:
print(loca_transformed_path)

./data/transformed


In [31]:
train_data.to_csv(os.path.join(loca_transformed_path, "train.csv"), index=False, header=False)
validation_data.to_csv(os.path.join(loca_transformed_path, "validation.csv"), index=False, header=False)
test_data.to_csv(os.path.join(loca_transformed_path, "test.csv"), index=False, header=False)

## Option 2:

### Load data

In [None]:
local_dataset_path = f"{local_prefix}/{dataset_raw}"
df_raw = pd.read_csv(local_dataset_path, sep=";")

In [None]:
df = df_raw.copy()

In [None]:
df

In [None]:
# s3_data_transformed_prefix = "data/transformed"

# train_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/train"
# validation_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/validation"
# test_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/test"
# baseline_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/baseline"

In [None]:
def preprocess(df_data):
    target_col = "y"
    
    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_data["not_working"] = np.where(np.in1d(df_data["job"], ["student", "retired", "unemployed"]), 1, 0)

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    bins = [18, 30, 40, 50, 60, 70, 90]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data.age, bins, labels=labels, include_lowest=True)
    df_model_data = pd.concat([df_model_data, pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)], axis=1)
    df_model_data.drop('age', axis=1, inplace=True)
    df_model_data.drop('age_range', axis=1, inplace=True)

    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat(
        [
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )

    # Shuffle and splitting dataset
    train_data, validation_data, test_data = np.split(
        df_model_data.sample(frac=1, random_state=1729),
        [int(0.7 * len(df_model_data)), int(0.9 * len(df_model_data))],
    )

    print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")
    
    baseline_data = df_model_data.drop([target_col], axis=1)
    
    print("## Processing complete. Exiting.")
        
    return train_data, validation_data, test_data, baseline_data

In [None]:
s3_root_uri = f"s3://{bucket_name}/{s3_prefix}"
s3_root_uri

In [None]:
%%writefile requirements.txt
pandas
numpy
scikit-learn

In [None]:
sklearn_processor = SKLearnProcessor(
    framework_version=skprocessor_framework_version,
    role=sm_role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count, 
    base_job_name='itau-processing',
    sagemaker_session=sagemaker_session,
)

In [None]:
print(f"{processing_instance_type = }")
print(f"{processing_instance_count = }")

In [None]:
# https://sagemaker.readthedocs.io/en/stable/remote_function/sagemaker.remote_function.html#remoteexecutor

with RemoteExecutor(
    dependencies="./requirements.txt",
    s3_root_uri=s3_root_uri,
    sagemaker_session=sagemaker_session,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    max_parallel_jobs=1
) as e:
    future = e.submit(preprocess, df)

In [None]:
train_data, validation_data, test_data, baseline_data = future.result()

In [None]:
train_data.head()

## Option 3:

### Load dataset

In [None]:
local_dataset_path = f"{local_prefix}/{dataset_raw}"
df_raw = pd.read_csv(local_dataset_path, sep=";")

In [None]:
df = df_raw.copy()

In [None]:
print(f"{processing_instance_type = }")
print(f"{processing_instance_count = }")

In [None]:
#Prints the location of the admin config file
print(os.path.join(site_config_dir("sagemaker"), "config.yaml"))

#Prints the location of the user config file
print(os.path.join(user_config_dir("sagemaker"), "config.yaml"))

print(sys.executable)

In [None]:
%%writefile config.yaml

SchemaVersion: '1.0'
SageMaker:
  PythonSDK:
    Modules:
      RemoteFunction:
        Dependencies: './requirements.txt'
        EnableInterContainerTrafficEncryption: true
        EnvironmentVariables: {'EnvVarKey': 'EnvVarValue'}
        IncludeLocalWorkDir: true
        CustomFileFilter: 
          IgnoreNamePatterns:
            - "*.ipynb"
            - "data"
            - "__pycache__"
        InstanceType: 'ml.m5.large'
        JobCondaEnvironment: 'python'

In [None]:
@remote(s3_root_uri=s3_root_uri, instance_type=processing_instance_type)
def preprocess(df_data):
    target_col = "y"
    
    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_data["not_working"] = np.where(
        np.in1d(df_data["job"], ["student", "retired", "unemployed"]), 1, 0
    )

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    bins = [18, 30, 40, 50, 60, 70, 90]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data.age, bins, labels=labels, include_lowest=True)
    df_model_data = pd.concat([df_model_data, pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)], axis=1)
    df_model_data.drop('age', axis=1, inplace=True)
    df_model_data.drop('age_range', axis=1, inplace=True)

    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat(
        [
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )

    # Shuffle and splitting dataset
    train_data, validation_data, test_data = np.split(
        df_model_data.sample(frac=1, random_state=1729),
        [int(0.7 * len(df_model_data)), int(0.9 * len(df_model_data))],
    )

    print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")
    
    baseline_data = df_model_data.drop([target_col], axis=1)
    
    print("## Processing complete. Exiting.")
        
    return train_data, validation_data, test_data, baseline_data

In [None]:
run_suffix = strftime('%Y-%m-%M-%S', gmtime())
run_name = f"remote-function-processing-{run_suffix}"

# Create an experiment run and run the function remotely as a SageMaker job
with Run(
    experiment_name=experiment_name,
    run_name=run_name,
    run_display_name="remote-function-processing",
    sagemaker_session=sagemaker_session
) as run:
    train_data, validation_data, test_data, baseline_data = preprocess(df)

In [None]:
train_data.head()

## Option 4:

In [8]:
!aws s3 ls {bucket_name}/{s3_prefix}/{s3_data_raw_prefix}/{dataset_raw} --recursive

2024-06-16 21:13:20    5834924 workshop/data/raw/bank-additional-full.csv


In [24]:
%%writefile preprocessing.py

import os
import pandas as pd
import numpy as np
import argparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder


def _parse_args():
    parser = argparse.ArgumentParser()
    # Data, model, and output directories
    # model_dir is always passed in from SageMaker. By default this is a S3 path under the default bucket.
    parser.add_argument('--filepath', type=str, default='/opt/ml/processing/input/')
    parser.add_argument('--filename', type=str, default='bank-additional-full.csv')
    parser.add_argument('--outputpath', type=str, default='/opt/ml/processing/output/')
    return parser.parse_known_args()


def process_data(df_data):
    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_data["not_working"] = np.where(
        np.in1d(df_data["job"], ["student", "retired", "unemployed"]), 1, 0
    )

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    bins = [18, 30, 40, 50, 60, 70, 90]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data.age, bins, labels=labels, include_lowest=True)
    df_model_data = pd.concat([df_model_data, pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)], axis=1)
    df_model_data.drop('age', axis=1, inplace=True)
    df_model_data.drop('age_range', axis=1, inplace=True)

    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat(
        [
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )
    
    return df_model_data


if __name__=="__main__":
    # Process arguments
    args, _ = _parse_args()
    target_col = "y"
    
    # process data
    df_model_data = process_data(pd.read_csv(os.path.join(args.filepath, args.filename), sep=";"))

    # Shuffle and splitting dataset
    # train_data, validation_data, test_data = np.split(
    #     df_model_data.sample(frac=1, random_state=1729),
    #     [int(0.7 * len(df_model_data)), int(0.9 * len(df_model_data))],
    # )
    
    # print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")
    
    # Save datasets locally
    # train_data.to_csv(os.path.join(args.outputpath, 'train/train.csv'), index=False, header=False)
    # validation_data.to_csv(os.path.join(args.outputpath, 'validation/validation.csv'), index=False, header=False)
    # test_data[target_col].to_csv(os.path.join(args.outputpath, 'test/test_y.csv'), index=False, header=False)
    # test_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'test/test_x.csv'), index=False, header=False)
    
    # Save the baseline dataset for model monitoring
    # df_model_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'baseline/baseline.csv'), index=False, header=False)
    
    train_data, temp_data = train_test_split(df_model_data, test_size=0.3, random_state=42, shuffle=True)
    validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, shuffle=True)

    print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")
    
    # Save datasets locally
    train_data.to_csv(os.path.join(args.outputpath, 'train/train.csv'), index=False, header=True)
    validation_data.to_csv(os.path.join(args.outputpath, 'validation/validation.csv'), index=False, header=True)
    test_data[target_col].to_csv(os.path.join(args.outputpath, 'test/test_y.csv'), index=False, header=True)
    test_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'test/test_x.csv'), index=False, header=True)
    
    # Save the baseline dataset for model monitoring
    df_model_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'baseline/baseline.csv'), index=False, header=True)
    
    print("## Processing complete. Exiting.")

Overwriting preprocessing.py


In [25]:
s3_data_transformed_prefix = "data/transformed"

train_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/train"
validation_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/validation"
test_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/test"
baseline_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/baseline"

In [26]:
print(f"{train_s3_url = }")
print(f"{validation_s3_url = }")
print(f"{test_s3_url = }")
print(f"{baseline_s3_url = }")

train_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop/data/transformed/train'
validation_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop/data/transformed/validation'
test_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop/data/transformed/test'
baseline_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop/data/transformed/baseline'


In [11]:
%store train_s3_url
%store validation_s3_url
%store test_s3_url
%store baseline_s3_url

Stored 'train_s3_url' (str)
Stored 'validation_s3_url' (str)
Stored 'test_s3_url' (str)
Stored 'baseline_s3_url' (str)


### Create a experiment run

In [20]:
run_suffix = strftime('%Y-%m-%M-%S', gmtime())
run_name = f"preprocessing-{run_suffix}"

with Run(experiment_name=experiment_name,
         run_name=run_name,
         run_display_name="preprocessing",
         sagemaker_session=sagemaker_session
    ) as run:
    run.log_parameters(
        {
            "train": 0.7,
            "validate": 0.2,
            "test": 0.1
        }
    )
   
    experiment_config = run.experiment_config
    time.sleep(5)

### Create a processor

In [21]:
skprocessor_framework_version = "1.2-1"
processing_instance_type = "ml.m5.large"
processing_instance_count = 1

In [27]:
sklearn_processor = SKLearnProcessor(
    framework_version=skprocessor_framework_version,
    role=sm_role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count, 
    base_job_name='itau-processing',
    sagemaker_session=sagemaker_session,
)

processing_inputs = [
        ProcessingInput(
            source=input_s3_url, 
            destination="/opt/ml/processing/input",
            s3_input_mode="File",
            s3_data_distribution_type="ShardedByS3Key"
        )
    ]

processing_outputs = [
        ProcessingOutput(
            output_name="train_data", 
            source="/opt/ml/processing/output/train",
            destination=train_s3_url,
        ),
        ProcessingOutput(
            output_name="validation_data", 
            source="/opt/ml/processing/output/validation", 
            destination=validation_s3_url
        ),
        ProcessingOutput(
            output_name="test_data", 
            source="/opt/ml/processing/output/test", 
            destination=test_s3_url
        ),
        ProcessingOutput(
            output_name="baseline_data", 
            source="/opt/ml/processing/output/baseline", 
            destination=baseline_s3_url
        ),
    ]

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [28]:
try:
    sklearn_processor.run(
        inputs=processing_inputs,
        outputs=processing_outputs,
        code='preprocessing.py',
        wait=True,
        experiment_config=experiment_config,
        # arguments = ['arg1', 'arg2'],
    )

except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == 'AccessDeniedException':
        print(f"Ignore AccessDeniedException: {e.response['Error']['Message']} because of the slow resource tag auto propagation")
    else:
        raise e

INFO:sagemaker:Creating processing-job with name itau-processing-2024-06-17-23-33-55-180


............[34mData split > train:(28831, 65) | validation:(6178, 65) | test:(6179, 65)[0m
[34m## Processing complete. Exiting.[0m



In [29]:
# If you set wait to False in the previous code cell, wait until the job completes
while client_sagemaker.describe_processing_job(ProcessingJobName=sklearn_processor._current_job_name)["ProcessingJobStatus"] != "Completed":
    time.sleep(10)
    print(f"Wait until {sklearn_processor._current_job_name} completed")
print("Finished!")

Finished!


In [30]:
# list the uploaded files
!aws s3 ls {bucket_name}/{s3_prefix}/{s3_data_transformed_prefix} --recursive

2024-06-17 23:36:06    6010885 workshop/data/transformed/baseline/baseline.csv
2024-06-17 23:36:06     901714 workshop/data/transformed/test/test_x.csv
2024-06-17 23:36:06      12360 workshop/data/transformed/test/test_y.csv
2024-06-17 23:36:06    4266258 workshop/data/transformed/train/train.csv
2024-06-17 23:36:06     914747 workshop/data/transformed/validation/validation.csv


## Option 5:

In [21]:
!aws s3 ls {bucket_name}/{s3_prefix}/{s3_data_raw_prefix}/{dataset_raw} --recursive

2024-06-16 21:13:20    5834924 workshop/data/raw/bank-additional-full.csv


In [None]:
s3_prefix_fs = 'feature-store'
feature_group_name = 'itau-feature-group'

In [201]:
%%writefile preprocessing_fs.py

import os
import sys
import time
import subprocess
import pandas as pd
import numpy as np
import argparse
import boto3
# import sagemaker
from datetime import datetime, timezone, date
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
# from sagemaker import get_execution_role
# from sagemaker.feature_store.feature_group import FeatureGroup
# from sagemaker.feature_store.inputs import FeatureParameter, TableFormatEnum
# from sagemaker.feature_store.feature_definition import StringFeatureDefinition


class FeatureGroupHandler:
    
    def __init__(self, sm_session, name: str, sm_role):
        self.sagemaker_session = sm_session
        self.client_sm = self.create_client_sm_fs(sm_session)
        self.name = name
        self.sm_role = sm_role
        self._feature_group = FeatureGroup(name=name, sagemaker_session=sm_session)

    @property
    def feature_group(self):
        return self._feature_group

    def create_client_sm_fs(self, sm_session):
        return sm_session.boto_session.client(
            service_name="sagemaker-featurestore-runtime",
            region_name=sm_session.boto_session.region_name
        )

    def check_if_exist(self) -> bool:
        feature_groups = client_sagemaker.search(Resource="FeatureGroup")
        fg_names = [ft['FeatureGroup']['FeatureGroupName'] for ft in feature_groups['Results']]
        return self.name in fg_names

    def get_or_create_by_df(self, df: pd.DataFrame, bucket: str, prefix: str, ft_id_name: str, ft_time_name: str = "event_time"):
        # self._feature_group = FeatureGroup(name=self.name, sagemaker_session=self.sagemaker_session)
        if not check_if_exist():
            self._feature_group.load_feature_definitions(data_frame=df)
            self.create(bucket, prefix, ft_id_name, ft_time_name)
            self.wait_for_creation_complete()

    def create(self, bucket: str, prefix: str, ft_id_name: str, ft_time_name: str):
        self._feature_group.create(
            s3_uri=f's3://{bucket}/{prefix}/{self.name}', 
            record_identifier_name=ft_id_name, 
            event_time_feature_name=ft_time_name, 
            role_arn=self.sm_role, 
            enable_online_store=False,
            table_format=TableFormatEnum.ICEBERG 
        )

    def wait_for_creation_complete(self):
        status = self._feature_group.describe().get('FeatureGroupStatus')
        while status == 'Creating':
            print(f'Waiting for feature group: {self._feature_group.name} to be created ...')
            time.sleep(5)
            status = self._feature_group.describe().get('FeatureGroupStatus')
        if status != 'Created':
            raise SystemExit(f'Failed to create feature group {self._feature_group.name}: {status}')
        print(f'FeatureGroup {self._feature_group.name} was successfully created.')

    def update_feature(self, ft_name: str, ft_desc: str):
        self._feature_group.update_feature_metadata(
            feature_name=ft_name,
            description=ft_desc,
            # feature_additions=[StringFeatureDefinition(ft_name)]
            # parameter_additions=[FeatureParameter(key="idType", value="primarykey")]
        )

    def ingest_data(self, df: pd.DataFrame, max_workers: int = 10, wait: bool = True):
        self._feature_group.ingest(
            data_frame=df,
            max_workers=max_workers,
            wait=wait
        )

    def get_description(self):
        return self._feature_group.describe()

    def get_feature_description(self, ft_name: str):
        return self._feature_group.describe_feature_metadata(ft_name)

    def get_record(self, ft_name):
        return self.sagemaker_session.boto_session.client(
            service_name="sagemaker-featurestore-runtime",
            region_name=self.sagemaker_session.boto_session.region_name
        ).get_record(
            FeatureGroupName=self.name,
            RecordIdentifierValueAsString=str(ft_name)
        )

    def delete(self):
        self._feature_group.delete()


def _parse_args():
    parser = argparse.ArgumentParser()
    # Data, model, and output directories
    # model_dir is always passed in from SageMaker. By default this is a S3 path under the default bucket.
    parser.add_argument('--filepath', type=str, default='/opt/ml/processing/input/')
    parser.add_argument('--filename', type=str, default='bank-additional-full.csv')
    parser.add_argument('--outputpath', type=str, default='/opt/ml/processing/output/')
    parser.add_argument('--bucket_fs', type=str, default='sagemaker-us-east-1-891377318910')
    parser.add_argument('--prefix_fs', type=str, default='workshop/feature-store')
    return parser.parse_known_args()

def generate_event_timestamp():
    naive_dt = datetime.now()
    aware_dt = naive_dt.astimezone()
    utc_dt = aware_dt.astimezone(timezone.utc)
    event_time = utc_dt.isoformat(timespec='milliseconds')
    event_time = event_time.replace('+00:00', 'Z')
    return event_time

def convert_col_name(col):
    return col.replace('.', '_').replace('-', '_').rstrip('_')

def process_data(df_data):
    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_data["not_working"] = np.where(
        np.in1d(df_data["job"], ["student", "retired", "unemployed"]), 1, 0
    )

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    bins = [18, 30, 40, 50, 60, 70, 90]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data.age, bins, labels=labels, include_lowest=True)
    df_model_data = pd.concat([df_model_data, pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)], axis=1)
    df_model_data.drop('age', axis=1, inplace=True)
    df_model_data.drop('age_range', axis=1, inplace=True)

    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat(
        [
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )
    
    return df_model_data


if __name__=="__main__":
    args, _ = _parse_args()
    target_col = "y"
    
    region = boto3.Session().region_name
    if region is None:
        region = "us-east-1"
    boto_session = boto3.Session(region_name=region)
    os.environ['AWS_DEFAULT_REGION'] = region
    
    subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker==2.135.0"])
    import sagemaker
    from sagemaker import get_execution_role
    from sagemaker.session import Session
    from sagemaker.feature_store.feature_group import FeatureGroup
    from sagemaker.feature_store.inputs import FeatureParameter, TableFormatEnum
    from sagemaker.feature_store.feature_definition import StringFeatureDefinition
    
    sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
    featurestore_runtime = boto_session.client(service_name="sagemaker-featurestore-runtime", region_name=region)

    sagemaker_session = Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_featurestore_runtime_client=featurestore_runtime,
    )
    
    # sagemaker_session = sagemaker.Session()
    sagemaker_role = sagemaker.get_execution_role()
    feature_group_name = "itau-feature-group"
    
    # process data
    df_model_data = process_data(pd.read_csv(os.path.join(args.filepath, args.filename), sep=";"))
    
    df_train, df_temp = train_test_split(df_model_data, test_size=0.3, random_state=42)
    df_valid, df_test = train_test_split(df_temp, test_size=0.5, random_state=42) 
    df_model_data['set'] = 'train'
    df_model_data.loc[df_valid.index, 'set'] = 'validation'
    df_model_dataf.loc[df_test.index, 'set'] = 'test'
    
    del df_train
    del df_temp
    del df_valid
    del df_test
    
    df_model_data['event_time'] = generate_event_timestamp()
    df_model_data['record_id'] = [f'R{i}' for i in range(len(df_model_data))]
    df_model_data = df_model_data.rename(columns=convert_col_name)
    df_model_data = df_model_data.convert_dtypes(infer_objects=True, convert_boolean=False)
    df_model_data['record_id'] = df_model_data['record_id'].astype('string')
    df_model_data['event_time'] = df_model_data['event_time'].astype('string')
    df_model_data=df_model_data.astype(str) # Nao faca isso

    feature_group = FeatureGroupHandler(
        sm_session = sagemaker_session,
        name = feature_group_name,
        sm_role = sagemaker_role
    )

    feature_group.get_or_create_by_df(
        df=df_model_data,
        bucket=args.bucket_fs,
        prefix=args.prefix_fs,
        ft_id_name="record_id",
        ft_time_name="event_time"
    )
    
    feature_group.ingest_data(df_model_data)

    # train_data, temp_data = train_test_split(df_model_data, test_size=0.3, random_state=42, shuffle=True)
    # validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, shuffle=True)
    
    # Save datasets locally
    # train_data.to_csv(os.path.join(args.outputpath, 'train/train.csv'), index=False, header=True)
    # validation_data.to_csv(os.path.join(args.outputpath, 'validation/validation.csv'), index=False, header=True)
    # test_data[target_col].to_csv(os.path.join(args.outputpath, 'test/test_y.csv'), index=False, header=True)
    # test_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'test/test_x.csv'), index=False, header=True)
    
    # Save the baseline dataset for model monitoring
    # df_model_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'baseline/baseline.csv'), index=False, header=True)
    
    print("## Processing complete. Exiting.")

Writing preprocessing_fs.py


In [202]:
s3_data_transformed_prefix = "data/transformed"

train_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/train"
validation_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/validation"
test_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/test"
baseline_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/baseline"

In [203]:
print(f"{train_s3_url = }")
print(f"{validation_s3_url = }")
print(f"{test_s3_url = }")
print(f"{baseline_s3_url = }")

print(f"{input_s3_url = }")

train_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop/data/transformed/train'
validation_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop/data/transformed/validation'
test_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop/data/transformed/test'
baseline_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop/data/transformed/baseline'
input_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop/data/raw/bank-additional-full.csv'


In [204]:
skprocessor_framework_version = "1.2-1"
processing_instance_type = "ml.m5.large"
processing_instance_count = 1

In [205]:
sklearn_processor = SKLearnProcessor(
    framework_version=skprocessor_framework_version,
    role=sagemaker_role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count, 
    base_job_name='itau-processing',
    sagemaker_session=sagemaker_session,
)

processing_inputs = [
        ProcessingInput(
            source=input_s3_url, 
            destination="/opt/ml/processing/input",
            s3_input_mode="File",
            s3_data_distribution_type="ShardedByS3Key"
        )
    ]

processing_outputs = [
        ProcessingOutput(
            output_name="train_data", 
            source="/opt/ml/processing/output/train",
            destination=train_s3_url,
        ),
        ProcessingOutput(
            output_name="validation_data", 
            source="/opt/ml/processing/output/validation", 
            destination=validation_s3_url
        ),
        ProcessingOutput(
            output_name="test_data", 
            source="/opt/ml/processing/output/test", 
            destination=test_s3_url
        ),
        ProcessingOutput(
            output_name="baseline_data", 
            source="/opt/ml/processing/output/baseline", 
            destination=baseline_s3_url
        ),
    ]

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [206]:
try:
    sklearn_processor.run(
        inputs=processing_inputs,
        outputs=processing_outputs,
        code='preprocessing_fs.py',
        wait=True,
        # experiment_config=experiment_config,
        # arguments = ['arg1', 'arg2'],
    )
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == 'AccessDeniedException':
        print(f"Ignore AccessDeniedException: {e.response['Error']['Message']} because of the slow resource tag auto propagation")
    else:
        raise e

INFO:sagemaker:Creating processing-job with name itau-processing-2024-06-23-22-59-52-179


..............[34mCollecting sagemaker==2.135.0
  Downloading sagemaker-2.135.0.tar.gz (673 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 673.8/673.8 kB 33.7 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'[0m
[34mCollecting attrs<23,>=20.3.0 (from sagemaker==2.135.0)
  Downloading attrs-22.2.0-py3-none-any.whl.metadata (13 kB)[0m
[34mCollecting google-pasta (from sagemaker==2.135.0)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)[0m
[34mCollecting protobuf3-to-dict<1.0,>=0.1.5 (from sagemaker==2.135.0)
  Downloading protobuf3-to-dict-0.1.5.tar.gz (3.5 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'[0m
[34mCollecting smdebug_rulesconfig==1.0.1 (from sagemaker==2.135.0)
  Downloading smdebug_rulesconfig-1.0.1-py2.py3-none-any.whl.metadata (943 bytes)[0m
[34mCollecting importlib-metadata<5.0,>=1.4.0 (from sagemaker==2.135.0)
 

In [207]:
while client_sagemaker.describe_processing_job(ProcessingJobName=sklearn_processor._current_job_name)["ProcessingJobStatus"] != "Completed":
    time.sleep(10)
    print(f"Wait until {sklearn_processor._current_job_name} completed")
print("Finished!")

Finished!
