# Feature Engineering

In [1]:
import os
import re
import sys
import math
import json
import time
import warnings
import boto3
import botocore
import sagemaker
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, timezone, date
from IPython.display import Image, display
from IPython.display import FileLink, FileLinks
from platformdirs import site_config_dir, user_config_dir
from time import gmtime, strftime
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler

from sagemaker import Session
from sagemaker import get_execution_role
from sagemaker.experiments.run import Run, load_run
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.spark.processing import PySparkProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
from sagemaker.remote_function import remote, RemoteExecutor
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_store import FeatureStore
from sagemaker.feature_store.inputs import FeatureParameter, TableFormatEnum
from sagemaker.feature_store.feature_definition import StringFeatureDefinition
# from sagemaker.feature_store.feature_processor import CSVDataSource, feature_processor, to_pipeline
from sagemaker.tuner import (
    CategoricalParameter, ContinuousParameter,
    HyperparameterTuner, IntegerParameter,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
print("pandas:", pd.__version__)
print("sklearn:", sklearn.__version__)
print("sagemaker:", sagemaker.__version__)
print("boto3:", boto3.__version__)

pandas: 2.1.4
sklearn: 1.4.2
sagemaker: 2.214.3
boto3: 1.34.51


In [5]:
print(os.environ)
# os.environ['SAGEMAKER_JOB_CONDA_ENV']

environ({'SHELL': '/bin/bash', 'SUPERVISOR_GROUP_NAME': 'jupyterlabserver', 'MAMBA_USER_ID': '57439', 'SAGEMAKER_SPACE_NAME': 'jupyter-env', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/_sagemaker-instance-credentials/1672af7708e92945a3c701ea0dea923c9e4727c53856769ae12c4f92815f0674', 'ENV_NAME': 'base', 'MAMBA_USER': 'sagemaker-user', 'SUPERVISOR_SERVER_URL': 'unix:///var/run/supervisord/supervisor.sock', 'HOSTNAME': 'default', 'SAGEMAKER_APP_TYPE_LOWERCASE': 'jupyterlab', 'SAGEMAKER_LOG_FILE': '/var/log/studio/jupyterlab.log', 'AWS_DEFAULT_REGION': 'us-east-1', 'XML_CATALOG_FILES': 'file:///opt/conda/etc/xml/catalog file:///etc/xml/catalog', 'EDITOR': 'nano', 'AWS_REGION': 'us-east-1', 'PWD': '/home/sagemaker-user', 'GSETTINGS_SCHEMA_DIR': '/opt/conda/share/glib-2.0/schemas', 'CONDA_PREFIX': '/opt/conda', 'REGION_NAME': 'us-east-1', 'MAMBA_ROOT_PREFIX': '/opt/conda', 'GSETTINGS_SCHEMA_DIR_CONDA_BACKUP': '', 'AWS_INTERNAL_IMAGE_OWNER': 'jupyterlab', 'HOME': '/home/sagemaker-user', 'LANG

In [3]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100)

warnings.filterwarnings("ignore")

## Preparacao

In [4]:
def save_file(file_name: str, content: str, local: str = ".") -> None:
    with open(f"{local}/{file_name}", 'w') as f:
        f.write(content.strip())


def seconds_to_min(secs: float) -> str:
    minute, perc_sec = str(float(secs/60)).split('.')
    sec = math.ceil((int(perc_sec[:2])/100) * 60)
    return f"{minute:>02}:{sec:>02}"

In [5]:
boto_session = boto3.Session()
client_sagemaker = boto_session.client("sagemaker")
client_s3 = boto_session.client("s3")
sagemaker_session = sagemaker.Session()
sagemaker_role = sagemaker.get_execution_role()
bucket_name = sagemaker_session.default_bucket()

loca_transformed_path = "./data/transformed"
%store loca_transformed_path

Stored 'loca_transformed_path' (str)


In [7]:
%store -r

%store

Stored variables and their in-db values:
bucket_name                         -> 'sagemaker-us-east-1-891377318910'
bucket_prefix                       -> 'from-idea-to-production/xgboost'
col_target                          -> 'y'
dataset_file_local_path             -> 'data/bank-additional/bank-additional-full.csv'
dataset_raw                         -> 'bank-additional-full.csv'
domain_id                           -> 'd-ehxji4qaadry'
experiment_name                     -> 'itau-experiment-2024-10-13-18-09-47'
initialized                         -> True
input_s3_url                        -> 's3://sagemaker-us-east-1-891377318910/workshop_v2
loca_transformed_path               -> './data/transformed'
local_prefix                        -> './data/raw'
output_s3_url                       -> 's3://sagemaker-us-east-1-891377318910/workshop_v2
region                              -> 'us-east-1'
region_name                         -> 'us-east-1'
s3_data_raw_prefix                  -> 'data/

# Create an experiment

In [13]:
experiment_name = f"itau-experiment-{datetime.strftime(datetime.now(), '%Y-%m-%d-%H-%M-%S')}"
print(experiment_name)

itau-experiment-2024-10-13-18-09-47


In [14]:
%store experiment_name

Stored 'experiment_name' (str)


# Feature engineering

- selecao de atributos
- criacao e extracao de atributos
- transformacao de atributos
- codificacao de atributos categoricos
- tratamento de valores ausentes
- deteccao e tratamento de outliers
- normalizacao ou padronizacao
- tratamento de dados desbalanceados

### Load dataset

In [8]:
local_dataset_path = f"{local_prefix}/{dataset_raw}"

df_raw = pd.read_csv(local_dataset_path, sep=";")

In [9]:
df_raw.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


## Opcao 1 - execucao local:

### Option 1.1

In [45]:
start_time = time.perf_counter()

df = df_raw.copy()

# remove unnecessary data
df = df.drop(
    ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
    axis=1,
)

# Indicator variable to capture when pdays takes a value of 999
df["no_previous_contact"] = np.where(df["pdays"] == 999, 1, 0)

# Indicator for individuals not actively employed
df["not_working"] = np.where(np.in1d(df["job"], ["student", "retired", "unemployed"]), 1, 0)

bins = [18, 30, 40, 50, 60, 70, 90]
labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

df['age_range'] = pd.cut(df.age, bins, labels=labels, include_lowest=True)
df = pd.concat([df, pd.get_dummies(df['age_range'], prefix='age', dtype=int)], axis=1)
df.drop('age', axis=1, inplace=True)
df.drop('age_range', axis=1, inplace=True)

scaled_features = ['pdays', 'previous', 'campaign']
df[scaled_features] = MinMaxScaler().fit_transform(df[scaled_features])

# Convert categorical variables to sets of indicators
df = pd.get_dummies(df, dtype=int)

# Replace "y_no" and "y_yes" with a single label column, and bring it to the front
df = pd.concat([
        df["y_yes"].rename(col_target),
        df.drop(["y_no", "y_yes"], axis=1),
    ],
    axis=1,
)

# Shuffle and splitting dataset
train_data, validation_data, test_data = np.split(
    df.sample(frac=1, random_state=1729),
    [int(0.7 * len(df)), int(0.9 * len(df))],
)

print(f"Data split -> train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")

print("--------------------------------------------------")
time_execution = round(time.perf_counter() - start_time, 3)
print(f"Tempo de execucao: {time_execution}")

Data split -> train:(28831, 65) | validation:(8238, 65) | test:(4119, 65)
--------------------------------------------------
Tempo de execucao: 0.354


### Option 1.2

In [10]:
def preprocess(df_data):
    target_col = "y"

    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_data["not_working"] = np.where(np.in1d(df_data["job"], ["student", "retired", "unemployed"]), 1, 0)

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    bins = [18, 30, 40, 50, 60, 70, 90]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data.age, bins, labels=labels, include_lowest=True)
    df_model_data = pd.concat([df_model_data, pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)], axis=1)
    df_model_data.drop('age', axis=1, inplace=True)
    df_model_data.drop('age_range', axis=1, inplace=True)

    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat([
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )

    # Shuffle and splitting dataset
    train_data, validation_data, test_data = np.split(
        df_model_data.sample(frac=1, random_state=1729),
        [int(0.7 * len(df_model_data)), int(0.9 * len(df_model_data))],
    )

    print(f"Data split -> train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")

    baseline_data = df_model_data.drop([target_col], axis=1)

    return train_data, validation_data, test_data, baseline_data

In [11]:
df = df_raw.copy()

# Call the function locally
train_data, validation_data, test_data, baseline_data = preprocess(df)

Data split -> train:(28831, 65) | validation:(8238, 65) | test:(4119, 65)


In [12]:
print(loca_transformed_path)

./data/transformed


In [15]:
train_data.to_csv(os.path.join(loca_transformed_path, "train.csv"), index=False, header=False)
validation_data.to_csv(os.path.join(loca_transformed_path, "validation.csv"), index=False, header=False)
test_data.to_csv(os.path.join(loca_transformed_path, "test.csv"), index=False, header=False)

## Opcao 2 - execucao remota:

### Load data

In [15]:
local_dataset_path = f"{local_prefix}/{dataset_raw}"
df_raw = pd.read_csv(local_dataset_path, sep=";")

### Opcao 2.1

In [None]:
df = df_raw.copy()

In [15]:
def preprocess(df_data):
    target_col = "y"

    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_data["not_working"] = np.where(np.in1d(df_data["job"], ["student", "retired", "unemployed"]), 1, 0)

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    bins = [18, 30, 40, 50, 60, 70, 90]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data.age, bins, labels=labels, include_lowest=True)
    df_model_data = pd.concat([df_model_data, pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)], axis=1)
    df_model_data.drop('age', axis=1, inplace=True)
    df_model_data.drop('age_range', axis=1, inplace=True)

    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat([
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )

    # Shuffle and splitting dataset
    train_data, validation_data, test_data = np.split(
        df_model_data.sample(frac=1, random_state=1729),
        [int(0.7 * len(df_model_data)), int(0.9 * len(df_model_data))],
    )

    print(f"Data split -> train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")

    baseline_data = df_model_data.drop([target_col], axis=1)

    return train_data, validation_data, test_data, baseline_data

In [None]:
# %%writefile environment.yml
# name: remote_conda_env
# channels:
#   - defaults
# dependencies:
#   - python=3.10
#   - pandas
#   - pip:
#       - sagemaker>=2.156.0,<3

In [16]:
# %%writefile requirements.txt
# pandas
# numpy
# scikit-learn

In [17]:
requirements = f"""
pandas=={pd.__version__}
numpy=={np.__version__}
scikit-learn=={sklearn.__version__}
"""

save_file('requirements.txt', requirements)

In [51]:
# https://sagemaker.readthedocs.io/en/stable/remote_function/sagemaker.remote_function.html#remoteexecutor

s3_root_uri = f"s3://{bucket_name}/{s3_prefix}"
print(f"{s3_root_uri = }")

processing_instance_type = "ml.m5.large"
processing_instance_count = 1

with RemoteExecutor(
    dependencies="./requirements.txt", # "./environment.yml"
    s3_root_uri=s3_root_uri,
    sagemaker_session=sagemaker_session,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    max_parallel_jobs=1,
    # keep_alive_period_in_seconds=30,
) as executor:
    future = executor.submit(preprocess, df)
    # futures = [executor.submit(preprocess, x) for x in [1, 2, 3]]

2024-10-11 01:00:51,735 sagemaker.remote_function INFO     Serializing function code to s3://sagemaker-us-east-1-891377318910/workshop_v2/preprocess-2024-10-11-01-00-51-735/function
2024-10-11 01:00:51,870 sagemaker.remote_function INFO     Serializing function arguments to s3://sagemaker-us-east-1-891377318910/workshop_v2/preprocess-2024-10-11-01-00-51-735/arguments
2024-10-11 01:00:52,463 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmpvl4y__xy/temp_workspace/sagemaker_remote_function_workspace/requirements.txt'
2024-10-11 01:00:52,466 sagemaker.remote_function INFO     Successfully created workdir archive at '/tmp/tmpvl4y__xy/workspace.zip'
2024-10-11 01:00:52,541 sagemaker.remote_function INFO     Successfully uploaded workdir to 's3://sagemaker-us-east-1-891377318910/workshop_v2/preprocess-2024-10-11-01-00-51-735/sm_rf_user_ws/workspace.zip'
2024-10-11 01:00:52,543 sagemaker.remote_function INFO     Creating job: preprocess-2024-10-1

--------------------------------------------------
Tempo de execucao: 251.69


In [52]:
# [future.result() for future in futures]
train_data, validation_data, test_data, baseline_data = future.result()

2024-10-11 01:04:44 Starting - Preparing the instances for training
2024-10-11 01:04:44 Downloading - Downloading the training image
2024-10-11 01:04:44 Training - Training image download completed. Training in progress.
2024-10-11 01:04:44 Uploading - Uploading generated training model
2024-10-11 01:04:44 Completed - Training job completed[34mINFO: CONDA_PKGS_DIRS is set to '/opt/ml/sagemaker/warmpoolcache/sm_remotefunction_user_dependencies_cache/conda/pkgs'[0m
[34mINFO: PIP_CACHE_DIR is set to '/opt/ml/sagemaker/warmpoolcache/sm_remotefunction_user_dependencies_cache/pip'[0m
[34mINFO: Bootstraping runtime environment.[0m
[34m2024-10-11 01:04:15,508 sagemaker.remote_function INFO     The job is running on non-root user: sagemaker-user. Adding write permissions to the following job output directories: ['/opt/ml/output', '/opt/ml/model', '/tmp'].[0m
[34m2024-10-11 01:04:15,508 sagemaker.remote_function INFO     Executing 'sudo chmod -R 777 /opt/ml/output /opt/ml/model /tmp'.[

In [None]:
print(f"baseline_data   - shape: {baseline_data.shape}")
print(f"train_data      - shape: {train_data.shape}")
print(f"test_data:      - shape: {test_data.shape}")
print(f"validation_data - shape: {validation_data.shape}")

In [54]:
train_data.head()

Unnamed: 0,y,campaign,pdays,previous,no_previous_contact,not_working,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,marital_unknown,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,default_no,default_unknown,default_yes,housing_no,housing_unknown,housing_yes,loan_no,loan_unknown,loan_yes,contact_cellular,contact_telephone,month_apr,month_aug,month_dec,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
40949,0,0.036364,1.0,0.0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0
9332,0,0.018182,1.0,0.0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0
32286,0,0.018182,1.0,0.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0
3925,0,0.036364,1.0,0.0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
9406,0,0.018182,1.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0


### Opcao 2.2

In [16]:
requirements = f"""
pandas=={pd.__version__}
numpy=={np.__version__}
scikit-learn=={sklearn.__version__}
"""

save_file('requirements.txt', requirements)

In [30]:
print(sys.executable)

# Print the location of the admin config file
print(os.path.join(site_config_dir("sagemaker"), "config.yaml"))

# Print the location of the user config file
print(os.path.join(user_config_dir("sagemaker"), "config.yaml"))

/opt/conda/bin/python
/etc/xdg/sagemaker/config.yaml
/home/sagemaker-user/.config/sagemaker/config.yaml


In [32]:
%%writefile config.yaml

SchemaVersion: '1.0'
SageMaker:
    PythonSDK:
        Modules:
              RemoteFunction:
                # role arn is not required if in SageMaker Notebook instance or SageMaker Studio
                # Uncomment the following line and replace with the right execution role if in a local IDE
                # RoleArn: <replace the role arn here>
                # JobCondaEnvironment: 'python'
                # IncludeLocalWorkDir: true
                # EnvironmentVariables: {'EnvVarKey': 'EnvVarValue'}
                # EnableInterContainerTrafficEncryption: true
                InstanceType: ml.m5.large # 'ml.m5.large'
                Dependencies: ./requirements.txt
                IncludeLocalWorkDir: true
                CustomFileFilter:
                      IgnoreNamePatterns: # files or directories to ignore
                          - "*.ipynb" # all notebook files
                          - "__pycache__"
                          - "data"

Overwriting config.yaml


In [18]:
# Set path to config file
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()

In [24]:
@remote(keep_alive_period_in_seconds=600)
def preprocess(df_data):
    target_col = "y"

    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_data["not_working"] = np.where(np.in1d(df_data["job"], ["student", "retired", "unemployed"]), 1, 0)

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    bins = [18, 30, 40, 50, 60, 70, 90]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data.age, bins, labels=labels, include_lowest=True)
    df_model_data = pd.concat([df_model_data, pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)], axis=1)
    df_model_data.drop('age', axis=1, inplace=True)
    df_model_data.drop('age_range', axis=1, inplace=True)

    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat([
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )

    # Shuffle and splitting dataset
    train_data, validation_data, test_data = np.split(
        df_model_data.sample(frac=1, random_state=1729),
        [int(0.7 * len(df_model_data)), int(0.9 * len(df_model_data))],
    )

    print(f"Data split -> train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")

    baseline_data = df_model_data.drop([target_col], axis=1)

    return train_data, validation_data, test_data, baseline_data

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.InstanceType


In [25]:
df = df_raw.copy()

In [26]:
train_data, validation_data, test_data, baseline_data = preprocess(df)

2024-10-13 16:05:03,171 sagemaker.remote_function INFO     Serializing function code to s3://sagemaker-us-east-1-891377318910/preprocess-2024-10-13-16-05-03-170/function
2024-10-13 16:05:03,249 sagemaker.remote_function INFO     Serializing function arguments to s3://sagemaker-us-east-1-891377318910/preprocess-2024-10-13-16-05-03-170/arguments
2024-10-13 16:05:03,544 sagemaker.remote_function INFO     Copied user workspace to '/tmp/tmpqxr6c0mw/temp_workspace/sagemaker_remote_function_workspace'
2024-10-13 16:05:03,548 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmpqxr6c0mw/temp_workspace/sagemaker_remote_function_workspace/requirements.txt'
2024-10-13 16:05:03,669 sagemaker.remote_function INFO     Successfully created workdir archive at '/tmp/tmpqxr6c0mw/workspace.zip'
2024-10-13 16:05:03,794 sagemaker.remote_function INFO     Successfully uploaded workdir to 's3://sagemaker-us-east-1-891377318910/preprocess-2024-10-13-16-05-03-170/sm_r

2024-10-13 16:05:04 Starting - Starting the training job...
2024-10-13 16:05:19 Starting - Preparing the instances for training...
2024-10-13 16:05:50 Downloading - Downloading input data...
2024-10-13 16:06:15 Downloading - Downloading the training image.........
2024-10-13 16:07:57 Training - Training image download completed. Training in progress..[34mINFO: CONDA_PKGS_DIRS is set to '/opt/ml/sagemaker/warmpoolcache/sm_remotefunction_user_dependencies_cache/conda/pkgs'[0m
[34mINFO: PIP_CACHE_DIR is set to '/opt/ml/sagemaker/warmpoolcache/sm_remotefunction_user_dependencies_cache/pip'[0m
[34mINFO: Bootstraping runtime environment.[0m
[34m2024-10-13 16:07:58,493 sagemaker.remote_function INFO     The job is running on non-root user: sagemaker-user. Adding write permissions to the following job output directories: ['/opt/ml/output', '/opt/ml/model', '/tmp'].[0m
[34m2024-10-13 16:07:58,493 sagemaker.remote_function INFO     Executing 'sudo chmod -R 777 /opt/ml/output /opt/ml/mod

In [27]:
print(f"baseline_data   - shape: {baseline_data.shape}")
print(f"train_data      - shape: {train_data.shape}")
print(f"test_data:      - shape: {test_data.shape}")
print(f"validation_data - shape: {validation_data.shape}")

baseline_data   - shape: (41188, 64)
train_data      - shape: (28831, 65)
test_data:      - shape: (4119, 65)
validation_data - shape: (8238, 65)


In [28]:
train_data.head()

Unnamed: 0,y,campaign,pdays,previous,no_previous_contact,not_working,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,marital_unknown,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,default_no,default_unknown,default_yes,housing_no,housing_unknown,housing_yes,loan_no,loan_unknown,loan_yes,contact_cellular,contact_telephone,month_apr,month_aug,month_dec,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
40949,0,0.036364,1.0,0.0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0
9332,0,0.018182,1.0,0.0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0
32286,0,0.018182,1.0,0.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0
3925,0,0.036364,1.0,0.0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
9406,0,0.018182,1.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0


### Opcao 2.3

In [49]:
requirements = f"""
pandas=={pd.__version__}
numpy=={np.__version__}
scikit-learn=={sklearn.__version__}
"""

save_file('requirements.txt', requirements)

In [59]:
%%writefile config.yaml

SchemaVersion: '1.0'
SageMaker:
  PythonSDK:
    Modules:
      RemoteFunction:
        # JobCondaEnvironment: 'python' # nome do ambiente conda
        IncludeLocalWorkDir: true
        EnvironmentVariables: {'EnvVarKey': 'EnvVarValue'}
        EnableInterContainerTrafficEncryption: true
        Dependencies: ./requirements.txt
        IncludeLocalWorkDir: true
        CustomFileFilter:
          IgnoreNamePatterns: # files or directories to ignore
          - "*.ipynb" # all notebook files
          - "__pycache__"
          - "data"

Overwriting config.yaml


In [60]:
@remote(instance_type="ml.m5.large",
        # use_spot_instances=False,
        max_runtime_in_seconds=600,
        max_wait_time_in_seconds=900)
def preprocess(df_data):
    target_col = "y"

    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_data["not_working"] = np.where(np.in1d(df_data["job"], ["student", "retired", "unemployed"]), 1, 0)

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    bins = [18, 30, 40, 50, 60, 70, 90]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data.age, bins, labels=labels, include_lowest=True)
    df_model_data = pd.concat([df_model_data, pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)], axis=1)
    df_model_data.drop('age', axis=1, inplace=True)
    df_model_data.drop('age_range', axis=1, inplace=True)

    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat([
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )

    # Shuffle and splitting dataset
    train_data, validation_data, test_data = np.split(
        df_model_data.sample(frac=1, random_state=1729),
        [int(0.7 * len(df_model_data)), int(0.9 * len(df_model_data))],
    )

    print(f"Data split -> train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")

    baseline_data = df_model_data.drop([target_col], axis=1)

    return train_data, validation_data, test_data, baseline_data

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.EnvironmentVariables
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.EnableInterContainerTrafficEncryption


In [61]:
df = df_raw.copy()

In [62]:
# processing_instance_type = "ml.m5.large"
# processing_instance_count = 1
# s3_root_uri = f"s3://{bucket_name}/{s3_prefix}"
# print(f"{s3_root_uri = }")

In [63]:
run_suffix = datetime.strftime(datetime.now(), '%Y-%m-%d-%H-%M-%S')
run_name = f"remote-function-processing-{run_suffix}"

# Run the function remotely as a SageMaker job
with Run(
    experiment_name=experiment_name,
    run_name=run_name,
    run_display_name="remote-function-processing",
    sagemaker_session=sagemaker_session
) as run:
    train_data, validation_data, test_data, baseline_data = preprocess(df)

2024-10-13 16:42:50,461 sagemaker.remote_function INFO     Serializing function code to s3://sagemaker-us-east-1-891377318910/preprocess-2024-10-13-16-42-50-461/function
2024-10-13 16:42:50,531 sagemaker.remote_function INFO     Serializing function arguments to s3://sagemaker-us-east-1-891377318910/preprocess-2024-10-13-16-42-50-461/arguments
2024-10-13 16:42:50,842 sagemaker.remote_function INFO     Copied user workspace to '/tmp/tmp3a16maak/temp_workspace/sagemaker_remote_function_workspace'
2024-10-13 16:42:50,847 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmp3a16maak/temp_workspace/sagemaker_remote_function_workspace/requirements.txt'
2024-10-13 16:42:50,854 sagemaker.remote_function INFO     Successfully created workdir archive at '/tmp/tmp3a16maak/workspace.zip'
2024-10-13 16:42:50,909 sagemaker.remote_function INFO     Successfully uploaded workdir to 's3://sagemaker-us-east-1-891377318910/preprocess-2024-10-13-16-42-50-461/sm_r

2024-10-13 16:42:51 Starting - Starting the training job...
2024-10-13 16:43:06 Starting - Preparing the instances for training...
2024-10-13 16:43:36 Downloading - Downloading input data...
2024-10-13 16:44:02 Downloading - Downloading the training image.........
2024-10-13 16:45:43 Training - Training image download completed. Training in progress..[34mINFO: CONDA_PKGS_DIRS is set to '/opt/ml/sagemaker/warmpoolcache/sm_remotefunction_user_dependencies_cache/conda/pkgs'[0m
[34mINFO: PIP_CACHE_DIR is set to '/opt/ml/sagemaker/warmpoolcache/sm_remotefunction_user_dependencies_cache/pip'[0m
[34mINFO: Bootstraping runtime environment.[0m
[34m2024-10-13 16:45:46,424 sagemaker.remote_function INFO     The job is running on non-root user: sagemaker-user. Adding write permissions to the following job output directories: ['/opt/ml/output', '/opt/ml/model', '/tmp'].[0m
[34m2024-10-13 16:45:46,424 sagemaker.remote_function INFO     Executing 'sudo chmod -R 777 /opt/ml/output /opt/ml/mod

In [64]:
print(f"baseline_data   - shape: {baseline_data.shape}")
print(f"train_data      - shape: {train_data.shape}")
print(f"test_data:      - shape: {test_data.shape}")
print(f"validation_data - shape: {validation_data.shape}")

baseline_data   - shape: (41188, 64)
train_data      - shape: (28831, 65)
test_data:      - shape: (4119, 65)
validation_data - shape: (8238, 65)


In [65]:
train_data.head(5)

Unnamed: 0,y,campaign,pdays,previous,no_previous_contact,not_working,age_18-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-plus,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,marital_unknown,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,default_no,default_unknown,default_yes,housing_no,housing_unknown,housing_yes,loan_no,loan_unknown,loan_yes,contact_cellular,contact_telephone,month_apr,month_aug,month_dec,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
40949,0,0.036364,1.0,0.0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0
9332,0,0.018182,1.0,0.0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0
32286,0,0.018182,1.0,0.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0
3925,0,0.036364,1.0,0.0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
9406,0,0.018182,1.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0


## Opcao 3 - Processing:

In [66]:
!aws s3 ls {bucket_name}/{s3_prefix}/{s3_data_raw_prefix}/{dataset_raw} --recursive

2024-10-11 00:28:13    5834924 workshop_v2/data/raw/bank-additional-full.csv


In [13]:
%%writefile ./code/preprocessing.py

import os
import logging
import pandas as pd
import numpy as np
import argparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

logging.basicConfig(
    format="[%(asctime)s] %(levelname)s %(name)s %(filename)s %(funcName)s %(lineno)d: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def _parse_args():
    parser = argparse.ArgumentParser()
    # Data, model, and output directories
    # model_dir is always passed in from SageMaker. By default this is a S3 path under the default bucket.
    parser.add_argument('--filepath', type=str, default='/opt/ml/processing/input/')
    parser.add_argument('--filename', type=str, default='bank-additional-full.csv')
    parser.add_argument('--outputpath', type=str, default='/opt/ml/processing/output/')
    parser.add_argument('--logger_level', type=str, default='INFO')
    return parser.parse_known_args()


def process_data(df_data):
    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_data["not_working"] = np.where(
        np.in1d(df_data["job"], ["student", "retired", "unemployed"]), 1, 0
    )

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    bins = [18, 30, 40, 50, 60, 70, 90]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data.age, bins, labels=labels, include_lowest=True)
    df_model_data = pd.concat([df_model_data, pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)], axis=1)
    df_model_data.drop('age', axis=1, inplace=True)
    df_model_data.drop('age_range', axis=1, inplace=True)

    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat(
        [
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )
    
    return df_model_data


if __name__=="__main__":
    # Process arguments
    args, _ = _parse_args()
    logger = logging.getLogger(__name__)
    logger.setLevel(args.logger_level)
    target_col = "y"
    logger.info("TESTANDO")
    # process data
    df_model_data = process_data(pd.read_csv(os.path.join(args.filepath, args.filename), sep=";"))
    
    train_data, temp_data = train_test_split(df_model_data, test_size=0.3, random_state=42, shuffle=True)
    validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, shuffle=True)

    print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")
    
    # Save datasets locally
    train_data.to_csv(os.path.join(args.outputpath, 'train/train.csv'), index=False, header=False)
    validation_data.to_csv(os.path.join(args.outputpath, 'validation/validation.csv'), index=False, header=False)
    test_data[target_col].to_csv(os.path.join(args.outputpath, 'test/test_y.csv'), index=False, header=False)
    test_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'test/test_x.csv'), index=False, header=False)
    
    # Save the baseline dataset for model monitoring
    df_model_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'baseline/baseline.csv'), index=False, header=True)
    
    print("## Processing complete. Exiting.")

Overwriting ./code/preprocessing.py


In [14]:
s3_data_transformed_prefix = "data/transformed"

train_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/train"
validation_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/validation"
test_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/test"
baseline_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/baseline"

print(f"{train_s3_url = }")
print(f"{validation_s3_url = }")
print(f"{test_s3_url = }")
print(f"{baseline_s3_url = }")

train_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop_v2/data/transformed/train'
validation_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop_v2/data/transformed/validation'
test_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop_v2/data/transformed/test'
baseline_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop_v2/data/transformed/baseline'


In [83]:
# %store train_s3_url
# %store validation_s3_url
# %store test_s3_url
# %store baseline_s3_url

### Create a processor

In [15]:
skprocessor_framework_version = "1.2-1"
processing_instance_type = "ml.m5.large"
processing_instance_count = 1

In [16]:
sklearn_processor = SKLearnProcessor(
    framework_version=skprocessor_framework_version,
    role=sagemaker_role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name="itau-processing",
    sagemaker_session=sagemaker_session,
)

processing_inputs = [
    ProcessingInput(
        source=input_s3_url,
        destination="/opt/ml/processing/input",
        s3_input_mode="File",
        s3_data_distribution_type="ShardedByS3Key"
    )
]

processing_outputs = [
    ProcessingOutput(
        output_name="train_data",
        source="/opt/ml/processing/output/train",
        destination=train_s3_url,
    ),
    ProcessingOutput(
        output_name="validation_data",
        source="/opt/ml/processing/output/validation",
        destination=validation_s3_url
    ),
    ProcessingOutput(
        output_name="test_data",
        source="/opt/ml/processing/output/test",
        destination=test_s3_url
    ),
    ProcessingOutput(
        output_name="baseline_data",
        source="/opt/ml/processing/output/baseline",
        destination=baseline_s3_url
    ),
]

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [17]:
try:
    sklearn_processor.run(
        inputs=processing_inputs,
        outputs=processing_outputs,
        code='code/preprocessing.py',
        wait=False,
        # experiment_config=experiment_config,
        # arguments = ['arg1', 'arg2'],
    )
    count_time = 0
    time_spleep = 20
    while client_sagemaker.describe_processing_job(
        ProcessingJobName=sklearn_processor._current_job_name
    )["ProcessingJobStatus"] != "Completed":
        time.sleep(time_spleep)
        count_time += time_spleep
        print(f"Wait until {sklearn_processor._current_job_name} completed | Time running: {seconds_to_min(count_time)} min")
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == 'AccessDeniedException':
        print(f"Ignore AccessDeniedException: {e.response['Error']['Message']} because of the slow resource tag auto propagation")
    else:
        raise e
finally:
    print("Finished!")

INFO:sagemaker:Creating processing-job with name itau-processing-2024-10-15-00-24-40-948


Wait until itau-processing-2024-10-15-00-24-40-948 completed | Time running: 00:20 min
Wait until itau-processing-2024-10-15-00-24-40-948 completed | Time running: 00:40 min
Wait until itau-processing-2024-10-15-00-24-40-948 completed | Time running: 01:00 min
Wait until itau-processing-2024-10-15-00-24-40-948 completed | Time running: 01:20 min
Wait until itau-processing-2024-10-15-00-24-40-948 completed | Time running: 01:40 min
Wait until itau-processing-2024-10-15-00-24-40-948 completed | Time running: 02:00 min
Wait until itau-processing-2024-10-15-00-24-40-948 completed | Time running: 02:20 min
Wait until itau-processing-2024-10-15-00-24-40-948 completed | Time running: 02:40 min
Finished!


In [88]:
# list the uploaded files
!aws s3 ls {bucket_name}/{s3_prefix}/{s3_data_transformed_prefix} --recursive

2024-10-13 16:50:41    6010885 workshop_v2/data/transformed/baseline/baseline.csv
2024-10-13 16:50:40     901714 workshop_v2/data/transformed/test/test_x.csv
2024-10-13 16:50:40      12360 workshop_v2/data/transformed/test/test_y.csv
2024-10-13 16:50:40    4266258 workshop_v2/data/transformed/train/train.csv
2024-10-13 16:50:40     914747 workshop_v2/data/transformed/validation/validation.csv


## Opcao 5 - Processing e Feature store:

In [19]:
!aws s3 ls {bucket_name}/{s3_prefix}/{s3_data_raw_prefix}/{dataset_raw} --recursive

2024-10-11 00:28:13    5834924 workshop_v2/data/raw/bank-additional-full.csv


In [74]:
s3_prefix_fs = 'feature-store'
feature_group_name = 'itau-feature-group'

In [80]:
%%writefile code/preprocessing_fs.py

import os
import sys
import time
import subprocess
import logging
import pandas as pd
import numpy as np
import argparse
import boto3
# import sagemaker
from datetime import datetime, timezone, date
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
# from sagemaker import get_execution_role
# from sagemaker.feature_store.feature_group import FeatureGroup
# from sagemaker.feature_store.inputs import FeatureParameter, TableFormatEnum
# from sagemaker.feature_store.feature_definition import StringFeatureDefinition

# logging.basicConfig(
#     format="[%(asctime)s] %(levelname)s %(name)s %(filename)s %(funcName)s %(lineno)d: %(message)s",
#     datefmt="%Y-%m-%d %H:%M:%S",
# )


class FeatureGroupHandler:

    def __init__(self, sm_session, name: str, sm_role, client_sm):
        self.sagemaker_session = sm_session
        self.client_sm_fs = self.create_client_sm_fs(sm_session)
        self.client_sm = client_sm
        self.name = name
        self.sm_role = sm_role
        self._feature_group = FeatureGroup(name=name, sagemaker_session=sm_session)

    @property
    def feature_group(self):
        return self._feature_group

    def create_client_sm_fs(self, sm_session):
        return sm_session.boto_session.client(
            service_name="sagemaker-featurestore-runtime",
            region_name=sm_session.boto_session.region_name
        )

    def check_if_exist(self) -> bool:
        feature_groups = self.client_sm.search(Resource="FeatureGroup")
        fg_names = [ft['FeatureGroup']['FeatureGroupName'] for ft in feature_groups['Results']]
        return self.name in fg_names

    def get_or_create_by_df(self, df: pd.DataFrame, bucket: str, prefix: str, ft_id_name: str, ft_time_name: str = "event_time"):
        if not self.check_if_exist():
            self._feature_group.load_feature_definitions(data_frame=df)
            self.create(bucket, prefix, ft_id_name, ft_time_name)
            self.wait_for_creation_complete()

    def create(self, bucket: str, prefix: str, ft_id_name: str, ft_time_name: str):
        self._feature_group.create(
            s3_uri=f's3://{bucket}/{prefix}/{self.name}', 
            record_identifier_name=ft_id_name, 
            event_time_feature_name=ft_time_name, 
            role_arn=self.sm_role, 
            enable_online_store=False,
            # table_format=TableFormatEnum.ICEBERG 
        )

    def wait_for_creation_complete(self):
        status = self._feature_group.describe().get('FeatureGroupStatus')
        while status == 'Creating':
            print(f'Waiting for feature group: {self._feature_group.name} to be created ...')
            time.sleep(5)
            status = self._feature_group.describe().get('FeatureGroupStatus')
        if status != 'Created':
            raise SystemExit(f'Failed to create feature group {self._feature_group.name}: {status}')
        print(f'FeatureGroup {self._feature_group.name} was successfully created.')

    def update_feature(self, ft_name: str, ft_desc: str):
        self._feature_group.update_feature_metadata(
            feature_name=ft_name,
            description=ft_desc,
        )

    def ingest_data(self, df: pd.DataFrame, max_workers: int = 10, wait: bool = True):
        self._feature_group.ingest(
            data_frame=df,
            max_workers=max_workers,
            wait=wait
        )

    def put_data(self, df: pd.DataFrame):
        try:
            feature_values_list = list()
            for index, row in df.iterrows():
                # Iterate through each column for the current row
                for column in df.columns:
                    feature_values = FeatureValue(
                        feature_name=str(column), value_as_string=str(row[column])
                    )
                    feature_values_list.append(feature_values)
                self._feature_group.put_record(record=feature_values_list)
                feature_values_list.clear()
            SuccessString = f"The dataframe with {len(df)} rows has been ingested successfully for feature group {feature_group.get_name()}"
            return SuccessString
        except Exception as e:
            # Handle any other exceptions
            print(f"An unexpected error occurred: {e}")
            return None
    
    def get_description(self):
        return self._feature_group.describe()

    def get_feature_description(self, ft_name: str):
        return self._feature_group.describe_feature_metadata(ft_name)

    def get_record(self, ft_name):
        return self.sagemaker_session.boto_session.client(
            service_name="sagemaker-featurestore-runtime",
            region_name=self.sagemaker_session.boto_session.region_name
        ).get_record(
            FeatureGroupName=self.name,
            RecordIdentifierValueAsString=str(ft_name)
        )

    def delete(self):
        self._feature_group.delete()


def _parse_args():
    parser = argparse.ArgumentParser()
    # Data, model, and output directories
    # model_dir is always passed in from SageMaker. By default this is a S3 path under the default bucket.
    parser.add_argument('--filepath', type=str, default='/opt/ml/processing/input/')
    parser.add_argument('--filename', type=str, default='bank-additional-full.csv')
    parser.add_argument('--outputpath', type=str, default='/opt/ml/processing/output/')
    parser.add_argument('--bucket_fs', type=str, default='sagemaker-us-east-1-891377318910')
    parser.add_argument('--prefix_fs', type=str, default='workshop/feature-store')
    parser.add_argument('--logger_level', type=str, default='INFO')
    return parser.parse_known_args()

def generate_event_timestamp():
    naive_dt = datetime.now()
    aware_dt = naive_dt.astimezone()
    utc_dt = aware_dt.astimezone(timezone.utc)
    event_time = utc_dt.isoformat(timespec='milliseconds')
    event_time = event_time.replace('+00:00', 'Z')
    return event_time

def convert_col_name(col):
    return col.replace('.', '_').replace('-', '_').rstrip('_')

def process_data(df_data):
    # Indicator variable to capture when pdays takes a value of 999
    df_data["no_previous_contact"] = np.where(df_data["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_data["not_working"] = np.where(
        np.in1d(df_data["job"], ["student", "retired", "unemployed"]), 1, 0
    )

    # remove unnecessary data
    df_model_data = df_data.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    bins = [18, 30, 40, 50, 60, 70, 90]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']

    df_model_data['age_range'] = pd.cut(df_model_data.age, bins, labels=labels, include_lowest=True)
    df_model_data = pd.concat([df_model_data, pd.get_dummies(df_model_data['age_range'], prefix='age', dtype=int)], axis=1)
    df_model_data.drop('age', axis=1, inplace=True)
    df_model_data.drop('age_range', axis=1, inplace=True)

    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat([
        df_model_data["y_yes"].rename(target_col),
        df_model_data.drop(["y_no", "y_yes"], axis=1),
    ], axis=1)
    
    return df_model_data


if __name__=="__main__":
    args, _ = _parse_args()

    logger = logging.getLogger("SagemakerStudio")
    logger.setLevel(logging.DEBUG)
    logger.addHandler(logging.StreamHandler())
    target_col = "y"

    logger.info("TESTANDO LOGIN")
    print("TESTANDO")
    region = boto3.Session().region_name
    if region is None:
        region = "us-east-1"
    boto_session = boto3.Session(region_name=region)
    os.environ['AWS_DEFAULT_REGION'] = region

    subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker==2.135.0"])
    import sagemaker
    from sagemaker import get_execution_role
    from sagemaker.session import Session
    from sagemaker.feature_store.feature_group import FeatureGroup
    from sagemaker.feature_store.inputs import FeatureParameter, TableFormatEnum
    from sagemaker.feature_store.feature_definition import StringFeatureDefinition

    sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
    featurestore_runtime = boto_session.client(service_name="sagemaker-featurestore-runtime", region_name=region)

    sagemaker_session = Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_featurestore_runtime_client=featurestore_runtime,
    )

    # sagemaker_session = sagemaker.Session()
    sagemaker_role = sagemaker.get_execution_role()
    feature_group_name = "itau-feature-group"

    # process data
    df_model_data = process_data(pd.read_csv(os.path.join(args.filepath, args.filename), sep=";"))

    df_train, df_temp = train_test_split(df_model_data, test_size=0.3, random_state=42)
    df_valid, df_test = train_test_split(df_temp, test_size=0.5, random_state=42) 
    df_model_data['set'] = 'train'
    df_model_data.loc[df_valid.index, 'set'] = 'validation'
    df_model_data.loc[df_test.index, 'set'] = 'test'

    del df_train
    del df_temp
    del df_valid
    del df_test
    
    df_model_data['event_time'] = generate_event_timestamp()
    df_model_data['record_id'] = [f'R{i}' for i in range(len(df_model_data))]
    df_model_data = df_model_data.rename(columns=convert_col_name)
    df_model_data = df_model_data.convert_dtypes(infer_objects=True, convert_boolean=False)
    df_model_data['record_id'] = df_model_data['record_id'].astype('string')
    df_model_data['event_time'] = df_model_data['event_time'].astype('string')
    df_model_data=df_model_data.astype(str) # Nao faca isso

    feature_group = FeatureGroupHandler(
        sm_session = sagemaker_session,
        name = feature_group_name,
        sm_role = sagemaker_role,
        client_sm = sagemaker_client,
    )

    feature_group.get_or_create_by_df(
        df=df_model_data,
        bucket=args.bucket_fs,
        prefix=args.prefix_fs,
        ft_id_name="record_id",
        ft_time_name="event_time"
    )

    feature_group.ingest_data(df_model_data)

    print("## Processing complete. Exiting.")

Overwriting code/preprocessing_fs.py


In [76]:
s3_data_transformed_prefix = "data/transformed"

train_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/train"
validation_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/validation"
test_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/test"
baseline_s3_url = f"s3://{bucket_name}/{s3_prefix}/{s3_data_transformed_prefix}/baseline"

In [77]:
print(f"{train_s3_url = }")
print(f"{validation_s3_url = }")
print(f"{test_s3_url = }")
print(f"{baseline_s3_url = }")

print(f"{input_s3_url = }")

train_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop_v2/data/transformed/train'
validation_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop_v2/data/transformed/validation'
test_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop_v2/data/transformed/test'
baseline_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop_v2/data/transformed/baseline'
input_s3_url = 's3://sagemaker-us-east-1-891377318910/workshop_v2/data/raw/bank-additional-full.csv'


In [78]:
skprocessor_framework_version = "1.2-1"
processing_instance_type = "ml.m5.large"
processing_instance_count = 1

In [81]:
sklearn_processor = SKLearnProcessor(
    framework_version=skprocessor_framework_version,
    role=sagemaker_role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name='itau-processing',
    sagemaker_session=sagemaker_session,
)

processing_inputs = [
    ProcessingInput(
        source=input_s3_url,
        destination="/opt/ml/processing/input",
        s3_input_mode="File",
        s3_data_distribution_type="ShardedByS3Key"
    )
]

processing_outputs = [
    ProcessingOutput(
        output_name="train_data",
        source="/opt/ml/processing/output/train",
        destination=train_s3_url,
    ),
    ProcessingOutput(
        output_name="validation_data",
        source="/opt/ml/processing/output/validation",
        destination=validation_s3_url
    ),
    ProcessingOutput(
        output_name="test_data",
        source="/opt/ml/processing/output/test",
        destination=test_s3_url
    ),
    ProcessingOutput(
        output_name="baseline_data",
        source="/opt/ml/processing/output/baseline",
        destination=baseline_s3_url
    ),
]

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [82]:
try:
    sklearn_processor.run(
        inputs=processing_inputs,
        outputs=processing_outputs,
        code='code/preprocessing_fs.py',
        wait=False,
        # experiment_config=experiment_config,
        # arguments = ['arg1', 'arg2'],
    )
    count_time = 0
    time_spleep = 20
    while client_sagemaker.describe_processing_job(
        ProcessingJobName=sklearn_processor._current_job_name
    )["ProcessingJobStatus"] != "Completed":
        time.sleep(time_spleep)
        count_time += time_spleep
        print(f"Wait until {sklearn_processor._current_job_name} completed | Time running: {seconds_to_min(count_time)} min")
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == 'AccessDeniedException':
        print(f"Ignore AccessDeniedException: {e.response['Error']['Message']} because of the slow resource tag auto propagation")
    else:
        raise e
finally:
    print("Finished!")

INFO:sagemaker:Creating processing-job with name itau-processing-2024-10-13-19-30-25-494


Wait until itau-processing-2024-10-13-19-30-25-494 completed | Time running: 00:20 min
Wait until itau-processing-2024-10-13-19-30-25-494 completed | Time running: 00:40 min
Wait until itau-processing-2024-10-13-19-30-25-494 completed | Time running: 01:00 min
Wait until itau-processing-2024-10-13-19-30-25-494 completed | Time running: 01:20 min
Wait until itau-processing-2024-10-13-19-30-25-494 completed | Time running: 01:40 min
Wait until itau-processing-2024-10-13-19-30-25-494 completed | Time running: 02:00 min
Wait until itau-processing-2024-10-13-19-30-25-494 completed | Time running: 02:20 min
Wait until itau-processing-2024-10-13-19-30-25-494 completed | Time running: 02:40 min
Wait until itau-processing-2024-10-13-19-30-25-494 completed | Time running: 03:00 min
Wait until itau-processing-2024-10-13-19-30-25-494 completed | Time running: 03:20 min
Wait until itau-processing-2024-10-13-19-30-25-494 completed | Time running: 03:40 min
Wait until itau-processing-2024-10-13-19-30