# Install dependencies

In [None]:
# Install/upgrade the packages listed in requirements.txt; `%pip` (rather than `!pip`) targets the running kernel's environment
%pip install -U -r requirements.txt

# Import SageMaker Defaults Configurations

The Amazon SageMaker Python SDK supports setting default values for AWS infrastructure primitive types, such as instance types, Amazon S3 folder locations, and IAM roles, through configuration files. You can override the default location of the user-defined configuration file by setting the `SAGEMAKER_USER_CONFIG_OVERRIDE` environment variable.

In [None]:
import os

# Point the SageMaker Python SDK at the current working directory when it looks up the user config file
os.environ.update(SAGEMAKER_USER_CONFIG_OVERRIDE=os.getcwd())

# Download dataset

Download the AI4I 2020 predictive maintenance dataset from the UCI Machine Learning Repository.

In [None]:
import os
# `import urllib` alone does not guarantee that the `urllib.request` submodule is
# loaded; import the submodule explicitly so `urlretrieve` is always available.
import urllib.request

# Local folder and file name used to store the raw CSV
input_data_dir = 'data/'
# exist_ok=True makes the cell safe to re-run when the folder already exists
os.makedirs(input_data_dir, exist_ok=True)
input_data_path = os.path.join(input_data_dir, 'predictive_maintenance_raw_data_header.csv')

# Source CSV and download into the local path defined above
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00601/ai4i2020.csv"
urllib.request.urlretrieve(dataset_url, input_data_path)

In [None]:
import warnings

import pandas as pd

# Suppress warnings for the rest of the session to keep the cell outputs readable
warnings.filterwarnings("ignore")

# Read the CSV downloaded in the previous step into a DataFrame
df = pd.read_csv(input_data_path)

# Report (rows, columns) of the loaded data
print('The shape of the dataset is:', df.shape)

# Test case 1: Run a SageMaker Job without Networking configurations

## Expected result: The job should fail

In [None]:
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sagemaker.remote_function import remote

@remote(job_name_prefix="amzn-sm-btd-preprocess")
def preprocess(df):
    """Split the dataset into train/validation/test sets and fit a feature transformer.

    This definition is decorated with ``@remote`` without any networking
    arguments (no subnets or security groups are passed to the decorator).

    Args:
        df: DataFrame containing the feature columns and the
            'Machine failure' target column selected below.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, featurizer_model)``.
        ``X_train`` and ``X_val`` are the outputs of the fitted transformer,
        ``y_train`` and ``y_val`` are flattened value arrays, ``X_test`` and
        ``y_test`` are returned exactly as produced by the split (no
        transformation is applied to them here), and ``featurizer_model`` is
        the fitted ``ColumnTransformer``.
    """
    target_column = 'Machine failure'
    cat_columns = ['Type']
    num_columns = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
    # Categorical + numeric + target, in that order, is the full set of columns kept
    columns = cat_columns + num_columns + [target_column]

    training_ratio, validation_ratio, test_ratio = 0.8, 0.1, 0.1

    selected = df[columns]
    X = selected.drop(columns=[target_column])
    y = selected[target_column]

    print(f'Splitting data training ({training_ratio}), validation ({validation_ratio}), and test ({test_ratio}) sets ')

    # First carve out the test set, stratified on the target
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=0, stratify=y
    )
    # The validation share is expressed relative to what is left after removing the test set
    val_fraction = validation_ratio / (validation_ratio + training_ratio)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_fraction, random_state=2, stratify=y_rest
    )

    # Fit the transformer on the training features only, then apply it to train and validation
    featurizer_model = ColumnTransformer(
        transformers=[
            ('numeric', StandardScaler(), num_columns),
            ('categorical', OneHotEncoder(), cat_columns),
        ],
        remainder='passthrough',
    ).fit(X_train)
    X_train = featurizer_model.transform(X_train)
    X_val = featurizer_model.transform(X_val)

    # Note: X_test is reported (and returned) without being transformed in this function
    print(f'Shape of train features after preprocessing: {X_train.shape}')
    print(f'Shape of validation features after preprocessing: {X_val.shape}')
    print(f'Shape of test features after preprocessing: {X_test.shape}')

    # Flatten the train/validation labels into 1-D arrays
    y_train, y_val = y_train.values.reshape(-1), y_val.values.reshape(-1)

    print(f'Shape of train labels after preprocessing: {y_train.shape}')
    print(f'Shape of validation labels after preprocessing: {y_val.shape}')
    print(f'Shape of test labels after preprocessing: {y_test.shape}')

    # Persist the fitted transformer, creating the target folder when missing
    featurizer_path = "/opt/ml/model/sklearn_model.joblib"
    os.makedirs(os.path.dirname(featurizer_path), exist_ok=True)
    joblib.dump(featurizer_model, featurizer_path)

    return X_train, y_train, X_val, y_val, X_test, y_test, featurizer_model

In [None]:
# Test case 1 expects this call to fail. Catch and report the error so that
# "Restart & Run All" can continue to the next test case instead of stopping here.
try:
    X_train, y_train, X_val, y_val, X_test, y_test, featurizer_model = preprocess(df)
except Exception as error:
    # The failure is the demonstration: show what was raised and move on
    print(f'Job failed as expected: {type(error).__name__}: {error}')
else:
    # A successful run contradicts the expected result of this test case
    print('WARNING: the job succeeded, but it was expected to fail without networking configurations')

# Test case 2: Run a SageMaker Job with Networking configurations

## Expected result: The job should be successfully executed

## Subnet and security group definition

#### Important: Populate `subnets` and `security_group_ids` with the values exported by the CloudFormation template

In [None]:
# Placeholders to fill in before running the following cells: both lists are
# passed to the `@remote` decorator of the Test case 2 `preprocess` function.
# Presumably lists of ID strings (subnet IDs / security group IDs) - confirm
# against the CloudFormation outputs.
subnets = []
security_group_ids = []

In [None]:
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sagemaker.remote_function import remote

@remote(job_name_prefix="amzn-sm-btd-preprocess", subnets=subnets, security_group_ids=security_group_ids)
def preprocess(df):
    """Split the dataset into train/validation/test sets and fit a feature transformer.

    This definition is decorated with ``@remote`` and passes the notebook's
    ``subnets`` and ``security_group_ids`` lists to the decorator.

    Args:
        df: DataFrame containing the feature columns and the
            'Machine failure' target column selected below.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, featurizer_model)``.
        ``X_train`` and ``X_val`` are the outputs of the fitted transformer,
        ``y_train`` and ``y_val`` are flattened value arrays, ``X_test`` and
        ``y_test`` are returned exactly as produced by the split (no
        transformation is applied to them here), and ``featurizer_model`` is
        the fitted ``ColumnTransformer``.
    """
    target_column = 'Machine failure'
    cat_columns = ['Type']
    num_columns = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
    # Categorical + numeric + target, in that order, is the full set of columns kept
    columns = cat_columns + num_columns + [target_column]

    training_ratio, validation_ratio, test_ratio = 0.8, 0.1, 0.1

    selected = df[columns]
    X = selected.drop(columns=[target_column])
    y = selected[target_column]

    print(f'Splitting data training ({training_ratio}), validation ({validation_ratio}), and test ({test_ratio}) sets ')

    # First carve out the test set, stratified on the target
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=0, stratify=y
    )
    # The validation share is expressed relative to what is left after removing the test set
    val_fraction = validation_ratio / (validation_ratio + training_ratio)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_fraction, random_state=2, stratify=y_rest
    )

    # Fit the transformer on the training features only, then apply it to train and validation
    featurizer_model = ColumnTransformer(
        transformers=[
            ('numeric', StandardScaler(), num_columns),
            ('categorical', OneHotEncoder(), cat_columns),
        ],
        remainder='passthrough',
    ).fit(X_train)
    X_train = featurizer_model.transform(X_train)
    X_val = featurizer_model.transform(X_val)

    # Note: X_test is reported (and returned) without being transformed in this function
    print(f'Shape of train features after preprocessing: {X_train.shape}')
    print(f'Shape of validation features after preprocessing: {X_val.shape}')
    print(f'Shape of test features after preprocessing: {X_test.shape}')

    # Flatten the train/validation labels into 1-D arrays
    y_train, y_val = y_train.values.reshape(-1), y_val.values.reshape(-1)

    print(f'Shape of train labels after preprocessing: {y_train.shape}')
    print(f'Shape of validation labels after preprocessing: {y_val.shape}')
    print(f'Shape of test labels after preprocessing: {y_test.shape}')

    # Persist the fitted transformer, creating the target folder when missing
    featurizer_path = "/opt/ml/model/sklearn_model.joblib"
    os.makedirs(os.path.dirname(featurizer_path), exist_ok=True)
    joblib.dump(featurizer_model, featurizer_path)

    return X_train, y_train, X_val, y_val, X_test, y_test, featurizer_model

In [None]:
# Test case 2: invoke the `preprocess` definition that was decorated with the
# subnets and security groups; this run is expected to complete successfully.
X_train, y_train, X_val, y_val, X_test, y_test, featurizer_model = preprocess(df)