### Setup

In [2]:
!pip3 install -U sagemaker

Collecting sagemaker
  Using cached sagemaker-2.232.1-py3-none-any.whl.metadata (16 kB)
Collecting boto3<2.0,>=1.34.142 (from sagemaker)
  Downloading boto3-1.35.32-py3-none-any.whl.metadata (6.6 kB)
Collecting sagemaker-core<2.0.0,>=1.0.0 (from sagemaker)
  Using cached sagemaker_core-1.0.9-py3-none-any.whl.metadata (4.9 kB)
Collecting botocore<1.36.0,>=1.35.32 (from boto3<2.0,>=1.34.142->sagemaker)
  Downloading botocore-1.35.32-py3-none-any.whl.metadata (5.6 kB)
Collecting mock<5.0,>4.0 (from sagemaker-core<2.0.0,>=1.0.0->sagemaker)
  Using cached mock-4.0.3-py3-none-any.whl.metadata (2.8 kB)
Using cached sagemaker-2.232.1-py3-none-any.whl (1.6 MB)
Downloading boto3-1.35.32-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hUsing cached sagemaker_core-1.0.9-py3-none-any.whl (384 kB)
Downloading botocore-1.35.32-py3-none-any.whl (12.6 MB)
[2K   [90m━━━━━━━━━━━━━━━

In [4]:
import boto3

# Setting up the S3 client and bucket details
bucket_name = 'sagemaker-us-east-1-807494057176'
file_key = 'root/AAI-540_Predictive-Maintenance-for-Pharmaceutical-Manufacturing-Equipment/predictive_maintenance_dataset.csv'

# Loading the dataset from S3
s3_client = boto3.client('s3')
response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
df = pd.read_csv(response['Body'])

# Displaying the first few rows to verify the dataset
df.head()

Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,1/1/2015,S1F01085,0,215630672,55,0,52,6,407438,0,0,7
1,1/1/2015,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,1/1/2015,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,1/1/2015,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,1/1/2015,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


### Train, Test, Validate and Production data

Let's split the data as follows: 40% for training, 10% for validation, 10% for testing, and set 40% aside for our production dataset. We'll drop the 'device' field from the training, validation, and testing sets, as it is not a useful feature for training purposes. For our production set, however, we keep the 'device' feature. We may want to filter it out prior to running our inferences so that the input data features match those of the training set, and later use it to join with the inference results.

In [None]:
import pandas as pd
import numpy as np

# Random split for the data 
rand_split = np.random.rand(len(df))

# Defining the masks for each split
train_mask = rand_split < 0.4
val_mask = (rand_split >= 0.4) & (rand_split < 0.5)
test_mask = (rand_split >= 0.5) & (rand_split < 0.6)
prod_mask = rand_split >= 0.6

# Creating the datasets
data_train = df[train_mask].drop(["device"], axis=1)  # Drop 'device' column for training
data_val = df[val_mask].drop(["device"], axis=1)      # Drop 'device' column for validation
data_test = df[test_mask].drop(["device"], axis=1)    # Drop 'device' column for testing
data_prod = df[prod_mask].drop(["device"], axis=1)    # Drop 'device' column for production

# Outputting the shapes of the splits to verify
(data_train.shape, data_val.shape, data_test.shape, data_prod.shape)

In [8]:
import sagemaker

# Initialize SageMaker session
sess = sagemaker.Session()

# Define the S3 bucket prefix based on your project
prefix = "predictive-maintenance-dataset"

# Define file names for each dataset
train_file = "train_data.csv"
validation_file = "validation_data.csv"
test_file = "test_data.csv"
prod_file = "prod_data.csv"

# Save each dataset to CSV and upload to S3
data_train.to_csv(train_file, index=False, header=False)
sess.upload_data(train_file, key_prefix="{}/train".format(prefix))

data_val.to_csv(validation_file, index=False, header=False)
sess.upload_data(validation_file, key_prefix="{}/validation".format(prefix))

data_test.to_csv(test_file, index=False, header=False)
sess.upload_data(test_file, key_prefix="{}/test".format(prefix))

data_prod.to_csv(prod_file, index=False, header=False)
sess.upload_data(prod_file, key_prefix="{}/prod".format(prefix))

's3://sagemaker-us-east-1-807494057176/predictive-maintenance-dataset/prod/prod_data.csv'

### Set up a benchmark model

Prepare the Data

In [9]:
# Selecting two features for a simple benchmark model
features = ['metric1', 'metric2', 'failure']
train_benchmark = data_train[features]

# Save the benchmark dataset to CSV
benchmark_train_file = "train_benchmark.csv"
train_benchmark.to_csv(benchmark_train_file, index=False, header=True)

Upload Benchmark Dataset to S3

In [16]:
# Upload the benchmark dataset to S3
sess.upload_data(benchmark_train_file, key_prefix="{}/benchmark/train".format(prefix))

's3://sagemaker-us-east-1-807494057176/predictive-maintenance-dataset/benchmark/train/train_benchmark.csv'

Provide Training Input for SageMaker

In [17]:
from sagemaker.inputs import TrainingInput

train_input = TrainingInput(s3_input_train, content_type='csv')

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assume data_train is available and contains the dataset
features = ['metric1', 'metric2']
target = 'failure'

# Prepare the features and target
X = data_train[features].to_numpy()
y = data_train[target].to_numpy()

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Validate the model
y_pred = rf_model.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_val, y_pred))


Validation Accuracy: 0.9983985587028326
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9980
           1       0.00      0.00      0.00        11

    accuracy                           1.00      9991
   macro avg       0.50      0.50      0.50      9991
weighted avg       1.00      1.00      1.00      9991



In [25]:
import joblib

# Save the model to a file
model_file = 'random_forest_model.pkl'
joblib.dump(rf_model, model_file)

['random_forest_model.pkl']

In [26]:
import boto3

# Upload the model to S3
s3_model_key = "{}/models/{}".format(prefix, model_file)
s3_model_path = 's3://{}/{}'.format(bucket, s3_model_key)
boto3.client('s3').upload_file(model_file, bucket, s3_model_key)

print("Model uploaded to:", s3_model_path)

Model uploaded to: s3://sagemaker-us-east-1-807494057176/predictive-maintenance-dataset/models/random_forest_model.pkl


Step 1: Create a .tar.gz Archive

In [34]:
import tarfile

# Add inference.py to the tar.gz archive
model_tar_file = 'random_forest_model.tar.gz'
with tarfile.open(model_tar_file, mode='w:gz') as tar:
    tar.add('random_forest_model.pkl', arcname='random_forest_model.pkl')
    tar.add('inference.py', arcname='inference.py')


Step 2: Upload the .tar.gz Archive to S3

In [35]:
# Upload the tar.gz archive to S3
s3_model_key = "{}/models/{}".format(prefix, model_tar_file)
s3_model_path = 's3://{}/{}'.format(bucket, s3_model_key)
boto3.client('s3').upload_file(model_tar_file, bucket, s3_model_key)

print("Model archive uploaded to:", s3_model_path)

Model archive uploaded to: s3://sagemaker-us-east-1-807494057176/predictive-maintenance-dataset/models/random_forest_model.tar.gz


Step 3: Deploy the Model Using the .tar.gz Archive

In [37]:
from sagemaker.sklearn import SKLearnModel

# Create a SageMaker SKLearnModel
sklearn_model = SKLearnModel(
    model_data=s3_model_path,  # Use the path to the tar.gz archive
    role=role,
    entry_point='inference.py',  # This should contain the inference logic
    framework_version='1.0-1',
    py_version='py3',
    sagemaker_session=sess
)

# Deploy the model with a valid instance type
sklearn_predictor = sklearn_model.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
    wait=False  # This allows the script to continue even if the deployment fails
)


print("Model deployed. Endpoint name:", sklearn_predictor.endpoint_name)

Model deployed. Endpoint name: sagemaker-scikit-learn-2024-10-03-01-13-42-399


Monitor Endpoint Status

In [38]:
import time
import boto3

sm_client = boto3.client('sagemaker')
endpoint_name = sklearn_predictor.endpoint_name

while True:
    response = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = response['EndpointStatus']
    print(f"Endpoint status: {status}")

    if status == 'InService':
        print("Endpoint is successfully in service!")
        break
    elif status == 'Failed':
        print(f"Endpoint deployment failed. Reason: {response['FailureReason']}")
        break

    time.sleep(30)  # Wait for 30 seconds before checking the status again


Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint status: Creating
Endpoint sta

In [39]:
import json

# Prepare payload for prediction
input_data = '10,20'  # Example input features in CSV format (metric1, metric2)
response = sklearn_predictor.predict(input_data)
print(f"Prediction result: {response}")

ValidationError: An error occurred (ValidationError) when calling the InvokeEndpoint operation: Endpoint sagemaker-scikit-learn-2024-10-03-01-13-42-399 of account 807494057176 not found.