Data Prep

In [38]:
# data 
import pandas as pd 
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

%matplotlib inline

In [39]:
# set data params
np.random.seed(0)
num_pts = 600
noise_val = 0.25

# generate data
# X = 2D points, Y = class labels (0 or 1)
X, Y = make_moons(num_pts, noise=noise_val)

# Split into test and training data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
    test_size=0.25, random_state=1)

In [None]:
# plot
# points are colored by class, Y_train
# 0 labels = purple, 1 = yellow
plt.figure(figsize=(8,5))
plt.scatter(X_train[:,0], X_train[:,1], c=Y_train)
plt.title('Moon Data')
plt.show()

In [None]:
!mkdir train_data

In [None]:
pd.concat([pd.DataFrame(Y_train), pd.DataFrame(X_train)], axis=1)\
    .to_csv(os.path.join('train_data', 'train.csv'), header=False, index=False)

In [1]:
import sagemaker

In [8]:
session=sagemaker.Session()
role=sagemaker.get_execution_role()

In [15]:
bucket=session.default_bucket()
prefix='pytorch_script_mode/train_data'
s3_train_data=session.upload_data('train_data',bucket=bucket,key_prefix=prefix)

In [27]:
from sagemaker.pytorch import PyTorch

In [30]:
pytorch=PyTorch(entry_point='train.py',
                source_dir='pytorch_scripts',
                instance_count=1,
                instance_type='ml.m5.large',
                role=role,
                framework_version='1.0',
                py_version='py3',
                use_spot_instances=True,
                max_wait=3600,
                max_run=3600,
                session=session,
                hyperparameters={
                    'input_dim': 2,  # num of features
                        'hidden_dim': 20,
                        'output_dim': 1,
                        'epochs': 2 # could change to higher
                })

In [31]:
pytorch.fit({'train':s3_train_data})

2021-05-02 02:17:49 Starting - Starting the training job...
2021-05-02 02:18:19 Starting - Launching requested ML instancesProfilerReport-1619921869: InProgress
......
2021-05-02 02:19:19 Starting - Preparing the instances for training......
2021-05-02 02:20:19 Downloading - Downloading input data...
2021-05-02 02:20:50 Training - Training image download completed. Training in progress.
2021-05-02 02:20:50 Uploading - Uploading generated training model
2021-05-02 02:20:50 Completed - Training job completed
[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-05-02 02:20:38,332 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-05-02 02:20:38,336 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-05-02 02:20:38,347 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m202

In [32]:
pytorch.model_data

's3://sagemaker-us-east-1-032934527328/sagemaker-pytorch-2021-05-02-02-17-48-398/output/model.tar.gz'

In [43]:
# This object's predict method fails - its expecting a float but receives a double.. This is 
# probably because its expecting us to define the predict_fn where the invoked request will be
# converted to input that the Pytorch model likes

# Thus trying an alternative approach below 
#Load the model from tar.gz, along with providing a script predict.py that contains the predict_fn
# lets see if defining the predict function solves the issue
pytorch_endp=pytorch.deploy(initial_instance_count=1,instance_type='ml.m5.large')

-------------!

In [34]:
from sagemaker.pytorch import PyTorchModel

In [46]:
pytorch_loaded_model=PyTorchModel(model_data=pytorch.model_data,
                     role = role,
                     framework_version='1.0',
                    py_version='py3',
                     entry_point='predict.py',
                     source_dir='pytorch_scripts')

In [47]:
pytorch_endp_loaded_model=pytorch_loaded_model.deploy(initial_instance_count=1,instance_type='ml.m5.large')

-----------------------------*

UnexpectedStatusException: Error hosting endpoint sagemaker-pytorch-2021-05-02-02-56-15-772: Failed. Reason:  The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint..

In [36]:
def evaluate(predictor, test_features, test_labels, verbose=True):
    """
    Evaluate a model on a test set given the prediction endpoint.  
    Return binary classification metrics.
    :param predictor: A prediction endpoint
    :param test_features: Test features
    :param test_labels: Class labels for test data
    :param verbose: If True, prints a table of all performance metrics
    :return: A dictionary of performance metrics.
    """
    
    # rounding and squeezing array
    test_preds = np.squeeze(np.round(predictor.predict(test_features)))
    
    # calculate true positives, false positives, true negatives, false negatives
    tp = np.logical_and(test_labels, test_preds).sum()
    fp = np.logical_and(1-test_labels, test_preds).sum()
    tn = np.logical_and(1-test_labels, 1-test_preds).sum()
    fn = np.logical_and(test_labels, 1-test_preds).sum()
    
    # calculate binary classification metrics
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    
    # print metrics
    if verbose:
        print(pd.crosstab(test_labels, test_preds, rownames=['actuals'], colnames=['predictions']))
        print("\n{:<11} {:.3f}".format('Recall:', recall))
        print("{:<11} {:.3f}".format('Precision:', precision))
        print("{:<11} {:.3f}".format('Accuracy:', accuracy))
        print()
        
    return {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn, 
            'Precision': precision, 'Recall': recall, 'Accuracy': accuracy}




In [44]:
metrics = evaluate(pytorch_endp2, X_test, Y_test, True)

AttributeError: 'PyTorchModel' object has no attribute 'predict'

In [40]:
metrics = evaluate(pytorch_endp, X_test, Y_test, True)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from model with message "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/sagemaker-pytorch-2021-05-02-02-29-39-199 in account 032934527328 for more information.

In [24]:
!ls pytorch_scripts

model.py  train.py
