In [1]:
!pip install joblib

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
import boto3
import sagemaker
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [3]:
# session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# create an S3 bucket
bucket = sagemaker_session.default_bucket()
print(bucket)

sagemaker-us-east-1-077458658795


In [26]:
# load data

data_dir = 'capstone_data'

prefix = 'capstone_project'

# already ran
# upload all data to S3
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

In [7]:
# confirm that data is in S3 bucket
empty_check = []
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    empty_check.append(obj.key)
    print(obj.key)

assert len(empty_check) !=0, 'S3 bucket is empty.'
print('Test passed!')

capstone_project/test_lstm.csv
capstone_project/train_lstm.csv
Test passed!


In [35]:
# create model
from sagemaker.pytorch import PyTorch
model = PyTorch(entry_point='train.py',
                source_dir='lstm',
                role=role,
                train_instance_count=1, 
                train_instance_type='ml.c4.xlarge',
                sagemaker_session=sagemaker_session,
                framework_version='1.8.1',
                py_version='py3',
                hyperparameters={
                                 'epochs': 10,
                                 'hidden_dim': 8,
                                }
               )

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [39]:
%%time

# Train your estimator on S3 training data
model.fit({'train': f's3://{bucket}/{prefix}'})

2021-07-22 06:23:44 Starting - Starting the training job...
2021-07-22 06:24:09 Starting - Launching requested ML instancesProfilerReport-1626935024: InProgress
...
2021-07-22 06:24:43 Starting - Preparing the instances for training............
2021-07-22 06:26:36 Downloading - Downloading input data...
2021-07-22 06:27:10 Training - Downloading the training image..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-07-22 06:27:21,859 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-07-22 06:27:21,861 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-22 06:27:21,871 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-07-22 06:27:24,957 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-07-22 06:27:25,557 s


2021-07-22 06:27:39 Uploading - Uploading generated training model
2021-07-22 06:27:39 Failed - Training job failed


UnexpectedStatusException: Error for Training job pytorch-training-2021-07-22-06-23-44-364: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python3.6 train.py --epochs 100 --hidden_dim 8"
Traceback (most recent call last):
  File "train.py", line 128, in <module>
    train(model, train_loader, args.epochs, criterion, optimizer, device)
  File "train.py", line 82, in train
    y_pred = model(batch_x)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 918, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/ml/code/model.py", line 19, in forward
    lstm_out, _ = self.lstm(x)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 918, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/rnn.py", line 659, in forward
    self.check_forward_args(input, hx, batch_sizes)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/rnn.py", line 605, in check_forward_args
    self.check_input(input, batch_sizes)
  File "/opt/conda/lib/python3.6/s

In [None]:
%%time

# uncomment, if needed
# from sagemaker.pytorch import PyTorchModel


# deploy your model to create a predictor
predictor = model.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

In [None]:
# test model

import os

# read in test data, assuming it is stored locally
test_lstm = pd.read_csv(os.path.join(data_dir, "test_lstm.csv"), header=None, names=None)

# labels are in the first column
test_y = test_lstm.iloc[:,0]
test_x = test_lstm.iloc[:,1:]

In [None]:
test_y_preds = predictor.predict(test_x)

In [None]:
rmse = mean_squared_error(test_lstm[0], test_y_preds, squared=False)
print(rmse)

In [None]:
test_lstm[0].plot()
test_y_preds.plot()