-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Description
Hi, I developed a project using Python 3.7 but have since downgraded to Python 3.6 to make it compatible with SageMaker. I can run the project using a Python 3.6 virtual environment, but when I try to train a model in script mode using the code below, SageMaker throws the following error:
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2. 's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2. 'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2. Creating tmp291cqf55_algo-1-fj9uv_1 ... Attaching to tmp291cqf55_algo-1-fj9uv_12mdone algo-1-fj9uv_1 | 2020-07-13 15:06:21,068 sagemaker-containers INFO Imported framework sagemaker_tensorflow_container.training algo-1-fj9uv_1 | 2020-07-13 15:06:21,075 sagemaker-containers INFO No GPUs detected (normal if no gpus installed) algo-1-fj9uv_1 | 2020-07-13 15:06:21,510 sagemaker-containers INFO Installing module with the following command: algo-1-fj9uv_1 | /usr/bin/python3 -m pip install . -r requirements.txt algo-1-fj9uv_1 | Processing /opt/ml/code algo-1-fj9uv_1 | ERROR: Package 'multicat-churn' requires a different Python: 3.6.9 not in '>=3.7.0' algo-1-fj9uv_1 | 2020-07-13 15:06:22,580 sagemaker-containers ERROR InstallModuleError: algo-1-fj9uv_1 | Command "/usr/bin/python3 -m pip install . -r requirements.txt" tmp291cqf55_algo-1-fj9uv_1 exited with code 1 Aborting on container exit...
The error is in this line:
algo-1-fj9uv_1 | ERROR: Package 'multicat-churn' requires a different Python: 3.6.9 not in '>=3.7.0'
This is really difficult to debug because it doesn't say why it thinks that the project requires Python 3.7. I have looked around the code countless times and cannot figure out why it is doing this. I will link my train.py code below and the code I use to start the training job.
Train.py
import argparse
import numpy as np
import os
import tensorflow as tf
import joblib
from tensorflow.keras.optimizers import Adam
from multicat_churn.src.project_specific_utils.modelling import get_model, get_train_test_data
from multicat_churn.src.project_specific_utils.general import load_file, fs
# Silence TensorFlow's C++ backend logging ('3' = show errors only).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
def parse_args():
    """Parse SageMaker-injected defaults and tunable hyperparameters.

    SageMaker exposes its runtime configuration through ``SM_*`` environment
    variables; each ``default=os.environ[...]`` below is resolved at call
    time, so all four variables must be set before this function runs.

    Returns:
        tuple[argparse.Namespace, list[str]]: the recognized arguments and
        any unrecognized command-line tokens (``parse_known_args`` tolerates
        extras that SageMaker may append).

    Raises:
        KeyError: if any required ``SM_*`` environment variable is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu-count', type=int, default=os.environ['SM_NUM_GPUS'])

    # Data directories injected by SageMaker.
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--training', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    # BUG FIX: the env-var name was misspelled 'SM_CHANNEL_TEWST', which made
    # every call raise KeyError. The channel named 'test' in the estimator's
    # inputs dict is exposed as SM_CHANNEL_TEST.
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    # Tunable hyperparameters.
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--learning-rate', type=float, default=0.01)
    parser.add_argument('--batch-size', type=int, default=128)
    # NOTE(review): '--lr' duplicates '--learning-rate' with a different
    # default and is never read by the training code — confirm whether it can
    # be dropped (kept here so the CLI interface is unchanged).
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--use_batch_norm', type=int, default=0)
    parser.add_argument('--use_dropout', type=int, default=1)
    parser.add_argument('--use_cnn', type=int, default=1)
    parser.add_argument('--use_l2_regularizer', type=int, default=1)
    parser.add_argument('--cnn_kernel_size', type=int, default=3)
    parser.add_argument('--cnn_pool_size', type=int, default=3)
    parser.add_argument('--dropout_rate', type=float, default=0.3)
    parser.add_argument('--loss', type=str, default='squared_hinge')
    parser.add_argument('--width', type=int, default=32)
    parser.add_argument('--depth', type=int, default=2)

    # Dimensionality and masking (no defaults for shape parameters; they are
    # None unless supplied).
    parser.add_argument('--time_steps', type=int)
    parser.add_argument('--features', type=int)
    parser.add_argument('--stock_features', type=int)
    parser.add_argument('--mask_value', type=int, default=-1234)

    return parser.parse_known_args()
def get_train_data(train_dir):
    """Load the (x_train, y_train) arrays from a joblib file on S3.

    Args:
        train_dir: full ``s3://bucket/key`` URI of the joblib dump.

    Returns:
        tuple: ``(x_train, y_train)`` as stored in the file — presumably
        numpy arrays, since ``.shape`` is printed. TODO confirm.
    """
    # BUG FIX: the original opened the S3 object but discarded the handle
    # ('with fs.open(train_dir):') and then passed the bucket-stripped path
    # string to joblib.load, which would look for a *local* file of that name.
    # Load from the open file handle instead.
    with fs.open(train_dir) as f:
        x_train, y_train = joblib.load(f)
    print('x train', x_train.shape, 'y train', y_train.shape)
    return x_train, y_train
def get_test_data(test_dir):
    """Load the (x_test, y_test) arrays from a joblib file on S3.

    Args:
        test_dir: full ``s3://bucket/key`` URI of the joblib dump.

    Returns:
        tuple: ``(x_test, y_test)`` as stored in the file.
    """
    # BUG FIX: the original called joblib.load(test_dir.split('s3://')),
    # passing a *list* (split() returns a list, the [1] index was missing),
    # which raises TypeError. Load via the S3 filesystem handle, consistent
    # with get_train_data.
    with fs.open(test_dir) as f:
        x_test, y_test = joblib.load(f)
    print('x test', x_test.shape, 'y test', y_test.shape)
    return x_test, y_test
if __name__ == "__main__":
    # Entry point executed by the SageMaker training container: load data,
    # build the model, train, evaluate, and save in TF SavedModel layout.
    args, _ = parse_args()
    params = vars(args)

    # BUG FIX: the option is declared as '--training', so the parsed
    # attribute is 'args.training'; 'args.train' raised AttributeError.
    x_train, y_train = get_train_data(args.training)
    x_test, y_test = get_test_data(args.test)

    # Training log above shows no GPUs in the container, so pin to CPU.
    device = '/cpu:0'
    print(device)

    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    print('batch_size = {}, epochs = {}, learning rate = {}'.format(batch_size, epochs, learning_rate))

    with tf.device(device):
        model = get_model()
        # NOTE: removed an unused SGD optimizer local that the original built
        # and never referenced — the model is compiled with Adam below.
        model.compile(
            optimizer=Adam(lr=params['learning_rate']), loss=params['loss'],
            metrics=['accuracy',
                     tf.keras.metrics.Recall(name='recall'),
                     tf.keras.metrics.AUC(name='auc'),
                     tf.keras.metrics.Precision(name='precision'),
                     tf.keras.metrics.TruePositives(name='tp'),
                     tf.keras.metrics.FalsePositives(name='fp'),
                     tf.keras.metrics.TrueNegatives(name='tn'),
                     tf.keras.metrics.FalseNegatives(name='fn')])
        model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                  validation_data=(x_test, y_test))

        # Evaluate on the held-out test set.
        scores = model.evaluate(x_test, y_test, batch_size, verbose=2)
        print("\nTest Performance :", scores)

        # Save under a numeric version subdirectory, as TF Serving expects.
        model.save(args.model_dir + '/1')
Script used to run training job:
from sagemaker.tensorflow import TensorFlow
import sagemaker

# S3 prefix where trained model artifacts will be written.
model_dir = 's3://test-bucket-glovocds/models/test_model'
train_instance_type = 'local'  # local mode: trains in a Docker container on this host
hyperparameters = {'epochs': 5, 'batch_size': 128, 'learning_rate': 0.01}
# NOTE(review): everything under source_dir is shipped into the container and
# installed with 'pip install . -r requirements.txt'. If this directory
# contains a setup.py with python_requires='>=3.7.0', the container's Python
# 3.6 interpreter will refuse to install it — which matches the reported
# error. Confirm setup.py / setup.cfg inside this directory.
source_dir = '/home/ec2-user/SageMaker/multicat_churn'

local_estimator = TensorFlow(source_dir=source_dir,
                             entry_point='train.py',
                             model_dir=model_dir,
                             train_instance_type=train_instance_type,
                             train_instance_count=1,
                             hyperparameters=hyperparameters,
                             role=sagemaker.get_execution_role(),
                             base_job_name='tf-2-workflow',
                             framework_version='2.1',
                             py_version='py3')

train_dir = 's3://####/train/training_data.joblib'
# BUG FIX: the test URI was missing a slash ('s3:/####/...'), which is not a
# valid S3 URI and would fail channel validation.
test_dir = 's3://####/test/test_data.joblib'

# Channel names 'train'/'test' become SM_CHANNEL_TRAIN / SM_CHANNEL_TEST
# inside the container.
inputs = {'train': train_dir,
          'test': test_dir}
local_estimator.fit(inputs)