Skip to content

Sagemaker tensorflow container thinks I'm using Python 3.7 and fails to build #1704

@qemtek

Description

@qemtek

Hi, I developed a project using Python 3.7 but have since downgraded to Python 3.6 to make it compatible with SageMaker. I can run the project using a Python 3.6 virtual environment, but when I try to train a model in script mode using the code below, SageMaker throws the following error:

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2. 's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2. 'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2. Creating tmp291cqf55_algo-1-fj9uv_1 ... Attaching to tmp291cqf55_algo-1-fj9uv_12mdone algo-1-fj9uv_1 | 2020-07-13 15:06:21,068 sagemaker-containers INFO Imported framework sagemaker_tensorflow_container.training algo-1-fj9uv_1 | 2020-07-13 15:06:21,075 sagemaker-containers INFO No GPUs detected (normal if no gpus installed) algo-1-fj9uv_1 | 2020-07-13 15:06:21,510 sagemaker-containers INFO Installing module with the following command: algo-1-fj9uv_1 | /usr/bin/python3 -m pip install . -r requirements.txt algo-1-fj9uv_1 | Processing /opt/ml/code algo-1-fj9uv_1 | ERROR: Package 'multicat-churn' requires a different Python: 3.6.9 not in '>=3.7.0' algo-1-fj9uv_1 | 2020-07-13 15:06:22,580 sagemaker-containers ERROR InstallModuleError: algo-1-fj9uv_1 | Command "/usr/bin/python3 -m pip install . -r requirements.txt" tmp291cqf55_algo-1-fj9uv_1 exited with code 1 Aborting on container exit...

The error is in this line:
algo-1-fj9uv_1 | ERROR: Package 'multicat-churn' requires a different Python: 3.6.9 not in '>=3.7.0'

This is really difficult to debug because the message doesn't say why it thinks the project requires Python 3.7. I have looked through the code countless times and cannot figure out why it is doing this. I will include my train.py code below, along with the code I use to start the training job.

Train.py

import argparse
import numpy as np
import os
import tensorflow as tf
import joblib
from tensorflow.keras.optimizers import Adam
from multicat_churn.src.project_specific_utils.modelling import get_model, get_train_test_data
from multicat_churn.src.project_specific_utils.general import load_file, fs

# Silence TensorFlow's C++ backend logging (level 3 suppresses INFO, WARNING
# and ERROR messages) — must be set before TF does any native logging.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


def parse_args():
    """Parse SageMaker-provided paths and tuneable hyperparameters.

    Returns:
        (argparse.Namespace, list): the parsed arguments and any unknown
        extra arguments, via ``parse_known_args`` so additional flags
        injected by the SageMaker runtime are tolerated.

    Raises:
        KeyError: if the expected SM_* environment variables are not set
            (they are provided inside a SageMaker training container).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--gpu-count', type=int, default=os.environ['SM_NUM_GPUS'])
    # Data directories injected by the SageMaker container environment.
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    # BUGFIX: flag was '--training' but __main__ reads args.train.
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    # BUGFIX: env var name was misspelled 'SM_CHANNEL_TEWST'.
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])
    # Tuneable hyperparameters.
    # NOTE(review): the estimator passes keys like 'batch_size' (underscores);
    # SageMaker forwards them as '--batch_size', which these dashed flags will
    # NOT match — those values fall into the unknown-args list and the
    # defaults below are used. Verify the hyperparameter key spelling.
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--learning-rate', type=float, default=0.01)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--use_batch_norm', type=int, default=0)
    parser.add_argument('--use_dropout', type=int, default=1)
    parser.add_argument('--use_cnn', type=int, default=1)
    parser.add_argument('--use_l2_regularizer', type=int, default=1)
    parser.add_argument('--cnn_kernel_size', type=int, default=3)
    parser.add_argument('--cnn_pool_size', type=int, default=3)
    parser.add_argument('--dropout_rate', type=float, default=0.3)
    parser.add_argument('--loss', type=str, default='squared_hinge')
    parser.add_argument('--width', type=int, default=32)
    parser.add_argument('--depth', type=int, default=2)
    # Dimensionality and masking (no defaults: None when not supplied).
    parser.add_argument('--time_steps', type=int)
    parser.add_argument('--features', type=int)
    parser.add_argument('--stock_features', type=int)
    parser.add_argument('--mask_value', type=int, default=-1234)

    return parser.parse_known_args()


def get_train_data(train_dir):
    """Load the (x, y) training arrays from the joblib file at ``train_dir``.

    Args:
        train_dir: location of the serialized (x_train, y_train) tuple;
            presumably an s3:// URI readable through the project's ``fs``
            filesystem handle — TODO confirm against caller.

    Returns:
        (x_train, y_train) as loaded by joblib.
    """
    # BUGFIX: the original opened the file but discarded the handle, then
    # passed a stripped *local* path (which does not exist inside the
    # training container) to joblib.load. Load from the open file object.
    with fs.open(train_dir) as f:
        x_train, y_train = joblib.load(f)
    print('x train', x_train.shape, 'y train', y_train.shape)

    return x_train, y_train


def get_test_data(test_dir):
    """Load the (x, y) test arrays from the joblib file at ``test_dir``.

    Args:
        test_dir: location of the serialized (x_test, y_test) tuple,
            presumably an s3:// URI — TODO confirm against caller.

    Returns:
        (x_test, y_test) as loaded by joblib.
    """
    # BUGFIX: str.split returns a *list*, which joblib.load cannot open.
    # Read through fs for consistency with get_train_data.
    with fs.open(test_dir) as f:
        x_test, y_test = joblib.load(f)
    print('x test', x_test.shape, 'y test', y_test.shape)

    return x_test, y_test


if __name__ == "__main__":
    args, _ = parse_args()
    params = vars(args)

    # Load the serialized train/test arrays from the SageMaker channels.
    x_train, y_train = get_train_data(args.train)
    x_test, y_test = get_test_data(args.test)

    # Training is pinned to CPU here; no GPU placement is attempted.
    device = '/cpu:0'
    print(device)
    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    print('batch_size = {}, epochs = {}, learning rate = {}'.format(batch_size, epochs, learning_rate))

    with tf.device(device):
        model = get_model()
        # BUGFIX: removed an unused SGD optimizer that was constructed and
        # immediately discarded — Adam is the optimizer actually compiled in.
        model.compile(
            # 'learning_rate' is the documented kwarg; 'lr' is a deprecated alias.
            optimizer=Adam(learning_rate=params['learning_rate']), loss=params['loss'],
            metrics=['accuracy',
                     tf.keras.metrics.Recall(name='recall'),
                     tf.keras.metrics.AUC(name='auc'),
                     tf.keras.metrics.Precision(name='precision'),
                     tf.keras.metrics.TruePositives(name='tp'),
                     tf.keras.metrics.FalsePositives(name='fp'),
                     tf.keras.metrics.TrueNegatives(name='tn'),
                     tf.keras.metrics.FalseNegatives(name='fn')])

        model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                  validation_data=(x_test, y_test))

        # Evaluate on the held-out test set.
        scores = model.evaluate(x_test, y_test, batch_size, verbose=2)
        print("\nTest Performance :", scores)

        # Save under a numeric version subdirectory ('1'), the layout
        # TensorFlow Serving expects for SavedModel exports.
        model.save(os.path.join(args.model_dir, '1'))

Script used to run training job:

from sagemaker.tensorflow import TensorFlow
import sagemaker

# Launch a SageMaker TensorFlow training job in local mode.
model_dir = 's3://test-bucket-glovocds/models/test_model'
train_instance_type = 'local'  # run in a local container instead of a managed instance
hyperparameters = {'epochs': 5, 'batch_size': 128, 'learning_rate': 0.01}
source_dir = '/home/ec2-user/SageMaker/multicat_churn'
# NOTE(review): 'train_instance_type'/'train_instance_count' are the SDK v1
# parameter names; in SageMaker Python SDK v2 they are 'instance_type' and
# 'instance_count' — verify against the installed SDK version.
local_estimator = TensorFlow(source_dir=source_dir,
                             entry_point='train.py',
                             model_dir=model_dir,
                             train_instance_type=train_instance_type,
                             train_instance_count=1,
                             hyperparameters=hyperparameters,
                             role=sagemaker.get_execution_role(),
                             base_job_name='tf-2-workflow',
                             framework_version='2.1',
                             py_version='py3')

train_dir = 's3://####/train/training_data.joblib'
# BUGFIX: the test URI was 's3:/####/...' — the scheme was missing a slash,
# so the 'test' channel could not be resolved as an S3 location.
test_dir = 's3://####/test/test_data.joblib'
inputs = {'train': train_dir,
          'test': test_dir}

local_estimator.fit(inputs)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions