In [None]:
import datetime
import tarfile

import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
import sagemaker

# --- S3 location -------------------------------------------------------------
# Replace both placeholders before running the rest of the notebook.
bucket = '<bucket_name>'  # put your s3 bucket name here, and create s3 bucket
prefix = '<prefix>'

# --- AWS handles -------------------------------------------------------------
# Low-level SageMaker client plus the SDK session; the region is read from the
# boto session that the SDK session wraps.
sm_boto3 = boto3.client('sagemaker')
sess = sagemaker.Session()
region = sess.boto_session.region_name

# customize to your bucket where you have stored the data
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

print('Using bucket ' + bucket)

In [None]:
# send data to S3. SageMaker will take training data from s3
def _upload_csv(local_csv):
    """Upload one local file to `bucket` under `prefix`; return upload_data's result."""
    return sess.upload_data(path=local_csv, bucket=bucket, key_prefix=prefix)


# One upload per split, in the same order as before: train, validation, test.
trainpath = _upload_csv('./data/train_data.csv')
validpath = _upload_csv('./data/valid_data.csv')
testpath = _upload_csv('./data/test_data.csv')

In [None]:
%%writefile failure_script.py

# Entry-point script written to failure_script.py by the %%writefile magic.
# It exposes model_fn (loads the persisted model) and, when run as __main__,
# trains an XGBoost classifier and saves it with joblib.
import argparse
import os
import subprocess
import sys

# xgboost is installed/upgraded with pip every time this module is imported.
# The return code of subprocess.call is ignored, so a failed install is not
# detected at this point.
# NOTE(review): the version is unpinned, so different runs may pick up
# different xgboost releases -- consider pinning.
# Earlier conda-based attempt, kept for reference:
# subprocess.call([sys.executable, "conda", "install", "-y", "xgboost"])
subprocess.call([sys.executable, '-m', 'pip', 'install', '-U', 'xgboost']) 

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
# NOTE(review): sklearn.externals.joblib was removed in later scikit-learn
# releases; confirm it is available in the container this script runs in.
from sklearn.externals import joblib
# Must come after the pip install above so the freshly installed package is used.
import xgboost as Xgb
from sklearn.ensemble import RandomForestRegressor

print("XGBoost version installed: ", Xgb.__version__)

# Silence FutureWarning messages for the rest of the run.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Turn off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

# inference functions ---------------
def model_fn(model_dir):
    """Load and return the model stored as "model.joblib" inside `model_dir`.

    Args:
        model_dir: directory that contains the persisted "model.joblib" file.

    Returns:
        Whatever object joblib deserializes from that file.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)



if __name__ == '__main__':
    # Training entry point: parse the arguments, load the three CSV splits,
    # fit an XGBoost classifier with early stopping on the validation split,
    # and persist the fitted model under --model-dir as "model.joblib"
    # (the file that model_fn loads).

    print('extracting arguments')
    parser = argparse.ArgumentParser()

    # Data, model, and output directories; defaults come from the SM_* environment variables.
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--valid', type=str, default=os.environ.get('SM_CHANNEL_VALID'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))

    # Name of the CSV file expected inside each of the directories above.
    parser.add_argument('--train-file', type=str, default="training.csv")
    parser.add_argument('--valid-file', type=str, default="validation.csv")
    parser.add_argument('--test-file', type=str, default="testing.csv")

    # Numeric options are parsed as floats: with type=str a value supplied on
    # the command line would reach the model as a string, while the default
    # stayed a number. --q is accepted but not used by the code below.
    parser.add_argument('--q', type=float, default=30.0)
    parser.add_argument('--scale-pos-weight', type=float, default=1.0)

    # Label column. Required, because without it the script would only fail
    # later with an unhelpful error when dropping a None column.
    parser.add_argument('--target', type=str, required=True)

    # parse_known_args: unrecognized extra arguments are ignored, not rejected.
    args, _ = parser.parse_known_args()

    print('reading data')
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    valid_df = pd.read_csv(os.path.join(args.valid, args.valid_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print(train_df.head())

    print('building training and testing datasets')
    # Features are every column except the target; the target column is the label.
    x_train = train_df.drop(args.target, axis=1)
    x_valid = valid_df.drop(args.target, axis=1)
    x_test = test_df.drop(args.target, axis=1)
    y_train = train_df[args.target]
    y_valid = valid_df[args.target]
    y_test = test_df[args.target]

    print("Training shape: ", x_train.shape, y_train.shape)
    print("Validation shape: ", x_valid.shape, y_valid.shape)
    print("Testing shape: ", x_test.shape, y_test.shape)

    # train
    print('training model')
    xgbm = Xgb.XGBClassifier(
        max_depth=40,
        scale_pos_weight=args.scale_pos_weight,
        n_estimators=250,
        verbosity=1,
        n_jobs=-1,
        subsample=0.9,
        colsample_bytree=0.5,
        random_state=21,
    )
    # Early stopping is driven by AUC on the validation split.
    # NOTE(review): newer xgboost releases expect eval_metric and
    # early_stopping_rounds on the constructor instead of fit(); since the
    # script installs an unpinned xgboost, confirm this call against the
    # version actually installed.
    xgbm.fit(x_train.values, y_train,
             eval_set=[(x_valid.values, y_valid)],
             verbose=True, early_stopping_rounds=3, eval_metric="auc")

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(xgbm, path)
    print('model persisted at ' + path)

In [None]:
# We use the SKLearn Estimator from the SageMaker Python SDK to run
# failure_script.py as a training job.
from sagemaker.sklearn.estimator import SKLearn

# NOTE(review): output_path / code_location point at a fixed bucket rather than
# the `bucket` / `prefix` configured at the top of the notebook -- confirm this
# is intended before running in another account.
sklearn_estimator = SKLearn(
    entry_point='failure_script.py',
    base_job_name="failure-prediction-" + datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S'),
    role=get_execution_role(),
    train_instance_count=1,
    output_path="s3://project-works-us/cairn-energy/mlsource/failure/model",
    code_location="s3://project-works-us/cairn-energy/mlsource/failure/training-output",
    train_instance_type='ml.m5.24xlarge',
    framework_version='0.20.0',
    hyperparameters={
        'target': 'failed_state',
        'scale-pos-weight': 67.57894736842105,
        # The uploaded files are train_data.csv / valid_data.csv / test_data.csv,
        # while the script defaults to training.csv / validation.csv / testing.csv.
        # Pass the real names so the script can find its inputs.
        'train-file': 'train_data.csv',
        'valid-file': 'valid_data.csv',
        'test-file': 'test_data.csv',
    })

In [None]:
# Channel name -> S3 location returned by the upload step. The channel names
# match the --train / --valid / --test arguments of the training script.
channels = {
    'train': trainpath,
    'valid': validpath,
    'test': testpath,
}

# wait=False: start the training job without blocking this cell on it.
sklearn_estimator.fit(channels, wait=False)

In [None]:
%%time

endpoint_name = "failure-prediction-ep-" + datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

sklearn_estimator.deploy(
    instance_type='ml.c5.large',
    initial_instance_count=1,
    endpoint_name=endpoint_name
)

In [None]:
# Attach a predictor to the endpoint created by the deployment cell, reusing its
# `endpoint_name` instead of a name hard-coded from an earlier run (which would
# silently target a different, possibly deleted, endpoint).
# To reconnect to an endpoint from a previous session, assign its name to
# `endpoint_name` before running this cell.
predictor = sagemaker.sklearn.model.SKLearnPredictor(endpoint_name=endpoint_name)