# imports

In [2]:
import time
import boto3
import botocore
import numpy as np  
import pandas as pd  
import sagemaker
from time import gmtime, strftime, sleep
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sklearn.metrics import roc_auc_score
from sagemaker.experiments.run import Run, load_run

sagemaker.__version__

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


'2.215.0'

In [None]:
boto_session = boto3.Session()
region = boto_session.region_name
print(f"Region--{region}")
sm_role = sagemaker.get_execution_role()
print(f"Sagemaker Role--{sm_role}")
sagemaker_session = sagemaker.Session()
default_bucket = sagemaker_session.default_bucket()
print(f"Default bucket--{default_bucket}")

# SAGEMAKER CONTAINERS


CREATE SageMaker processing and training jobs
In this step you move data processing and model training into SageMaker Docker containers and use SageMaker Python SDK to interact with SageMaker.



* **SageMaker** makes use of Docker containers to enable developers to process data, train and deploy models. 

* **SageMaker** also provides pre-build containers with popular data processing frameworks and ML algorithms. All SageMaker built-in algorithms are delivered as Docker containers.

* **Containers** allow developers and data scientists to package software into standardized units that run consistently on any platform that supports Docker. 

* **Containers** ensure that code, runtime, system tools, system libraries, and settings are all in the same place, isolating them from the execution environment. It guarantees a consistent runtime experience regardless of where a container is being run.




# process data in sagemaker processing jobs

Process data in SageMaker processing job

* Use **SageMaker Processing** by simply providing a Python data preprocessing script and choosing a SageMaker SDK processor class. 
You must upload the input data to S3 and specify an S3 location for output data. 

* **SageMaker Processing** automatically loads the input data from S3 and uploads transformed data back to S3 when the job is complete. 

* The processing container image can either be an Amazon SageMaker built-in image or a custom image that you provide. 

* The underlying infrastructure for a Processing job is fully managed by Amazon SageMaker. 

* Cluster resources are provisioned for the duration of your job, and cleaned up when a job completes.

* Your input data must be stored in an Amazon S3 bucket. Alternatively, you can use Amazon Athena or Amazon Redshift as input sources.



In [3]:
# import config
import os
import yaml
with open(os.path.abspath(os.path.join(os.pardir,"config.yml")),"r") as f:
    config = yaml.load(f,Loader=yaml.FullLoader)
print(config)

{'features': ['dteday', 'season', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'yr', 'mnth'], 'holiday_mappings': {'No': 1, 'Yes': 0}, 'hr_mappings': {'10am': 9, '10pm': 8, '11am': 11, '11pm': 7, '12am': 5, '12pm': 17, '1am': 4, '1pm': 16, '2am': 3, '2pm': 15, '3am': 1, '3pm': 18, '4am': 0, '4pm': 19, '5am': 2, '5pm': 23, '6am': 6, '6pm': 22, '7am': 12, '7pm': 20, '8am': 21, '8pm': 14, '9am': 13, '9pm': 10}, 'mnth_mappings': {'April': 5, 'August': 11, 'December': 2, 'February': 1, 'January': 0, 'July': 10, 'June': 9, 'March': 3, 'May': 7, 'November': 4, 'October': 6, 'September': 8}, 's3-bucket': {'bucket_name': 'sagemaker-us-east-1-644383320443', 'bucket_prefix': 'siemens-poc/', 'training_file_key': 'siemens-poc/bike-sharing-dataset.csv'}, 'sagemaker': {'domain_id': 'd-zjfao8azi0ng', 'region': 'us-east-1', 'role': 'arn:aws:iam::644383320443:role/service-role/AmazonSageMaker-ExecutionRole-20240109T220483'}, 'season

In [5]:
!aws s3 ls {config['s3-bucket']['bucket_name']}/{config['s3-bucket']['bucket_prefix']} --recursive

# write preprocessing script 
> for pre-processing done on the data set

In [28]:
%%writefile ../scripts/preprocessing.py

import pandas as pd
import numpy as np
import argparse
import os

# Treating 'yr' column as Ordinal categorical variable, assign higher value to 2012
yr_mapping = {2011: 0, 2012: 1}
# Treat 'mnth' column as Ordinal categorical variable, and assign values accordingly
mnth_mapping = {'January': 0, 'February': 1, 'December': 2, 'March': 3, 'November': 4, 'April': 5,
                'October': 6, 'May': 7, 'September': 8, 'June': 9, 'July': 10, 'August': 11}
# Treat 'season' column as Ordinal categorical variable, and assign values accordingly
season_mapping = {'spring': 0, 'winter': 1, 'summer': 2, 'fall': 3}
# Map weather situation
weather_mapping = {'Heavy Rain': 0, 'Light Rain': 1, 'Mist': 2, 'Clear': 3}
# Map holiday
holiday_mapping = {'Yes': 0, 'No': 1}
# Map workingday
workingday_mapping = {'No': 0, 'Yes': 1}
# Map hour
hour_mapping = {'4am': 0, '3am': 1, '5am': 2, '2am': 3, '1am': 4, '12am': 5, '6am': 6, '11pm': 7, '10pm': 8,
                '10am': 9, '9pm': 10, '11am': 11, '7am': 12, '9am': 13, '8pm': 14, '2pm': 15, '1pm': 16,
                '12pm': 17, '3pm': 18, '4pm': 19, '7pm': 20, '8am': 21, '6pm': 22, '5pm': 23}


# find numerical and categorical variables
def get_feature_categories(df):
    unused_colms = ['dteday', 'casual', 'registered']   # unused columns will be removed at later stage
    target_col = ['cnt']

    numerical_features = []
    categorical_features = []

    for col in df.columns:
        if col not in target_col + unused_colms:
            if df[col].dtypes == 'float64':
                numerical_features.append(col)
            else:
                categorical_features.append(col)
    print('Number of numerical variables: {}'.format(len(numerical_features)),":" , numerical_features)
    print('Number of categorical variables: {}'.format(len(categorical_features)),":" , categorical_features)
    return numerical_features , categorical_features , unused_colms


 # Working on `dteday` column to extract year and month
def get_year_and_month(dataframe):
    df = dataframe.copy()
    # convert 'dteday' column to Datetime datatype
    df['dteday'] = pd.to_datetime(df['dteday'], format='%Y-%m-%d')
    # Add new features 'yr' and 'mnth
    df['yr'] = df['dteday'].dt.year
    df['mnth'] = df['dteday'].dt.month_name()
    print("Added yr and month variables to the dataframe--")
    return df


# Function to impute weekday by extracting day name from the date column

def impute_weekday(dataframe):
    df = dataframe.copy()
    wkday_null_idx = df[df['weekday'].isnull() == True].index
    # print(len(wkday_null_idx))
    df.loc[wkday_null_idx, 'weekday'] = df.loc[wkday_null_idx, 'dteday'].dt.day_name().apply(lambda x: x[:3])
    print("imputed weekday in dataframe--")
    return df


# Function to handle outliers for a single column

def handle_outliers(dataframe, colm):
    df = dataframe.copy()
    q1 = df.describe()[colm].loc['25%']
    q3 = df.describe()[colm].loc['75%']
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    for i in df.index:
        if df.loc[i,colm] > upper_bound:
            df.loc[i,colm]= upper_bound
        if df.loc[i,colm] < lower_bound:
            df.loc[i,colm]= lower_bound
    print(f"outliers handled in column--{colm}")
    return df


# Treating 'weekday' column as a Nominal categorical variable, perform one-hot encoding
from sklearn.preprocessing import  OneHotEncoder
# build encoder
encoder = OneHotEncoder()

## Function for pre-processing the dataset

def pre_process(dataframe):
    df = dataframe.copy()
    df = get_year_and_month(df)
    df = impute_weekday(df)
    df['weathersit'].fillna('Clear', inplace=True)
    numerical_features , categorical_features , unused_colms = get_feature_categories(df)
    for col in numerical_features:
        df = handle_outliers(df, col)
    df['yr'] = df['yr'].apply(lambda x: yr_mapping[x])
    df['mnth'] = df['mnth'].apply(lambda x: mnth_mapping[x])
    df['season'] = df['season'].apply(lambda x: season_mapping[x])
    df['weathersit'] = df['weathersit'].apply(lambda x: weather_mapping[x])
    df['holiday'] = df['holiday'].apply(lambda x: holiday_mapping[x])
    df['workingday'] = df['workingday'].apply(lambda x: workingday_mapping[x])
    df['hr'] = df['hr'].apply(lambda x: hour_mapping[x])
    encoder.fit(df[['weekday']])
    enc_wkday_features = encoder.get_feature_names_out(['weekday'])
    encoded_weekday = encoder.transform(df[['weekday']]).toarray()
    df[enc_wkday_features] = encoded_weekday
    # drop not required columns
    unused_colms.append('weekday')
    df.drop(labels = unused_colms, axis = 1, inplace = True)
    print("preprocessing done !")
    return df




def _parse_args():
    
    parser = argparse.ArgumentParser()
    # Data, model, and output directories
    # model_dir is always passed in from SageMaker. By default this is a S3 path under the default bucket.
    parser.add_argument('--filepath', type=str, default='/opt/ml/processing/input/')
    parser.add_argument('--filename', type=str, default='bits-webminar-june-24/bike-sharing-dataset.csv')
    parser.add_argument('--outputpath', type=str, default='/opt/ml/processing/output/')
    
    return parser.parse_known_args()


if __name__=="__main__":
    # Process arguments
    args, _ = _parse_args()
    
    target_col = "cnt"
    
    # Load data
    df_data = pd.read_csv(os.path.join(args.filepath, args.filename))
    try:
        df_data.drop(['Unnamed: 0'],axis=1 , inplace=True)
    except Exception as e:
        print(f"Exception occured in dropping column--unnamed: 0-->{e}")
    processed_data = pre_process(df_data)

    # Shuffle and splitting dataset
    train_data, validation_data, test_data = np.split(
    processed_data.sample(frac=1, random_state=1729),
    [int(0.7 * len(processed_data)), int(0.9 * len(processed_data))],)

    print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")

    
    # Save datasets locally
    train_data.to_csv(os.path.join(args.outputpath, 'train/train.csv'), index=False, header=True)
    validation_data.to_csv(os.path.join(args.outputpath, 'validation/validation.csv'), index=False, header=True)
    test_data[target_col].to_csv(os.path.join(args.outputpath, 'test/test_y.csv'), index=False, header=True)
    test_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'test/test_x.csv'), index=False, header=True)
    
    # Save the baseline dataset for model monitoring
    processed_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'baseline/baseline.csv'), index=True, header=False)
    
    print("## Processing complete. Exiting.")


Overwriting ../scripts/preprocessing.py


# set s3 paths

In [6]:
def get_default_bucket():
    boto_session = boto3.Session()
    # region = boto_session.region_name
    # print(f"Region--{region}")
    # sm_role = sagemaker.get_execution_role()
    # print(f"Sagemaker Role--{sm_role}")
    sagemaker_session = sagemaker.Session()
    default_bucket = sagemaker_session.default_bucket()
    print(f"Default bucket--{default_bucket}")
    return default_bucket

In [7]:
PREFIX = 'bits-webminar-june-24/'
DEFAULT_BUCKET = get_default_bucket()

Default bucket--sagemaker-eu-north-1-058264393695
sagemaker-eu-north-1-058264393695


In [8]:

# bucket_name = config['s3-bucket']['bucket_name']
# bucket_prefix = f'{config["s3-bucket"]["bucket_prefix"]}conatiner-jobs'
# print(f"bucket_prefix for containers--> {DEFAULT_BUCKET}")
# input_s3_url = f's3://{bucket_name}/{config["s3-bucket"]["training_file_key"]}'
# print(f"input_s3_url--> {input_s3_url}")

# train_s3_url = f"s3://{bucket_name}/{bucket_prefix}/train"
# print(f"train_s3_url--> {train_s3_url}")

# validation_s3_url = f"s3://{bucket_name}/{bucket_prefix}/validation"
# print(f"validation_s3_url--> {validation_s3_url}")

# test_s3_url = f"s3://{bucket_name}/{bucket_prefix}/test"
# print(f"test_s3_url--> {test_s3_url}")

# baseline_s3_url = f"s3://{bucket_name}/{bucket_prefix}/baseline"
# print(f"baseline_s3_url--> {baseline_s3_url}")

In [31]:
input_s3_url ='s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/bike-sharing-dataset.csv'
print(f"input_s3_url--> {input_s3_url}")
train_s3_url ='s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/train_data.csv'
print(f"train_s3_url--> {train_s3_url}")
test_s3_url ='s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/test_data.csv'
print(f"test_s3_url--> {test_s3_url}")
validation_s3_url ='s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/validation_data.csv'
print(f"test_s3_url--> {test_s3_url}")
baseline_s3_url = f"s3://{DEFAULT_BUCKET}/{PREFIX}/baseline"
print(f"baseline_s3_url--> {baseline_s3_url}")

input_s3_url--> s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/bike-sharing-dataset.csv
train_s3_url--> s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/train_data.csv
test_s3_url--> s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/test_data.csv
test_s3_url--> s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/test_data.csv
baseline_s3_url--> s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24//baseline


In [25]:
framework_version = "0.23-1"
processing_instance_type = "ml.t3.medium"
processing_instance_count = 1

# create an experiment run

In [11]:
experiment_name = "bits-webminar-june-24-21-06-2024-15-11-46"
# using the existing experiment 

In [12]:
import sys
from pathlib import Path

root_dir = Path('../scripts').resolve().parents[0] 
print(root_dir)
sys.path.append(str(root_dir))

/root/bits-webminar-june-24/ml


In [13]:
import scripts.utils as utils
run_suffix = utils.get_current_ist().strftime("%d-%m-%Y-%H-%M-%S")
run_suffix

'21-06-2024-15-22-58'

In [14]:
sagemaker_session = sagemaker.Session()
sm_client = sagemaker_session.sagemaker_client

In [15]:
run_name = f"container-processing-{run_suffix}"

with Run(experiment_name=experiment_name,
         run_name=run_name,
         run_display_name="container-processing",
         sagemaker_session=sagemaker_session
        ) as run:
    run.log_parameters(
        {
            "train": 0.7,
            "validate": 0.2,
            "test": 0.1
        }
    )
   
    experiment_config = run.experiment_config
    # time.sleep(8) # wait until resource tags are propagated to the run

# create a sklearn processor --IMAGE

In [17]:
# sm_role = config['sagemaker']['role']
# sm_role

sm_role = sagemaker.get_execution_role()
print(f"Sagemaker Role--{sm_role}")

Sagemaker Role--arn:aws:iam::058264393695:role/service-role/AmazonSageMaker-ExecutionRole-20240621T001438


In [26]:
sklearn_processor = SKLearnProcessor(
    framework_version=framework_version,
    role=sm_role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count, 
    base_job_name='container-processing',
    sagemaker_session=sagemaker_session,
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


# define processing job

In [22]:
processing_inputs = [
        ProcessingInput(
            source=input_s3_url, 
            destination="/opt/ml/processing/input",
            s3_input_mode="File",
            s3_data_distribution_type="ShardedByS3Key"
        )
    ]

processing_outputs = [
        ProcessingOutput(
            output_name="train_data", 
            source="/opt/ml/processing/output/train",
            destination=train_s3_url,
        ),
        ProcessingOutput(
            output_name="validation_data", 
            source="/opt/ml/processing/output/validation", 
            destination=validation_s3_url
        ),
        ProcessingOutput(
            output_name="test_data", 
            source="/opt/ml/processing/output/test", 
            destination=test_s3_url
        ),
        ProcessingOutput(
            output_name="baseline_data", 
            source="/opt/ml/processing/output/baseline", 
            destination=baseline_s3_url
        ),
    ]

# start the sagemaker processing job

In [32]:
try:
    sklearn_processor.run(
        inputs=processing_inputs,
        outputs=processing_outputs,
        code='../scripts/preprocessing.py',
        wait=True,
        experiment_config=experiment_config,
        # arguments = ['arg1', 'arg2'],
    )
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == 'AccessDeniedException':
        print(f"Ignore AccessDeniedException: {e.response['Error']['Message']} because of the slow resource tag auto propagation")
    else:
        raise e

INFO:sagemaker:Creating processing-job with name container-processing-2024-06-21-11-24-07-988


...........................................................................
[34mTraceback (most recent call last):
  File "/opt/ml/processing/input/code/preprocessing.py", line 140, in <module>
    df_data = pd.read_csv(os.path.join(args.filepath, args.filename))
  File "/miniconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 686, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/miniconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 452, in _read
    parser = TextFileReader(fp_or_buf, **kwds)
  File "/miniconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 946, in __init__
    self._make_engine(self.engine)
  File "/miniconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 1178, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/miniconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 2008, in __init__
    self._reader = parsers.TextReader(src, **kwds)
  File "pandas/_libs/parsers.pyx"

UnexpectedStatusException: Error for Processing job container-processing-2024-06-21-11-24-07-988: Failed. Reason: AlgorithmError: See job logs for more information

# Model training in SageMaker training job

In [23]:
# get training container uri
# region = config['sagemaker']['region']
training_image = sagemaker.image_uris.retrieve("xgboost", region=region, version="1.5-1")
print(training_image)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1


In [32]:
train_instance_count = 1
train_instance_type = "ml.m5.xlarge"

# Define where the training job stores the model artifact
output_s3_url = f"s3://{config['s3-bucket']['bucket_name']}/{config['s3-bucket']['bucket_prefix']}training-output"
output_s3_url

's3://sagemaker-us-east-1-644383320443/siemens-poc/training-output'

In [33]:
# Instantiate an XGBoost estimator object
estimator = sagemaker.estimator.Estimator(
    image_uri=training_image,  # XGBoost algorithm container
    instance_type=train_instance_type,  
    instance_count=train_instance_count,  
    role=sm_role,  
    max_run=20 * 60,  # Maximum allowed active runtime
    output_path=output_s3_url, # S3 location 
    sagemaker_session=sagemaker_session, # Session object  manages interactions with SageMaker API and AWS services
    base_job_name="xgboost-training", # Prefix for training job name
)

# define its hyperparameters
estimator.set_hyperparameters(
    num_round=150, # the number of rounds to run the training
    max_depth=3, # maximum depth of a tree
    eta=0.5, # step size shrinkage used in updates to prevent overfitting
    alpha=2.5, # L1 regularization term on weights
    objective="reg:squarederror",
    eval_metric="rmse", # evaluation metrics for validation data
    subsample=0.8, # subsample ratio of the training instance
    colsample_bytree=0.8, # subsample ratio of columns when constructing each tree
    min_child_weight=3, # minimum sum of instance weight (hessian) needed in a child
    early_stopping_rounds=10, # the model trains until the validation score stops improving
    verbosity=1, # verbosity of printing messages
)

In [34]:
s3_input_train = sagemaker.inputs.TrainingInput(train_s3_url, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(validation_s3_url, content_type='csv')

In [35]:
training_inputs = {'train': s3_input_train, 'validation': s3_input_validation}

In [36]:
try:
    run_suffix = utils.get_current_ist().strftime("%d-%m-%Y-%H-%M-%S")
    print(f"run_suffix--{run_suffix}")
    run_name = f"container-training-{run_suffix}"

    with Run(experiment_name=experiment_name,
             run_name=run_name,
             run_display_name="container-training",
             sagemaker_session=sagemaker_session
            ) as run:
        
        estimator.fit(
            training_inputs,
            wait=True,
            logs=False,
        ) 
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == 'AccessDeniedException':
        print(f"Ignore AccessDeniedException: {e.response['Error']['Message']} because of the slow resource tag auto propagation")
    else:
        raise e

run_suffix--16-01-2024-00-57-28


INFO:sagemaker:Creating training-job with name: xgboost-training-2024-01-15-19-27-29-687



2024-01-15 19:27:30 Starting - Starting the training job..
2024-01-15 19:27:45 Starting - Preparing the instances for training..............
2024-01-15 19:29:04 Downloading - Downloading input data........
2024-01-15 19:29:50 Downloading - Downloading the training image.
2024-01-15 19:30:00 Training - Training image download completed. Training in progress....
2024-01-15 19:30:20 Uploading - Uploading generated training model...
2024-01-15 19:30:37 Completed - Training job completed


# output model performance

In [37]:
if estimator._current_job_name:
    training_job_name = estimator._current_job_name

In [38]:
training_job_name

'xgboost-training-2024-01-15-19-27-29-687'

In [39]:
metrics = None
while not metrics:
    metrics = sm_client.describe_training_job(
        TrainingJobName=training_job_name
        ).get("FinalMetricDataList")

    if not metrics:
        print(f"Training job {training_job_name} hasn't finished yet!")
        time.sleep(10)

train_rmse = float([m['Value'] for m in metrics if m['MetricName'] == 'train:rmse'][0])
validate_rmse = float([m['Value'] for m in metrics if m['MetricName'] == 'validation:rmse'][0])

print(f"Train RMSE: {train_rmse:.2f}, Validate RMSE: {validate_rmse:.2f}")



Train RMSE: 0.36, Validate RMSE: 0.40


In [40]:
# Print the S3 path to the model artifact:
estimator.model_data

's3://sagemaker-us-east-1-644383320443/siemens-poc/training-output/xgboost-training-2024-01-15-19-27-29-687/output/model.tar.gz'

In [49]:
destination_s3_uri=f"s3://{config['s3-bucket']['bucket_name']}/{config['s3-bucket']['bucket_prefix']}data-capture"
destination_s3_uri

's3://sagemaker-us-east-1-644383320443/siemens-poc/data-capture'

# Validate model -- deploy endpoint

In [50]:
# Real-time endpoint
from time import gmtime
endpoint_name = f"siemens-poc-xgboost-endpoint-{strftime('%d-%H-%M-%S', gmtime())}"

try:
    predictor = estimator.deploy(
        initial_instance_count=1,
        instance_type="ml.m5.xlarge",
        wait=False,  
        # data capture here, for experiment with monitoring:
        data_capture_config=sagemaker.model_monitor.DataCaptureConfig(
            enable_capture=True,
            sampling_percentage=100,
            destination_s3_uri=f"s3://{config['s3-bucket']['bucket_name']}/{config['s3-bucket']['bucket_prefix']}data-capture",
        ),
        endpoint_name=endpoint_name,
        serializer=sagemaker.serializers.CSVSerializer(),
        deserializer=sagemaker.deserializers.CSVDeserializer(),
    )
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == 'AccessDeniedException':
        print(f"Ignore AccessDeniedException: {e.response['Error']['Message']} because of the slow resource tag auto propagation")
        predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name,
                                                  sagemaker_session=sagemaker_session,
                                                  serializer=sagemaker.serializers.CSVSerializer(),
                                                  deserializer=sagemaker.deserializers.CSVDeserializer(),
                                                 )
    else:
        raise e

INFO:sagemaker:Creating model with name: xgboost-training-2024-01-15-20-24-05-916
INFO:sagemaker:Creating endpoint-config with name siemens-poc-xgboost-endpoint-15-20-24-05
INFO:sagemaker:Creating endpoint with name siemens-poc-xgboost-endpoint-15-20-24-05


In [51]:
# Wait until the endpoint has the status InService
waiter = sagemaker_session.sagemaker_client.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=endpoint_name)

In [52]:
!aws s3 cp $test_s3_url/test_x.csv /home/sagemaker-user/siemens-poc/test_x.csv
!aws s3 cp $test_s3_url/test_y.csv /home/sagemaker-user/siemens-poc/test_y.csv

download failed: s3://sagemaker-us-east-1-644383320443/siemens-poc/conatiner-jobs/test/test_x.csv to ../../test_x.csv [Errno 28] No space left on device: '/home/sagemaker-user/siemens-poc/test_x.csv.aC0c4FdA'
download failed: s3://sagemaker-us-east-1-644383320443/siemens-poc/conatiner-jobs/test/test_y.csv to ../../test_y.csv [Errno 28] No space left on device: '/home/sagemaker-user/siemens-poc/test_y.csv.Ae4B5AB1'


In [53]:
test_s3_url

's3://sagemaker-us-east-1-644383320443/siemens-poc/conatiner-jobs/test'

In [57]:
# test_x = pd.read_csv("tmp/test_x.csv", names=[f'{i}' for i in range(59)])
# test_y = pd.read_csv("tmp/test_y.csv", names=['y'])
import io
s3_client = boto3.client("s3")
bucket_name = config['s3-bucket']['bucket_name']
file_key  = "siemens-poc/conatiner-jobs/test/test_x.csv"
response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
file_content = response['Body'].read()
test_x = pd.read_csv(io.BytesIO(file_content), encoding='utf-8')
test_x.head(3)

Unnamed: 0,1,22,1.1,0,3,8.919999999999998,7.000999999999998,66.0,15.0013,1.2,4,0.0,0.0.1,0.0.2,1.0,0.0.3,0.0.4,0.0.5
0,3,11,1,1,3,26.78,30.002,70.0,11.0014,0,9,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,9,1,1,2,7.98,5.0012,81.0,19.0012,0,3,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,1,1,0,1,7.98,5.0012,87.0,19.0012,0,6,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [58]:
file_key  = "siemens-poc/conatiner-jobs/test/test_y.csv"
response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
file_content = response['Body'].read()
test_y = pd.read_csv(io.BytesIO(file_content), encoding='utf-8')
test_y.head(3)

Unnamed: 0,276
0,145
1,75
2,9


In [59]:
predictions = np.array(predictor.predict(test_x.values), dtype=float).squeeze()
predictions

array([3.0410924 , 1.20082128, 1.48737872, ..., 1.27996016, 2.97838068,
       0.22001582])

In [60]:
# evaluate predictions 
test_results = pd.concat(
    [
        pd.Series(predictions, name="y_pred", index=test_x.index),
        test_x,
    ],
    axis=1,
)
test_results.head()

Unnamed: 0,y_pred,1,22,1.1,0,3,8.919999999999998,7.000999999999998,66.0,15.0013,1.2,4,0.0,0.0.1,0.0.2,1.0,0.0.3,0.0.4,0.0.5
0,3.041092,3,11,1,1,3,26.78,30.002,70.0,11.0014,0,9,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.200821,2,9,1,1,2,7.98,5.0012,81.0,19.0012,0,3,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.487379,1,1,1,0,1,7.98,5.0012,87.0,19.0012,0,6,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,3.243681,3,12,1,1,3,22.08,24.0026,65.0,7.0015,0,10,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.880763,3,10,1,1,3,24.9,26.999,65.0,0.0,0,9,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [62]:
pd.crosstab(
    index=test_y['276'].values,
    columns=np.round(predictions), 
    rownames=['actuals'], 
    colnames=['predictions']
)

predictions,-1.0,0.0,1.0,2.0,3.0,4.0
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,10,5,3,1,0
2,0,6,7,9,4,0
3,0,9,12,5,1,1
4,0,5,4,5,6,1
5,0,9,9,9,7,2
...,...,...,...,...,...,...
871,0,0,0,0,1,0
878,0,0,0,0,1,0
884,0,0,1,1,0,0
888,0,0,1,0,0,0


In [63]:
from sklearn.metrics import mean_squared_error
test_rmse = mean_squared_error(test_y, test_results["y_pred"])
print(f"Test-rmse: {test_rmse:.2f}")

Test-rmse: 70439.67


# model prediction test -- evaluation.py

In [41]:
# s3://sagemaker-us-east-1-644383320443/
import io
file_key  = "siemens-poc/pipeline-jobs/test/test_x.csv"
s3_client = boto3.client("s3")
response = s3_client.get_object(Bucket=config['s3-bucket']['bucket_name'], Key=file_key)
file_content = response['Body'].read()
X_test = pd.read_csv(io.BytesIO(file_content), encoding='utf-8')
X_test.head(3)

Unnamed: 0,season,hr,holiday,workingday,weathersit,temp,atemp,hum,windspeed,yr,mnth,weekday_Fri,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thu,weekday_Tue,weekday_Wed
0,1,22,1,0,3,8.92,7.001,66.0,15.0013,1,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,3,11,1,1,3,26.78,30.002,70.0,11.0014,0,9,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,9,1,1,2,7.98,5.0012,81.0,19.0012,0,3,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [42]:
file_key  = "siemens-poc/pipeline-jobs/test/test_y.csv"
response = s3_client.get_object(Bucket=config['s3-bucket']['bucket_name'], Key=file_key)
file_content = response['Body'].read()
y_test = pd.read_csv(io.BytesIO(file_content), encoding='utf-8')
y_test.head(3)

Unnamed: 0,cnt
0,276
1,145
2,75


In [17]:
model_s3_path = 's3://sagemaker-us-east-1-644383320443/siemens-poc/training-output/xgboost-training-2024-01-15-19-27-29-687/output/model.tar.gz'

In [27]:
import pickle as pkl
import tarfile
import joblib

In [28]:
! pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Stream and Load the Model Directly from S3

In [33]:
import boto3
import io
import tarfile
import tempfile
import xgboost as xgb

bucket_name = config['s3-bucket']['bucket_name']
model_file_key = 'siemens-poc/training-output/xgboost-training-2024-01-15-19-27-29-687/output/model.tar.gz'



s3 = boto3.resource('s3')

model_file_stream = io.BytesIO()
s3.Bucket(bucket_name).download_fileobj(model_file_key, model_file_stream)

# Set the stream position to the start
model_file_stream.seek(0)

# Extract the model file from the tar.gz archive in memory
with tarfile.open(fileobj=model_file_stream, mode="r:gz") as tar:
    # Extract the model file to a temporary file
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        for member in tar.getmembers():
            if member.name == 'xgboost-model':  # Adjust the file name as needed
                tmp_file.write(tar.extractfile(member).read())
                tmp_file.flush()
                break

# Load the XGBoost model from the temporary file
model = xgb.Booster()
model.load_model(tmp_file.name)



In [43]:
# Read test data
# test_x = xgb.DMatrix(pd.read_csv(test_x_path).values)
test_x = xgb.DMatrix(X_test.values)
print(test_x)
# test_y = pd.read_csv(test_y_path).to_numpy()
test_y = y_test.to_numpy()
print(test_y)


<xgboost.core.DMatrix object at 0x7f20897a5b10>
[[276]
 [145]
 [ 75]
 ...
 [ 59]
 [335]
 [ 29]]


In [44]:
# Run predictions
predictions = np.array(model.predict(test_x), dtype=float).squeeze()
print(predictions)


[1.27996016 3.0410924  1.20082128 ... 1.27996016 2.97838068 0.22001582]


In [45]:
# Evaluate predictions
test_results = pd.concat([pd.Series(predictions, name="y_pred", index=X_test.index),X_test,],axis=1,)
test_results.head()


Unnamed: 0,y_pred,season,hr,holiday,workingday,weathersit,temp,atemp,hum,windspeed,yr,mnth,weekday_Fri,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thu,weekday_Tue,weekday_Wed
0,1.27996,1,22,1,0,3,8.92,7.001,66.0,15.0013,1,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,3.041092,3,11,1,1,3,26.78,30.002,70.0,11.0014,0,9,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.200821,2,9,1,1,2,7.98,5.0012,81.0,19.0012,0,3,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.487379,1,1,1,0,1,7.98,5.0012,87.0,19.0012,0,6,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,3.243681,3,12,1,1,3,22.08,24.0026,65.0,7.0015,0,10,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
from sklearn.metrics import mean_squared_error
test_rmse = mean_squared_error(test_y, test_results["y_pred"])
report_dict = {"regression_metric":{"test_rmse":{"value":test_rmse}}}
print(f"Test-rmse: {test_rmse:.2f}")




Test-rmse: 70442.56


In [47]:
report_dict

{'regression_metric': {'test_rmse': {'value': 70442.56395592289}}}

In [None]:
# Save evaluation report
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
with open(f"{output_dir}/evaluation.json", "w") as f:
    f.write(json.dumps(report_dict))

# Save prediction baseline file - we need it later for the model quality monitoring
test_results.to_csv(os.path.join(output_prediction_path, 'prediction_baseline/prediction_baseline.csv'), index=False, header=True)