In [9]:
from datetime import datetime

import pandas as pd
import sagemaker
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.session import TrainingInput

# Training 

In [2]:
# Name of the S3 bucket used to build the S3 URIs in this notebook.
BUCKET = "page2sage"

# AWS region, read from a SageMaker session.
sm_session = sagemaker.Session()
region = sm_session.boto_region_name

# IAM role used for execution, as returned by the SageMaker SDK.
role = sagemaker.get_execution_role()


In [3]:
# Look up the container image URI for the "xgboost" framework in this region.
# Image URI configs: https://github.com/aws/sagemaker-python-sdk/tree/master/src/sagemaker/image_uri_config
# NOTE(review): "latest" is an unpinned version tag; consider pinning an
# explicit version for reproducibility (confirm against the SDK docs).
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="latest",
)

# Settings for the training job, collected in one place.
estimator_settings = dict(
    image_uri=container,
    role=role,
    instance_count=1,  # for parallelization
    # Available instance types:
    # https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/notebooks-available-instance-types.md
    instance_type="ml.m5.large",
    volume_size=5,  # in GB
    output_path=f"s3://{BUCKET}/models/xgboost/",  # where the trained model is saved
    sagemaker_session=sagemaker.Session(),
)
xgb_model = sagemaker.estimator.Estimator(**estimator_settings)

In [4]:
# Hyperparameters passed to the estimator before training.
hyperparameters = {
    "max_depth": 5,
    "num_round": 100,
}
xgb_model.set_hyperparameters(**hyperparameters)

In [5]:
# Options shared by both input channels: CSV content addressed by S3 prefix.
# s3_data_type accepts "S3Prefix", "ManifestFile" or "AugmentedManifestFile".
CSV_CHANNEL_OPTIONS = {
    "content_type": "csv",
    "s3_data_type": "S3Prefix",
}

# Training data location in S3.
train_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/train/train.csv",
    **CSV_CHANNEL_OPTIONS,
)

# Validation data location in S3.
validation_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/validation/validation.csv",
    **CSV_CHANNEL_OPTIONS,
)

In [None]:
# Map each channel name to its input and start training; wait=True is passed
# so the call waits for the training job.
data_channels = {
    "train": train_input,
    "validation": validation_input,
}
xgb_model.fit(inputs=data_channels, wait=True)

2021-12-02 08:57:52 Starting - Starting the training job...
2021-12-02 08:58:15 Starting - Launching requested ML instancesProfilerReport-1638435472: InProgress
...
2021-12-02 08:58:41 Starting - Preparing the instances for training......

# Online Prediction 

In [12]:
# Deploy the trained model behind an endpoint; the returned predictor
# serializes request payloads with CSVSerializer.
xgb_predictor = xgb_model.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
    serializer=CSVSerializer(),
)

-------!

In [30]:
# Load the local evaluation set. The CSV is read without a header row; the
# column names come from a separate text file with one name per line.
# (pandas is imported in the imports cell at the top of the notebook.)
eval_data = pd.read_csv("../data/eval.csv", header=None)
with open("../data/columns.txt") as f:
    # splitlines() ignores a trailing newline, which split("\n") would turn
    # into a spurious empty column name and a length-mismatch error.
    eval_data.columns = f.read().splitlines()

# Pick one random row and send it to the deployed endpoint.
sample = eval_data.sample(1).iloc[0]
print(sample)
# The raw prediction is used as an exponent of 10 — presumably the model was
# trained on log10 of the price; confirm against the training data.
print(f"The value for this property is: {10**float(xgb_predictor.predict(sample))}")

OverallQual              8.000000
OverallCond              5.000000
YearBuilt             2004.000000
YearRemodAdd          2005.000000
FullBath                 2.000000
BedroomAbvGr             3.000000
TotRmsAbvGrd             7.000000
Fireplaces               1.000000
LogLotFrontage           2.029384
LogLotArea               4.055455
LogGrLivArea             3.263873
MSZoning_C (all)         0.000000
MSZoning_FV              0.000000
MSZoning_RH              0.000000
MSZoning_RL              1.000000
MSZoning_RM              0.000000
RoofStyle_Flat           0.000000
RoofStyle_Gable          1.000000
RoofStyle_Gambrel        0.000000
RoofStyle_Hip            0.000000
RoofStyle_Mansard        0.000000
RoofStyle_Shed           0.000000
GarageType_2Types        0.000000
GarageType_Attchd        1.000000
GarageType_Basment       0.000000
GarageType_BuiltIn       0.000000
GarageType_CarPort       0.000000
GarageType_Detchd        0.000000
GarageType_None          0.000000
HouseStyle_1.5

In [31]:
# Change the remodel year on the same sample and predict again, so the two
# printed values can be compared.
sample.loc["YearRemodAdd"] = 2020
print(sample)
raw_prediction = xgb_predictor.predict(sample)
print(f"The value for this property is: {10**float(raw_prediction)}")

OverallQual              8.000000
OverallCond              5.000000
YearBuilt             2004.000000
YearRemodAdd          2020.000000
FullBath                 2.000000
BedroomAbvGr             3.000000
TotRmsAbvGrd             7.000000
Fireplaces               1.000000
LogLotFrontage           2.029384
LogLotArea               4.055455
LogGrLivArea             3.263873
MSZoning_C (all)         0.000000
MSZoning_FV              0.000000
MSZoning_RH              0.000000
MSZoning_RL              1.000000
MSZoning_RM              0.000000
RoofStyle_Flat           0.000000
RoofStyle_Gable          1.000000
RoofStyle_Gambrel        0.000000
RoofStyle_Hip            0.000000
RoofStyle_Mansard        0.000000
RoofStyle_Shed           0.000000
GarageType_2Types        0.000000
GarageType_Attchd        1.000000
GarageType_Basment       0.000000
GarageType_BuiltIn       0.000000
GarageType_CarPort       0.000000
GarageType_Detchd        0.000000
GarageType_None          0.000000
HouseStyle_1.5

In [27]:
# Display the name of the endpoint that xgb_predictor talks to.
xgb_predictor.endpoint_name

'xgboost-2021-12-02-21-53-09-236'

In [66]:
# Attach a second Predictor to the existing endpoint by its name.
# (Predictor is imported in the imports cell at the top of the notebook.)
ser = CSVSerializer()
predictor = Predictor(
    xgb_predictor.endpoint_name,
    sagemaker_session=sagemaker.Session(),
    serializer=ser,
)
# Let the predictor serialize the sample with its configured serializer; the
# previous version serialized it manually and then passed the resulting
# string through a second CSVSerializer instance.
10**float(predictor.predict(sample))

296577.6399058261

In [69]:
# Delete the endpoint behind xgb_predictor.
# NOTE(review): the recorded output of this cell is a ClientError saying the
# endpoint configuration could not be found — presumably the resources had
# already been removed when the cell was (re-)run; confirm before re-running.
xgb_predictor.delete_endpoint()

ClientError: An error occurred (ValidationException) when calling the DescribeEndpointConfig operation: Could not find endpoint configuration "arn:aws:sagemaker:us-east-2:317987917227:endpoint-config/xgboost-2021-12-02-21-53-09-236".

# Batch Transform

In [70]:
# Create a batch transformer from the trained model. The output location is
# built from BUCKET so it stays consistent with the other S3 URIs in this
# notebook instead of repeating the bucket name.
transformer = xgb_model.transformer(
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{BUCKET}/predicted",
)

In [71]:
# Run the batch transform over the objects under the eval/ prefix of BUCKET
# (built from the constant instead of repeating the bucket name).
# NOTE(review): the recorded run of this cell failed with "Feature size of csv
# inference data 38 is not consistent with feature size of trained model 37" —
# presumably the files under eval/ carry one extra column; confirm the data
# (or drop the column with input_filter) before re-running.
transformer.transform(
    data=f"s3://{BUCKET}/eval/",
    data_type="S3Prefix",
    content_type="text/csv",
)

.......................[34mArguments: serve[0m
[34m[2021-12-02 22:28:09 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-12-02 22:28:09 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-12-02 22:28:09 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-12-02 22:28:09 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2021-12-02 22:28:09 +0000] [22] [INFO] Booting worker with pid: 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-12-02:22:28:09:INFO] Model loaded successfully for worker : 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-12-02:22:28:09:INFO] Model loaded successfully for worker : 22[0m

[34m[2021-12-02:22:28:13:INFO] Sniff delimiter as ','[0m
[34m[2021-12-02:22:28:13:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-12-02:22:28:13:ERROR] Feature size of csv inference data 38 is not consistent with feature size of trained model 37[0m
[34mTraceback (most recent call last):
  File "/opt/amazon/lib/p

UnexpectedStatusException: Error for Transform job xgboost-2021-12-02-22-24-27-891: Failed. Reason: ClientError: See job logs for more information

In [None]:
# Wait on the transform job started by transformer.transform().
transformer.wait()

In [None]:

!aws s3 ls s3://sage2page/predicted/ 

In [None]:
!aws s3 cp s3://sage2page/predicted/ ../data/predicted.csv
!head ../data/predicted.csv