## Descargar datos

In [2]:
!wget https://gist.githubusercontent.com/trantuyen082001/1fc2f5c0ad1507f40e721e6d18b34138/raw/56c3ca73768ceb74cdf6aa20ee7314c47d6ae08e/heart.csv -O data/original/heart.csv

--2025-10-03 23:51:45--  https://gist.githubusercontent.com/trantuyen082001/1fc2f5c0ad1507f40e721e6d18b34138/raw/56c3ca73768ceb74cdf6aa20ee7314c47d6ae08e/heart.csv
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11019 (11K) [text/plain]
Saving to: ‘data/original/heart.csv’


2025-10-03 23:51:45 (49.4 MB/s) - ‘data/original/heart.csv’ saved [11019/11019]



## Leer datos y reorganizar

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sagemaker.inputs import TrainingInput
from sagemaker import image_uris
from sagemaker.estimator import Estimator
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

In [4]:
df = pd.read_csv("data/original/heart.csv")

In [6]:
last_column = df.pop("output")

In [9]:
df.insert(0, "target", last_column)

## Dividir en train y test

In [14]:
train, test = train_test_split(df, test_size=0.3)

In [16]:
train.to_csv("data/train.csv", header=None, index=None)

In [17]:
test.to_csv("data/test.csv", header=None, index=None)

In [18]:
!aws s3 cp data/train.csv s3://sagemaker-data1235/heart/

upload: data/train.csv to s3://sagemaker-data1235/heart/train.csv


In [19]:
!aws s3 cp data/test.csv s3://sagemaker-data1235/heart/

upload: data/test.csv to s3://sagemaker-data1235/heart/test.csv


In [21]:
train_input = TrainingInput("s3://sagemaker-data1235/heart/train.csv", 
                            content_type="text/csv")

In [23]:
test_input = TrainingInput("s3://sagemaker-data1235/heart/test.csv", 
                            content_type="text/csv")

## crear estimador

In [34]:
hyperparameters = {
        "max_depth":"5",
        "objective":"binary:logistic",
        "num_round":"50", 
        "eval_metric": "error"}

In [35]:
container = image_uris.retrieve("xgboost", "us-east-1", "latest")

In [36]:
estimator = Estimator(image_uri=container, role="LabRole", 
                      instance_count=1, instance_type="ml.m5.xlarge",
                     hyperparameters=hyperparameters)

In [37]:
estimator.fit({"train": train_input, "validation": test_input})

INFO:sagemaker:Creating training-job with name: xgboost-2025-10-04-00-59-19-592


2025-10-04 00:59:21 Starting - Starting the training job...
2025-10-04 00:59:53 Downloading - Downloading input data...
2025-10-04 01:00:18 Downloading - Downloading the training image.....[34mArguments: train[0m
[34m[2025-10-04:01:01:07:INFO] Running standalone xgboost training.[0m
[34m[2025-10-04:01:01:07:INFO] File size need to be processed in the node: 0.01mb. Available memory size in the node: 8226.22mb[0m
[34m[2025-10-04:01:01:07:INFO] Determined delimiter of CSV input is ','[0m
[34m[01:01:07] S3DistributionType set as FullyReplicated[0m
[34m[01:01:07] 212x13 matrix with 2756 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2025-10-04:01:01:07:INFO] Determined delimiter of CSV input is ','[0m
[34m[01:01:07] S3DistributionType set as FullyReplicated[0m
[34m[01:01:07] 91x13 matrix with 1183 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=,[0m
[34m[01:01:07] src/tree/updater_prune.cc:7

## Realizar deploy

In [None]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
deployed  = estimator.deploy(initial_instance_count=1, 
                             instance_type="ml.m5.xlarge",
                             serializer=CSVSerializer(),
                             deserializer=CSVDeserializer())
                             