# Load the transformed dataset from Amazon S3

In [5]:
import pandas as pd
import numpy as np
import boto3
from io import StringIO


In [6]:
# Option 1: read the transformed CSV directly from S3 with pandas.
s3_uri = 's3://predict-calorie-expenditure-kaggle/output_2b93293d-2f30-4bdd-b0f5-87ec0fa4bfb7/part-00000-19e5141a-8ad0-4c46-97c8-caf7c746e247-c000.csv'
try:
    # Only the read itself is guarded, so a failure while displaying the frame
    # is not misreported as a loading error.
    df = pd.read_csv(s3_uri)
except Exception as e:
    print(f"Error loading CSV with pandas: {e}")
    # Re-raise: swallowing the error would leave `df` undefined and the failure
    # would only show up later as a confusing NameError in a downstream cell.
    raise
else:
    print("Data loaded successfully using pd.read_csv!")
    print(df.head())  # Display the first few rows

Data loaded successfully using pd.read_csv!
   id  Sex  Age  Height  Weight  Duration  Heart_Rate  Body_Temp  Calories
0   0  1.0   36   189.0    82.0      26.0       101.0       41.0     150.0
1   1  0.0   64   163.0    60.0       8.0        85.0       39.7      34.0
2   2  0.0   51   161.0    64.0       7.0        84.0       39.8      29.0
3   3  1.0   20   192.0    90.0      25.0       105.0       40.7     140.0
4   4  0.0   38   166.0    61.0      25.0       102.0       40.6     146.0


# Training the Model

In [7]:
import sagemaker
# Region of the default SageMaker session; reused below when looking up the
# training container image.
region = sagemaker.Session().boto_region_name
print(f"AWS Region: {region}")

# Execution role that is handed to the estimator for the training job.
role = sagemaker.get_execution_role()
print(f"RoleArn: {role}")

AWS Region: us-east-1


RoleArn: arn:aws:iam::619071335416:role/service-role/AmazonSageMaker-ExecutionRole-20250504T222946


In [8]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
from sagemaker.session import TrainingInput

In [9]:
# Show the installed SageMaker Python SDK version as this cell's output, so the
# SDK version this notebook was run with is recorded alongside the results.
sagemaker.__version__

'2.243.3'

In [11]:
# Model artifacts are written to s3://<bucket>/<prefix>/xgboost_model.
bucket = "predict-calorie-expenditure-kaggle"
prefix = "models"
s3_output_location = f"s3://{bucket}/{prefix}/xgboost_model"

# Look up the "xgboost" image URI (version "1.2-1") for the region resolved earlier.
container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print(container)

683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-1


In [12]:
# Build the estimator's collaborators first so the constructor call stays short.
# The session is created before the rules, matching the original evaluation order.
training_session = sagemaker.Session()
training_rules = [
    Rule.sagemaker(rule_configs.create_xgboost_report()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

# Estimator for the XGBoost container: a single ml.m4.xlarge instance, with
# model output written to `s3_output_location`.
xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=training_session,
    rules=training_rules,
)


In [16]:
# Hyperparameters for the training job, collected in one mapping so they can be
# read (and tuned) in a single place before being applied to the estimator.
xgb_hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.7,
    "objective": "reg:squarederror",
    "num_round": 1000,
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)

In [17]:
from sagemaker.session import TrainingInput

# S3 locations of the training and validation CSV files.
# NOTE(review): SageMaker's built-in XGBoost documents CSV input as header-less
# with the target in the first column -- confirm these files follow that layout.
train_s3_uri = "s3://predict-calorie-expenditure-kaggle/train-data/output_39841a78-55e7-47b8-bccb-fbf3200dc311/part-00000-ad2717df-7b54-4948-aab8-dce84ce1b8dd-c000.csv"
validation_s3_uri = "s3://predict-calorie-expenditure-kaggle/valid-data/output_7df5f11d-188f-476d-a99b-7df7dcc8f4eb/part-00000-838e7560-0eec-45f8-bc81-1341aa67fa70-c000.csv"

# Wrap each location as a CSV input channel for the estimator.
train_input = TrainingInput(train_s3_uri, content_type="csv")
validation_input = TrainingInput(validation_s3_uri, content_type="csv")

In [18]:
# Map each channel name to its input, then launch the training job with both.
data_channels = {"train": train_input, "validation": validation_input}
xgb_model.fit(data_channels, wait=True)

2025-05-11 21:21:49 Starting - Starting the training job...
2025-05-11 21:22:11 Starting - Preparing the instances for trainingCreateXgboostReport: InProgress
ProfilerReport: InProgress
...
2025-05-11 21:22:51 Downloading - Downloading input data...
2025-05-11 21:23:11 Downloading - Downloading the training image......
2025-05-11 21:24:20 Training - Training image download completed. Training in progress...[34m[2025-05-11 21:24:33.646 ip-10-0-247-132.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m