# Load the transformed dataset from Amazon S3

In [1]:
import pandas as pd
import numpy as np
import boto3
from io import StringIO
import os

# Training the Model

In [2]:
# Load the training and validation splits (CSV files produced upstream) from S3
BUCKET_NAME = "predict-calorie-expenditure-kaggle"
TRAIN_DATA_PATH = "train_data/output_2064db69-0062-4360-9101-13be0f765781/part-00000-8470ccb8-53e0-4dc9-9179-25d44fcd1a16-c000.csv"
# S3 URIs always use "/" as the separator, so build them with string
# formatting; os.path.join would insert the OS separator ("\" on Windows).
train_df = pd.read_csv("s3://{}/{}".format(BUCKET_NAME, TRAIN_DATA_PATH))


VALID_DATA_PATH = "valid_data/output_2e4b123a-435b-44bb-86fb-9b13d32fc451/part-00000-1be17c75-88ed-42f0-8fed-ae2128471837-c000.csv"
valid_df = pd.read_csv("s3://{}/{}".format(BUCKET_NAME, VALID_DATA_PATH))

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [4]:
# Inspect the column labels of the validation frame.
valid_df.columns

Index(['Calories', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp'],
      dtype='object')

In [7]:
# Now process this dataset to remove headers and save it back
def process_df(df, save_name):
    """Upload ``df`` to S3 as a CSV without a header row or index column.

    The CSV text is built in memory, its first five lines are printed as a
    quick sanity check, and the text is then stored in the ``BUCKET_NAME``
    bucket under the key ``processed_data/<save_name>``.

    Args:
        df: DataFrame to serialise.
        save_name: Object name to use beneath the ``processed_data/`` prefix.
    """
    # With no target buffer, to_csv returns the CSV text directly; build it
    # once and reuse it for both the preview and the upload.
    csv_text = df.to_csv(header=False, index=False)
    # Show first few lines of the raw CSV
    print(csv_text.splitlines()[:5])

    s3 = boto3.client('s3')

    s3.put_object(
        Bucket=BUCKET_NAME,
        # S3 keys use "/" regardless of platform, so do not rely on
        # os.path.join (it would produce "\" on Windows).
        Key="processed_data/{}".format(save_name),
        Body=csv_text
    )



In [8]:
# Upload header-less copies of both splits, training first and validation second.
for frame, object_name in (
    (train_df, "train_df_no_header.csv"),
    (valid_df, "valid_df_no_header.csv"),
):
    process_df(frame, object_name)

['150.0,1.0,36,189.0,82.0,26.0,101.0,41.0', '34.0,0.0,64,163.0,60.0,8.0,85.0,39.7', '29.0,0.0,51,161.0,64.0,7.0,84.0,39.8', '140.0,1.0,20,192.0,90.0,25.0,105.0,40.7', '146.0,0.0,38,166.0,61.0,25.0,102.0,40.6']
['145.0,1.0,46,188.0,94.0,23.0,100.0,40.8', '28.0,0.0,41,161.0,63.0,6.0,91.0,39.1', '3.0,0.0,51,171.0,71.0,1.0,77.0,37.7', '10.0,1.0,26,198.0,99.0,3.0,89.0,38.6', '16.0,0.0,40,162.0,64.0,4.0,88.0,39.1']


In [9]:
import sagemaker
# Region name taken from a freshly created SageMaker session.
region = sagemaker.Session().boto_region_name
print("AWS Region: {}".format(region))

# Execution role used later when constructing the estimator.
# NOTE(review): the printed ARN embeds the AWS account id; consider clearing
# this cell's output before sharing the notebook.
role = sagemaker.get_execution_role()
print("RoleArn: {}".format(role))



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
AWS Region: us-east-1
RoleArn: arn:aws:iam::619071335416:role/service-role/AmazonSageMaker-ExecutionRole-20250504T222946


In [10]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
from sagemaker.session import TrainingInput

In [11]:
# Display the installed SageMaker SDK version.
sagemaker.__version__

'2.243.3'

In [12]:
# Reuse the bucket constant defined alongside the data paths instead of
# repeating the literal, so the two values cannot drift apart.
bucket = BUCKET_NAME
prefix = "models"
# S3 location passed to the estimator as its output_path.
s3_output_location = 's3://{}/{}/{}'.format(bucket, prefix, 'xgboost')

# Look up the image URI of the "xgboost" container, version "1.2-1", for this region.
container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print(container)

683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-1


In [13]:
# Gather the constructor arguments first, then build the estimator from them.
estimator_kwargs = dict(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
    # Rules attached to the training job: the XGBoost report and the profiler report.
    rules=[
        Rule.sagemaker(rule_configs.create_xgboost_report()),
        ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
    ],
)
xgb_model = sagemaker.estimator.Estimator(**estimator_kwargs)


In [14]:
# Hyperparameters for the training job, kept in one mapping so they are easy to tweak.
xgb_hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.7,
    "objective": 'reg:squarederror',
    "num_round": 1000,
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)

In [15]:
from sagemaker.session import TrainingInput

# Point both channels at the header-less CSVs that process_df uploads under
# processed_data/. The URIs are derived from BUCKET_NAME rather than retyped,
# so they stay in sync with the upload location.
processed_data_uri = "s3://{}/processed_data".format(BUCKET_NAME)

train_input = TrainingInput(
    "{}/train_df_no_header.csv".format(processed_data_uri), content_type="csv"
)
validation_input = TrainingInput(
    "{}/valid_df_no_header.csv".format(processed_data_uri), content_type="csv"
)

In [16]:
# Launch the training job with the train and validation channels.
xgb_model.fit(inputs={"train": train_input, "validation": validation_input})

2025-05-17 12:00:54 Starting - Starting the training job...CreateXgboostReport: InProgress
ProfilerReport: InProgress
......
2025-05-17 12:02:16 Starting - Preparing the instances for training...
2025-05-17 12:02:56 Downloading - Downloading input data...
2025-05-17 12:03:17 Downloading - Downloading the training image......
2025-05-17 12:04:28 Training - Training image download completed. Training in progress...[34m[2025-05-17 12:04:38.710 ip-10-2-192-12.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ',