# Modeling of Health Policy Data
Final dataset file location: https://usd-team1-ads508.s3.us-east-1.amazonaws.com/nhis_with_regional_metrics.csv


In [23]:
import pandas as pd
import numpy as np

import boto3
import io
import os 
import sagemaker
import tarfile 
import time
import xgboost as xgb
from datetime import datetime
from sagemaker import image_uris
from sagemaker.estimator import Estimator
from sagemaker.model import Model 
from sagemaker.transformer import Transformer 
from sklearn.metrics import mean_squared_error, r2_score

# import warnings
# warnings.filterwarnings('ignore')

## Initialize SageMaker Session

In [24]:
# Create the SageMaker session plus the AWS handles used by later cells
sess = sagemaker.Session()
bucket = "usd-team1-ads508"  # S3 bucket that holds the input data and the model output
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Low-level service clients pinned to the detected region
sm = boto3.Session().client(service_name="sagemaker", region_name=region)
s3 = boto3.Session().client(service_name="s3", region_name=region)
s3_resource = boto3.resource('s3')

# Echo the active configuration so the run is self-describing
for label, value in (
    ("Sagemaker Session", sess),
    ("Bucket", bucket),
    ("Region", region),
):
    print(f"{label}: {value}")

Sagemaker Session: <sagemaker.session.Session object at 0x7f3cf1c49450>
Bucket: usd-team1-ads508
Region: us-east-1


# SECTION 1: Load the Dataset from S3

In [25]:
# Object key of the processed dataset inside `bucket`
key = "processed-health-data-20250331-083045.csv"

# Fetch the object and parse its bytes straight into a DataFrame (no local copy)
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket, Key=key)
raw_bytes = obj['Body'].read()
df = pd.read_csv(io.BytesIO(raw_bytes))

# Preview the first rows
df.head()

Unnamed: 0,empdysmss3_a,agep_a,sex_a,education_level,evercovd_a_binary,shtcvd191_a_binary,hicov_a_binary,avg_uninsured_rate,avg_obesity_rate,avg_flu_vaccination_rate,...,ind_services,ind_transportation,ind_wholesale,region_1,region_2,region_3,region_4,health_risk_score,age_insurance_interaction,split_type
0,2,36,2,1,0,0,0,0.067532,0.315695,0.495664,...,True,False,False,True,False,False,False,-0.092193,0,train
1,2,61,2,1,0,0,0,0.067532,0.315695,0.495664,...,True,False,False,True,False,False,False,-0.092193,0,train
2,0,73,2,1,0,0,0,0.067532,0.315695,0.495664,...,True,False,False,True,False,False,False,-0.092193,0,test
3,0,80,2,1,1,0,0,0.067532,0.315695,0.495664,...,True,False,False,True,False,False,False,-0.092193,0,train
4,2,27,2,1,0,0,0,0.067532,0.315695,0.495664,...,True,False,False,True,False,False,False,-0.092193,0,train


# SECTION 2: Split into Train/Test Sets

In [26]:
# The dataset ships with a precomputed `split_type` column, so no random split is done here
target = 'empdysmss3_a'
split_col = 'split_type'

# Row masks for each partition
train_mask = df[split_col] == 'train'
test_mask = df[split_col] == 'test'

# Features exclude both the target and the split marker
X = df.loc[train_mask].drop(columns=[target, split_col])
y = df.loc[train_mask, target]

X_test = df.loc[test_mask].drop(columns=[target, split_col])
y_test = df.loc[test_mask, target]

# Combine for SageMaker CSV format: target goes in the first column, features follow
train_df = pd.concat([y, X], axis=1)
test_df = pd.concat([y_test, X_test], axis=1)

In [27]:
# Preview the training frame; the target column was placed first by the concat above
train_df.head()

Unnamed: 0,empdysmss3_a,agep_a,sex_a,education_level,evercovd_a_binary,shtcvd191_a_binary,hicov_a_binary,avg_uninsured_rate,avg_obesity_rate,avg_flu_vaccination_rate,...,ind_retail,ind_services,ind_transportation,ind_wholesale,region_1,region_2,region_3,region_4,health_risk_score,age_insurance_interaction
0,2,36,2,1,0,0,0,0.067532,0.315695,0.495664,...,False,True,False,False,True,False,False,False,-0.092193,0
1,2,61,2,1,0,0,0,0.067532,0.315695,0.495664,...,False,True,False,False,True,False,False,False,-0.092193,0
3,0,80,2,1,1,0,0,0.067532,0.315695,0.495664,...,False,True,False,False,True,False,False,False,-0.092193,0
4,2,27,2,1,0,0,0,0.067532,0.315695,0.495664,...,False,True,False,False,True,False,False,False,-0.092193,0
6,0,20,2,2,0,0,0,0.067532,0.315695,0.495664,...,False,True,False,False,True,False,False,False,-0.092193,0


### Save train and test csv files to s3

In [28]:
# Save locally with no header, no index
train_path = "xgb-train.csv"
test_path = "xgb-test.csv" 

train_df.to_csv(train_path, index=False, header=False) 
test_df.to_csv(test_path, index=False, header=False)

# upload to s3
prefix_mod = "modeling"
train_s3_uri = sess.upload_data(train_path, bucket=bucket, key_prefix=f"{prefix_mod}/data")
test_s3_uri = sess.upload_data(test_path, bucket=bucket, key_prefix=f"{prefix_mod}/data")

print(f"Training data uploaded to: {train_s3_uri}") 
print(f"Testing data uploaded to: {test_s3_uri}") 

%store train_s3_uri 
%store test_s3_uri 

Training data uploaded to: s3://usd-team1-ads508/modeling/data/xgb-train.csv
Testing data uploaded to: s3://usd-team1-ads508/modeling/data/xgb-test.csv
Stored 'train_s3_uri' (str)
Stored 'test_s3_uri' (str)


# SECTION 3: Setup and Train XGBoost in SageMaker

## Get Built-in XGBoost Image URI

In [29]:
# Get image URI for XGBoost container 
xgb_image = image_uris.retrieve(
    framework='xgboost',
    region=sess.boto_region_name, 
    version='1.5-1' # latest stable version 
)

print("XGBoost Image URI:", xgb_image) 

%store xgb_image 

XGBoost Image URI: 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1
Stored 'xgb_image' (str)


## Create estimator 

In [30]:
# S3 location where the training job writes its model artifact
output_path = f"s3://{bucket}/{prefix_mod}/output"

# Single-instance estimator backed by the XGBoost container image
estimator_config = dict(
    image_uri=xgb_image,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=output_path,
    sagemaker_session=sess,
)
xgb_estimator = Estimator(**estimator_config)

# Hyperparameters for a squared-error regression objective
xgb_hyperparameters = {
    'objective': 'reg:squarederror',
    'num_round': 100,
    'max_depth': 5,
    'eta': 0.2,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}
xgb_estimator.set_hyperparameters(**xgb_hyperparameters)


## Train the model

In [31]:
# Describe the training channel: headerless CSV uploaded earlier
train_input = sagemaker.inputs.TrainingInput(s3_data=train_s3_uri, content_type='text/csv')

# Launch the training job; only a 'train' channel is supplied (no validation channel)
channels = {'train': train_input}
xgb_estimator.fit(channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-03-31-20-48-12-402


2025-03-31 20:48:14 Starting - Starting the training job...
2025-03-31 20:48:28 Starting - Preparing the instances for training...
2025-03-31 20:48:51 Downloading - Downloading input data...
2025-03-31 20:49:37 Downloading - Downloading the training image......
2025-03-31 20:50:48 Training - Training image download completed. Training in progress.
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-03-31 20:50:32.780 ip-10-2-122-148.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-03-31 20:50:32.802 ip-10-2-122-148.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-03-31:20:50:33:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-03-31:20:50:33:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2025-03-31:20:50:33:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-03-31:20:50:33:INFO] Ru

# SECTION 4: Load and Test Model Locally
An endpoint has not been created yet (`CreateModel` could not be run),  
so the trained model is loaded and tested locally instead.

## Download and Extract Model Artifact

In [32]:
# S3 URI of the model.tar.gz produced by the training job
model_path = xgb_estimator.model_data
print("Model artifact S3 path: ", model_path)

Model artifact S3 path:  s3://usd-team1-ads508/modeling/output/sagemaker-xgboost-2025-03-31-20-48-12-402/output/model.tar.gz


In [33]:
# Split the artifact URI "s3://<bucket>/<key>" into its bucket and key parts
s3 = boto3.client('s3')
parsed = model_path.replace("s3://", "").split("/", 1)
bucket, key = parsed[0], parsed[1]

# Download the .tar.gz artifact next to the notebook
local_tar_path = "xgboost-model.tar.gz"
s3.download_file(bucket, key, local_tar_path)

# Unpack the archive into a local folder.
# NOTE(review): extractall trusts the archive contents; fine for our own training output.
extract_dir = "./xgb_model_local"
with tarfile.open(local_tar_path) as tar:
    tar.extractall(path=extract_dir)

INFO:botocore.httpchecksum:Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


## Load Model

In [34]:
# Load the extracted model file into a Booster in one step
booster = xgb.Booster(model_file="xgb_model_local/xgboost-model")

## Prepare Test Data

In [35]:
# Feature matrix for evaluation: everything in test_df except the target column
X_test = test_df.drop(target, axis=1)

# Create the DMatrix that the Booster consumes for prediction
dtest = xgb.DMatrix(data=X_test)

## Predict and Evaluate

In [36]:
# Score the held-out rows with the locally loaded model
preds = booster.predict(dtest)

# Ground-truth target values for the same rows
y_true = test_df[target].to_numpy()

# Regression metrics: root of the mean squared error, and R^2
mse = mean_squared_error(y_true, preds)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, preds)

# An R^2 near zero means the model does no better than predicting the mean
print(f"RMSE: {rmse:.2f}")
print(f"R2 Score: {r2:.3f}")

RMSE: 38.03
R2 Score: -0.000


# Release Resources