### Training Model with SageMaker

#### 01 - Import processed dataset from RDS and Configure SageMaker

In [1]:
import os
import boto3
import numpy as np
import pandas as pd
from sagemaker.session import Session
from sagemaker.s3 import S3Uploader
from rds import create_rds_engine, get_rds_instance
from env import (
    AWS_PROFILE_NAME,
    RDS_DB_NAME,
    RDS_INSTANCE_NAME,
    RDS_PASSWORD,
    RDS_USER,
    DEFAULT_RDS_DB_TABLE,
    S3_BUCKET_NAME,
    IAM_ROLE_NAME,
)

# Local aliases for the settings imported from the project's env module.
profile_name = AWS_PROFILE_NAME
master_username = RDS_USER
master_password = RDS_PASSWORD
db_name = RDS_DB_NAME
db_instance_identifier = RDS_INSTANCE_NAME
default_table = DEFAULT_RDS_DB_TABLE
bucket = S3_BUCKET_NAME
role_name = IAM_ROLE_NAME

# One boto3 session (bound to the named AWS profile) backs both the SageMaker
# session and every service client below.
boto_session = boto3.Session(profile_name=profile_name)
sagemaker_session = Session(boto_session)

rds_client = boto_session.client("rds")
iam_client = boto_session.client("iam")
s3_client = boto_session.client("s3")

# Key prefixes inside the bucket for model artifacts and datasets.
subfolder_model: str = "models/hotel-reservations/xgboost"
subfolder_dataset: str = "datasets/hotel-reservations"

# Object names of the uploaded train/test files (no file extension).
key_train: str = "hotel-reservations-train-data-xgboost"
key_test: str = "hotel-reservations-test-data-xgboost"

# Full S3 URIs used both as upload targets and as training input channels.
s3_train_data: str = f"s3://{bucket}/{subfolder_dataset}/train/{key_train}"
s3_test_data: str = f"s3://{bucket}/{subfolder_dataset}/test/{key_test}"

# Passed to the estimator as its output_path.
s3_output_location: str = f"s3://{bucket}/{subfolder_model}"

# Project-local helpers from rds.py: presumably resolve the instance endpoint
# and build a SQL engine for it — verify against rds.py.
endpoint = get_rds_instance(rds_client, db_instance_identifier)
engine = create_rds_engine(master_username, master_password, endpoint, db_name)

# The processed table is named "<default_table>-processed"; backticks quote the
# hyphenated identifier. The name comes from project config, not user input.
query = f"SELECT * FROM `{default_table}-processed`"

# Load the whole processed table into memory.
df = pd.read_sql(query, con=engine)

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\richa\AppData\Local\sagemaker\sagemaker\config.yaml
RDS Instance founded


#### 02 - Reorder columns to bring label_avg_price_per_room to first position

In [2]:
label_column = "label_avg_price_per_room"

# Sort columns alphabetically first so the feature order is deterministic.
df = df.sort_index(axis=1)

# Label first, then every other column in its sorted order.
# (Presumably required because the header-less CSV written later is read by
# the XGBoost container with the target in the first column — confirm.)
reorder_columns = [label_column] + [
    column for column in df.columns if column != label_column
]
df = df[reorder_columns]

# Shift the label down by one. NOTE(review): the estimator is configured with
# num_class=3, so this presumably maps classes 1..3 to 0..2 — confirm.
# Re-running this cell without re-running cell 1 subtracts again.
df[label_column] = df[label_column] - 1

#### 03 - Split the dataset into train and test sets

In [3]:
# Positional (unshuffled) split: the first 26000 rows train, the remainder test.
train_row_count = 26000
df_train, df_test = df.iloc[:train_row_count, :], df.iloc[train_row_count:, :]

In [4]:
# Column 0 holds the label; every remaining column is a feature.
test_features = df_test.iloc[:, 1:]
test_labels = df_test.iloc[:, 0]

# Materialise both as float32 NumPy arrays.
X_test = np.array(test_features.values, dtype=np.float32)
y_test = np.array(test_labels.values, dtype=np.float32)

#### 04 - Create the S3 bucket if it does not exist

In [23]:
# Project-local helper from s3.py. Judging by its name it only creates the
# bucket when it is missing — verify against s3.py.
from s3 import create_s3_bucket_if_not_exists

# Ensure the bucket that the dataset and model URIs point at is available.
create_s3_bucket_if_not_exists(s3_client, bucket)

S3 Bucket founded


#### 05 - Save the train and test datasets as CSV files, upload them to the S3 bucket as bytes, and delete the local files afterwards

In [8]:
# Write each split to a local CSV with neither header row nor index column.
for split_frame, csv_name in ((df_train, "train.csv"), (df_test, "test.csv")):
    split_frame.to_csv(csv_name, header=False, index=False)

In [25]:
# Upload each local CSV to its S3 URI as raw bytes.
#
# A failed upload is reported with its file and destination and then re-raised:
# the following cells delete the local files and start a training job that
# reads these S3 objects, so continuing after a failure would train on missing
# or stale data.
for local_csv, destination_uri in (
    ("train.csv", s3_train_data),
    ("test.csv", s3_test_data),
):
    with open(local_csv, "rb") as data:
        try:
            S3Uploader.upload_bytes(
                b=data.read(),
                s3_uri=destination_uri,
                sagemaker_session=sagemaker_session,
            )
        except Exception as err:
            print(f"Upload of {local_csv} to {destination_uri} failed: {err}")
            raise

In [9]:
# Delete the temporary local CSVs now that they have been uploaded.
for csv_name in ("train.csv", "test.csv"):
    os.remove(csv_name)

#### 07 - Create the IAM role, attach S3 and SageMaker full-access policies, and retrieve it

In [27]:
# Project-local helpers from iam.py; their exact behavior is not visible here.
from iam import create_iam_role, attach_iam_role, get_iam_role

# Create the role and attach its policies. The captured output suggests both
# calls tolerate an already-existing role — verify against iam.py.
create_iam_role(iam_client, role_name)
attach_iam_role(iam_client, role_name)

# `role` is passed to the SageMaker Estimator below.
# NOTE(review): presumably the role ARN — confirm what get_iam_role returns.
role = get_iam_role(iam_client, role_name)

IAM Role already exists
IAM Role attached with sagemaker and s3 full access
IAM Role founded


#### 08 - Set SageMaker XGBoost Container

In [28]:
import sagemaker.image_uris

# Look up the XGBoost image URI for the region of the active boto3 session.
xgboost_image_args = {
    "framework": "xgboost",
    "region": boto_session.region_name,
    "version": "1.7-1",
}
container = sagemaker.image_uris.retrieve(**xgboost_image_args)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


#### 09 - Set SageMaker XGBoost Estimator

In [29]:
import sagemaker.estimator
import sagemaker.inputs  # explicit: TrainingInput below must not rely on a transitive import

# Generic estimator wrapping the XGBoost container image resolved earlier.
# Artifacts are written under s3_output_location.
xgboost = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=s3_output_location,
    sagemaker_session=sagemaker_session,
    # Spot training. NOTE(review): max_wait is presumably the total time budget
    # including waiting for spot capacity and must not be below max_run — confirm.
    use_spot_instances=True,
    max_run=3600,
    max_wait=3600,
)

# Multi-class classification over three classes; the label column was shifted
# down by one in step 02 before the CSVs were written.
xgboost.set_hyperparameters(
    num_class=3,
    num_round=200,
    objective="multi:softmax",
    eval_metric="mlogloss",
)

# Both channels read header-less CSV objects under an S3 prefix.
channel_settings = {"content_type": "csv", "s3_data_type": "S3Prefix"}

train_input = sagemaker.inputs.TrainingInput(s3_data=s3_train_data, **channel_settings)
# NOTE(review): the held-out test split doubles as the validation channel.
validation_input = sagemaker.inputs.TrainingInput(s3_data=s3_test_data, **channel_settings)

data_channels = {"train": train_input, "validation": validation_input}

#### 10 - Execute SageMaker XGBoost training

In [30]:
# Launch the training job on the train/validation channels; the job's logs
# stream into this cell's output.
# NOTE(review): Estimator.fit is not documented as returning a value, so
# `output` is presumably None — confirm before relying on it.
output = xgboost.fit(inputs=data_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-07-10-23-29-00-514


2024-07-10 23:27:21 Starting - Starting the training job...
2024-07-10 23:27:36 Starting - Preparing the instances for training...
2024-07-10 23:28:12 Downloading - Downloading input data...
2024-07-10 23:28:58 Downloading - Downloading the training image......
2024-07-10 23:30:04 Training - Training image download completed. Training in progress...[2024-07-10 23:30:08.777 ip-10-2-77-250.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2024-07-10 23:30:08.799 ip-10-2-77-250.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.
[2024-07-10:23:30:09:INFO] Imported framework sagemaker_xgboost_container.training
[2024-07-10:23:30:09:INFO] Failed to parse hyperparameter eval_metric value mlogloss to Json.
Returning the value itself
[2024-07-10:23:30:09:INFO] Failed to parse hyperparameter objective value multi:softmax to Json.
Returning the value itself
[2024-07-10:23:30:09:INFO] No GPUs detected (normal if no gpus installed)
[2024-07-10:23:30:0