# imports 

In [1]:
from pathlib import Path
import xgboost as xgb
import pandas as pd
import numpy as np
import sagemaker
import botocore
import joblib
import boto3
import yaml
import json
import sys
import os
from time import gmtime, strftime, sleep
from sagemaker.experiments.run import Run, load_run

sagemaker.__version__

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


'2.219.0'

In [2]:
# import config
with open(os.path.abspath(os.path.join(os.pardir,"config.yml")),"r") as f:
    config = yaml.load(f,Loader=yaml.FullLoader)
print(config)

{'features': ['dteday', 'season', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'yr', 'mnth'], 'holiday_mappings': {'No': 1, 'Yes': 0}, 'hr_mappings': {'10am': 9, '10pm': 8, '11am': 11, '11pm': 7, '12am': 5, '12pm': 17, '1am': 4, '1pm': 16, '2am': 3, '2pm': 15, '3am': 1, '3pm': 18, '4am': 0, '4pm': 19, '5am': 2, '5pm': 23, '6am': 6, '6pm': 22, '7am': 12, '7pm': 20, '8am': 21, '8pm': 14, '9am': 13, '9pm': 10}, 'mnth_mappings': {'April': 5, 'August': 11, 'December': 2, 'February': 1, 'January': 0, 'July': 10, 'June': 9, 'March': 3, 'May': 7, 'November': 4, 'October': 6, 'September': 8}, 's3-bucket': {'bucket_name': 'sagemaker-us-east-1-644383320443', 'bucket_prefix': 'siemens-poc/', 'training_file_key': 'siemens-poc/bike-sharing-dataset.csv'}, 'sagemaker': {'domain_id': 'd-zjfao8azi0ng', 'region': 'us-east-1', 'role': 'arn:aws:iam::644383320443:role/service-role/AmazonSageMaker-ExecutionRole-20240109T220483'}, 'season

In [3]:
config['sagemaker']['domain_id']

'd-zjfao8azi0ng'

In [9]:
boto_session = boto3.Session()
region = boto_session.region_name
print(f"Region--{region}")
sm_role = sagemaker.get_execution_role()
print(f"Sagemaker Role--{sm_role}")
sagemaker_session = sagemaker.Session()
default_bucket = sagemaker_session.default_bucket()
print(f"Default bucket--{default_bucket}")

Region--eu-north-1
Sagemaker Role--arn:aws:iam::058264393695:role/service-role/AmazonSageMaker-ExecutionRole-20240621T001438
Default bucket--sagemaker-eu-north-1-058264393695


# get data

In [10]:
import io
s3_client = boto3.client("s3")
# file_key  = config['s3-bucket']['training_file_key']
file_key  = 'bike-sharing-dataset.csv'
print(file_key)
# response = s3_client.get_object(Bucket=config['s3-bucket']['bucket_name'], Key=file_key)
response = s3_client.get_object(Bucket=default_bucket, Key=file_key)

file_content = response['Body'].read()
df = pd.read_csv(io.BytesIO(file_content), encoding='utf-8')
df.head(3)

bike-sharing-dataset.csv


Unnamed: 0,dteday,season,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,2012-11-05,winter,6am,No,Mon,Yes,Mist,6.1,3.0014,49.0,19.0012,4,135,139
1,2011-07-13,fall,4am,No,Wed,Yes,Clear,26.78,28.9988,58.0,16.9979,0,5,5
2,2012-02-09,spring,11am,No,Thu,Yes,Clear,3.28,-0.9982,52.0,15.0013,4,95,99


In [12]:
import sys
root_dir = Path('../scripts').resolve().parents[0] 
print(root_dir)
sys.path.append(str(root_dir))


/home/sagemaker-user/bits-webminar-june-24/ml


# feature engineering

In [13]:
import scripts.features as features
processed_data = features.pre_process(df)


{'features': ['dteday', 'season', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'yr', 'mnth'], 'holiday_mappings': {'No': 1, 'Yes': 0}, 'hr_mappings': {'10am': 9, '10pm': 8, '11am': 11, '11pm': 7, '12am': 5, '12pm': 17, '1am': 4, '1pm': 16, '2am': 3, '2pm': 15, '3am': 1, '3pm': 18, '4am': 0, '4pm': 19, '5am': 2, '5pm': 23, '6am': 6, '6pm': 22, '7am': 12, '7pm': 20, '8am': 21, '8pm': 14, '9am': 13, '9pm': 10}, 'mnth_mappings': {'April': 5, 'August': 11, 'December': 2, 'February': 1, 'January': 0, 'July': 10, 'June': 9, 'March': 3, 'May': 7, 'November': 4, 'October': 6, 'September': 8}, 's3-bucket': {'bucket_name': 'sagemaker-us-east-1-644383320443', 'bucket_prefix': 'siemens-poc/', 'training_file_key': 'siemens-poc/bike-sharing-dataset.csv'}, 'sagemaker': {'domain_id': 'd-zjfao8azi0ng', 'region': 'us-east-1', 'role': 'arn:aws:iam::644383320443:role/service-role/AmazonSageMaker-ExecutionRole-20240109T220483'}, 'season

In [14]:
processed_data

Unnamed: 0,season,hr,holiday,workingday,weathersit,temp,atemp,hum,windspeed,cnt,yr,mnth,weekday_Fri,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thu,weekday_Tue,weekday_Wed
0,1,6,1,1,2,6.10,3.0014,49.0,19.0012,139,1,4,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,3,0,1,1,3,26.78,28.9988,58.0,16.9979,5,0,10,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,11,1,1,3,3.28,-0.9982,52.0,15.0013,99,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2,12,1,1,2,14.56,15.0002,100.0,6.0032,361,1,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,17,1,1,3,16.44,17.0000,52.0,8.9981,203,0,4,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,2,16,1,1,3,28.66,30.0020,46.0,16.9979,190,0,7,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17375,2,10,1,1,3,16.44,17.0000,55.0,7.0015,267,1,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17376,0,7,1,1,3,7.04,7.0010,76.0,0.0000,99,1,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17377,0,4,1,0,2,-6.12,-16.0000,41.0,26.0027,12,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# create a sagemaker experiment 

In [15]:
from datetime import datetime, timedelta
def utc_to_ist(utc_dt):
    return utc_dt + timedelta(hours=5, minutes=30)

def get_current_ist():
    current_utc_time = datetime.utcnow()
    current_ist_time = utc_to_ist(current_utc_time)
    return current_ist_time

In [16]:
current_ist_time = get_current_ist()
experiment_name = current_ist_time.strftime("bits-webminar-poc-%d-%m-%Y-%H-%M-%S")
experiment_name

'bits-webminar-poc-21-06-2024-12-24-33'

# split data --once

In [17]:
# Shuffle and splitting dataset
train_data, validation_data, test_data = np.split(
    processed_data.sample(frac=1, random_state=1729),
    [int(0.7 * len(processed_data)), int(0.9 * len(processed_data))],
)

print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")

Data split > train:(12165, 19) | validation:(3476, 19) | test:(1738, 19)


  return bound(*args, **kwds)


In [63]:
def upload_file_to_s3(df,file_key):
    # Convert DF to csv buffer
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer)
    # S3 bucket details
    # BUCKET_NAME = config['s3-bucket']['bucket_name']
    BUCKET_NAME  = default_bucket
    # PREFIX = config['s3-bucket']['bucket_prefix'] 
    PREFIX = 'bits-webminar-june-24/'
    KEY = f"{PREFIX}{file_key}"
    try:
        s3_client.put_object(
            Body=csv_buffer.getvalue(), 
            Bucket = BUCKET_NAME, 
            Key = KEY,
        )
        print("Upload Successful")
    except FileNotFoundError:  
        print("The file was not found")
    except botocore.exceptions.ClientError as error:
        if error.response['Error']['Code'] == "404":
            print("The object does not exist")
        else:
            raise

In [64]:
upload_file_to_s3(train_data,'train_data.csv')
upload_file_to_s3(validation_data,'validation_data.csv')
upload_file_to_s3(test_data,'test_data.csv')

Upload Successful
Upload Successful
Upload Successful


# get data from s3 

In [65]:
def get_data_from_s3_as_df(file_key):
    BUCKET_NAME  = default_bucket
    # BUCKET_NAME = config['s3-bucket']['bucket_name']
    # PREFIX = config['s3-bucket']['bucket_prefix']
    PREFIX = 'bits-webminar-june-24/'
    KEY = f'{PREFIX}{file_key}'
    print(f"downloadig..{KEY}")
    response = s3_client.get_object(Bucket=BUCKET_NAME, Key=KEY)
    file_content = response['Body'].read()
    df = pd.read_csv(io.BytesIO(file_content), encoding='utf-8')
    return df 

In [66]:
train_data = get_data_from_s3_as_df('train_data.csv')
test_data  = get_data_from_s3_as_df('test_data.csv')
validation_data = get_data_from_s3_as_df('validation_data.csv')

downloadig..bits-webminar-june-24/train_data.csv
downloadig..bits-webminar-june-24/test_data.csv
downloadig..bits-webminar-june-24/validation_data.csv


In [67]:
train_data.drop(['Unnamed: 0'],axis=1 , inplace=True)
test_data.drop(['Unnamed: 0'],axis=1 , inplace=True)
validation_data.drop(['Unnamed: 0'],axis=1 , inplace=True)

# model training and validation

In [68]:
target_col = config['target']
target_col

'cnt'

In [69]:
train_features = train_data.drop(target_col, axis=1)
train_label = pd.DataFrame(train_data[target_col])

In [70]:
dtrain = xgb.DMatrix(train_features, label=train_label)

In [71]:
# Set hyperparameters for regression
hyperparams = {
    "max_depth": 5,
    "eta": 0.5,
    "alpha": 2.5,
    "objective": "reg:squarederror",  # objective regression
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 3
}

num_boost_round = 150
nfold = 3
early_stopping_rounds = 10

In [52]:
# Cross-validate on training data
cv_results = xgb.cv(
    params=hyperparams,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    nfold=nfold,
    early_stopping_rounds=early_stopping_rounds,
    metrics=["rmse"],  #  regression metrics RMSE
    seed=10,
)

In [53]:
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,154.344041,1.176970,154.668489,3.799928
1,109.072339,5.441027,109.575311,1.596657
2,93.738298,6.816883,95.113125,3.337431
3,83.455776,11.663214,84.847708,9.981915
4,68.188056,8.926365,70.113222,7.187370
...,...,...,...,...
86,31.068865,1.758535,49.006540,1.182318
87,30.953048,1.759450,49.011902,1.231001
88,30.827803,1.749460,49.002275,1.225369
89,30.735567,1.746445,49.023262,1.177938


In [54]:
# metrics data
metrics_data = {
    "regression_metrics": {
        "validation:rmse": {
            "value": cv_results.iloc[-1]["test-rmse-mean"],
            "standard_deviation": cv_results.iloc[-1]["test-rmse-std"]
        },
        "train:rmse": {
"value": cv_results.iloc[-1]["train-rmse-mean"],
"standard_deviation": cv_results.iloc[-1]["train-rmse-std"]
        },
    }
}

In [55]:
print(f"Cross-validated train-rmse:{cv_results.iloc[-1]['train-rmse-mean']:.2f}")
print(f"Cross-validated validation-rmse:{cv_results.iloc[-1]['test-rmse-mean']:.2f}")

Cross-validated train-rmse:30.64
Cross-validated validation-rmse:48.98


# training model on whole dataset

In [56]:
test_features = test_data.drop(target_col, axis=1)
test_label = pd.DataFrame(test_data[target_col])
dtest = xgb.DMatrix(test_features, label=test_label)

# create a Run

In [57]:
current_ist_time = get_current_ist()
current_ist_time

datetime.datetime(2024, 6, 21, 12, 32, 0, 522798)

In [58]:
run_suffix = current_ist_time.strftime("%d-%m-%Y-%H-%M-%S")
run_suffix

'21-06-2024-12-32-00'

In [59]:
sagemaker_session = sagemaker.Session()
sm_client = sagemaker_session.sagemaker_client

In [73]:
# get file paths from s3 to log in the run 

objects = s3_client.list_objects_v2(Bucket=default_bucket)
for obj in objects.get('Contents', []):
    # print(obj)
    file_key = obj['Key']
    # print(file_key)
    if file_key == "bits-webminar-june-24/":
        continue
    else:
        s3_file_path = f"s3://{default_bucket}/{file_key}"
        print(s3_file_path)


s3://sagemaker-eu-north-1-058264393695/bike-sharing-dataset.csv
s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/test_data.csv
s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/train_data.csv
s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/validation_data.csv


In [74]:

with Run(experiment_name=experiment_name,
         run_name=f"feature-engineering-{run_suffix}",
         run_display_name="feature-engineering",
         sagemaker_session=sagemaker_session) as run:
    run.log_parameters(
        {
            "train": 0.7,
            "validate": 0.2,
            "test": 0.1
        }
    )
    # Log input dataset metadata and output
    # Log files from S3
    run.log_artifact(name='bike-share-dataset',value='s3://sagemaker-eu-north-1-058264393695/bike-sharing-dataset.csv', media_type="text/csv")
    run.log_artifact(name= 'train-csv',value='s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/train_data.csv', media_type="text/csv")
    run.log_artifact(name = 'test-csv',value='s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/test_data.csv', media_type="text/csv")
    run.log_artifact(name = 'validation-csv',value='s3://sagemaker-eu-north-1-058264393695/bits-webminar-june-24/validation_data.csv', media_type="text/plain")


INFO:sagemaker.experiments.run:The run (feature-engineering-21-06-2024-12-32-00) under experiment (bits-webminar-poc-21-06-2024-12-24-33) already exists. Loading it.


In [75]:
from sklearn.metrics import mean_squared_error

print(f"run_suffix--{run_suffix}")

# Train the model for different max_depth values
for i, d in enumerate([2, 5, 10, 15, 20]):
    hyperparams["max_depth"] = d
    
    print(f"Fit estimator with max_depth={d}")
    run_name = f"training-{i}-{run_suffix}"
    
    with Run(experiment_name=experiment_name,
             run_name=run_name,
             run_display_name=f"max-depth-{d}",
             sagemaker_session=sagemaker_session) as run:
        # Train the model
        model = (
            xgb.train(
                params=hyperparams, 
                dtrain=dtrain, 
                evals = [(dtrain,'train'), (dtest,'eval')], 
                num_boost_round=num_boost_round, 
                early_stopping_rounds=early_stopping_rounds, 
                verbose_eval = 0
            )
        )
        # Calculate regression metrics 
        test_rmse = np.sqrt(mean_squared_error(test_label, model.predict(dtest)))
        train_rmse = np.sqrt(mean_squared_error(train_label, model.predict(dtrain)))
        
        # Log metrics to the run
        run.log_parameters(hyperparams)
        run.log_metric(name="test_rmse", value = test_rmse, step=d)
        run.log_metric(name="train_rmse", value = train_rmse, step=d)

        # time.sleep(8) # wait until resource tags are propagated to the run
        print(f"Test RMSE: {test_rmse:.4f} | Train RMSE: {train_rmse:.4f}")
        

run_suffix--21-06-2024-12-32-00
Fit estimator with max_depth=2
Test RMSE: 69.1830 | Train RMSE: 65.7933
Fit estimator with max_depth=5
Test RMSE: 44.9638 | Train RMSE: 28.4505
Fit estimator with max_depth=10
Test RMSE: 46.7829 | Train RMSE: 17.6668
Fit estimator with max_depth=15
Test RMSE: 47.7271 | Train RMSE: 4.2208
Fit estimator with max_depth=20
Test RMSE: 48.3507 | Train RMSE: 3.8493
