# Model training
This notebook contains the model training and hyperparameter tuning of the LightGBM model using Sagemaker

In [54]:
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.parameter import ContinuousParameter, CategoricalParameter, IntegerParameter
from sagemaker.tuner import HyperparameterTuner
from sagemaker import image_uris, model_uris, script_uris
import time
import boto3


In [30]:
role = sagemaker.get_execution_role() 
session = sagemaker.Session() 
region = session.boto_region_name
bucket = session.default_bucket()

column_names = ['offer_completed_after_view', 'age', 'income', 'membership_days', 'gender_F', 'gender_M', 'gender_O', 'reward', 'difficulty', 'duration', 'email', 'mobile', 'social', 'web', 'offer_bogo', 'offer_discount', 'offer_informational']
train_location, val_location, test_location = ('s3://sagemaker-us-east-1-256735873794/data/train.csv', 's3://sagemaker-us-east-1-256735873794/data/validation.csv', 's3://sagemaker-us-east-1-256735873794/data/test.csv')
train_model_id, train_model_version, train_scope = "lightgbm-classification-model", "*", "training"
training_instance_type = "ml.m5.xlarge"

prefix = 'sagemaker-project'
s3_output_location = f's3://{bucket}/{prefix}/output'

In [31]:
# Retrieve the docker image
train_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    model_id=train_model_id,
    model_version=train_model_version,
    image_scope=train_scope,
    instance_type=training_instance_type,
)
# Retrieve the training script
train_source_uri = script_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, script_scope=train_scope
)
# Retrieve the pre-trained model tarball to further fine-tune
train_model_uri = model_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, model_scope=train_scope
)

In [48]:
# Retrieve the default hyper-parameters for fine-tuning the model
hyperparameters = sagemaker.hyperparameters.retrieve_default(
    model_id=train_model_id, model_version=train_model_version
)
hyperparameters["metric"] = "average_precision"
hyperparameters

{'num_boost_round': '5000',
 'early_stopping_rounds': '30',
 'metric': 'average_precision',
 'learning_rate': '0.009',
 'num_leaves': '67',
 'feature_fraction': '0.74',
 'bagging_fraction': '0.53',
 'bagging_freq': '5',
 'max_depth': '11',
 'min_data_in_leaf': '26',
 'max_delta_step': '0.0',
 'lambda_l1': '0.0',
 'lambda_l2': '0.0',
 'boosting': 'gbdt',
 'min_gain_to_split': '0.0',
 'scale_pos_weight': '1.0',
 'tree_learner': 'serial',
 'feature_fraction_bynode': '1.0',
 'is_unbalance': 'False',
 'max_bin': '255',
 'num_threads': '0',
 'verbosity': '1',
 'use_dask': 'False'}

In [69]:
hyperparameter_ranges_lgb = {
    'learning_rate': ContinuousParameter(0.01, 0.2), # Learning rate (Step size shrinkage for updates)
    "num_leaves": IntegerParameter(2, 50), # Maximum number of leaves in a tree
    'feature_fraction': ContinuousParameter(0.5, 1), # Fraction of features used (column sample by tree)
    'bagging_fraction': ContinuousParameter(0.5, 1), # Fraction of data used for bagging (subsample)
    "bagging_freq": IntegerParameter(1, 10),
    "max_depth": IntegerParameter(1, 10), # Maximum tree depth, -1 for no constraints
    "min_data_in_leaf": IntegerParameter(1, 30), # Minimum number of data points in a leaf
    'extra_trees': CategoricalParameter([True, False]) # If True when evaluating node splits LightGBM will check only one randomly-chosen threshold for each feature
}

In [70]:
# Create SageMaker Estimator instance
tabular_estimator = Estimator(
    role=role,
    image_uri=train_image_uri,
    source_dir=train_source_uri,
    model_uri=train_model_uri,
    entry_point="transfer_learning.py", # Default one available in image uri
    instance_count=1,
    volume_size=30,
    instance_type=training_instance_type,
    max_run=360000,
    hyperparameters=hyperparameters,
    output_path=s3_output_location,
)

In [None]:
# Set up the hyperparameter tuner
tuner = HyperparameterTuner(tabular_estimator,
        objective_metric_name='average_precision', # See https://docs.aws.amazon.com/sagemaker/latest/dg/lightgbm-tuning.html
        hyperparameter_ranges=hyperparameter_ranges_lgb,
        metric_definitions=[{"Name": "average_precision", "Regex": "average_precision: ([0-9\\.]+)"}],
        strategy='Bayesian', # Bayesian optimization
        objective_type="Maximize",
        max_jobs=20,
        max_parallel_jobs=3
)

# Start the hyperparameter tuning job
training_job_name = prefix + str(int(time.time()))
tuner.fit({'train': train_location, 'validation': val_location}, logs=True, job_name=training_job_name)
tuner.wait()

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


......................................................................................................

In [76]:
tuner.best_estimator(), tuner.best_estimator().hyperparameters()


2023-06-10 22:35:14 Starting - Found matching resource for reuse
2023-06-10 22:35:14 Downloading - Downloading input data
2023-06-10 22:35:14 Training - Training image download completed. Training in progress.
2023-06-10 22:35:14 Uploading - Uploading generated training model
2023-06-10 22:35:14 Completed - Resource retained for reuse

2023-06-10 22:35:14 Starting - Found matching resource for reuse
2023-06-10 22:35:14 Downloading - Downloading input data
2023-06-10 22:35:14 Training - Training image download completed. Training in progress.
2023-06-10 22:35:14 Uploading - Uploading generated training model
2023-06-10 22:35:14 Completed - Resource retained for reuse


(<sagemaker.estimator.Estimator at 0x7f060bdd5590>,
 {'_tuning_objective_metric': 'average_precision',
  'bagging_fraction': '1.0',
  'bagging_freq': '7',
  'boosting': '"gbdt"',
  'early_stopping_rounds': '"30"',
  'extra_trees': 'True',
  'feature_fraction': '0.9968771503188556',
  'feature_fraction_bynode': '"1.0"',
  'is_unbalance': '"False"',
  'lambda_l1': '"0.0"',
  'lambda_l2': '"0.0"',
  'learning_rate': '0.10899245357673483',
  'max_bin': '"255"',
  'max_delta_step': '"0.0"',
  'max_depth': '7',
  'metric': '"average_precision"',
  'min_data_in_leaf': '2',
  'min_gain_to_split': '"0.0"',
  'num_boost_round': '"5000"',
  'num_leaves': '50',
  'num_threads': '"0"',
  'sagemaker_container_log_level': '20',
  'sagemaker_job_name': '"sagemaker-project1686435990"',
  'sagemaker_program': '"transfer_learning.py"',
  'sagemaker_region': '"us-east-1"',
  'sagemaker_submit_directory': '"s3://jumpstart-cache-prod-us-east-1/source-directory-tarballs/lightgbm/transfer_learning/classificat

In [79]:
import boto3
import pandas as pd

# Function to fetch tuner results as a DataFrame
def tuner_results_to_dataframe(tuner):
    sagemaker_client = boto3.client('sagemaker', region_name=region)
    all_jobs_response = sagemaker_client.list_training_jobs_for_hyper_parameter_tuning_job(
        HyperParameterTuningJobName=tuner.latest_tuning_job.name
    )
    
    all_jobs = all_jobs_response['TrainingJobSummaries']
    
    job_details = []
    for job in all_jobs:
        job_details_response = sagemaker_client.describe_training_job(TrainingJobName=job['TrainingJobName'])
        job_details.append(job_details_response)
    
    df = pd.DataFrame(job_details)
    return df

# Fetch tuner results
tuner_df = tuner_results_to_dataframe(tuner)

# Get best training job details
best_job_name = tuner.best_training_job()
best_job = tuner_df.loc[tuner_df['TrainingJobName'] == best_job_name]

# Get the final metric value
best_job_objective_value = best_job.iloc[0]['FinalMetricDataList'][0]['Value']
best_job_hyperparameters = best_job.iloc[0]['HyperParameters']

print("Best training job:", best_job_name)
print("\nBest hyperparameters:\n", best_job_hyperparameters)
print("\nBest objective value:", best_job_objective_value)

# Expand hyperparameters into columns and keep relevant columns
hyperparameters_expanded = tuner_df['HyperParameters'].apply(pd.Series)
tuner_df_final = pd.concat([tuner_df['TrainingJobName'], hyperparameters_expanded, tuner_df['FinalMetricDataList']], axis=1)

# Extract the objective value from FinalMetricDataList
tuner_df_final['ObjectiveValue'] = tuner_df_final['FinalMetricDataList'].apply(lambda x: x[0]['Value'])

# Calculate time of training and rank by the objective value
tuner_df_final['TrainingTime'] = tuner_df['TrainingEndTime'] - tuner_df['TrainingStartTime']
tuner_df_final = tuner_df_final.sort_values("ObjectiveValue", ascending=False)
tuner_df_final.insert(len(tuner_df_final.columns), 'Rank', range(1, 1+len(tuner_df_final)))

print("\nAll training jobs with expanded hyperparameters, metric value, time, and ranking:\n")
relevant_columns = ['TrainingJobName', 'Rank', 'ObjectiveValue', 'TrainingTime'] + list(hyperparameter_ranges_lgb.keys())

tuner_df_final[relevant_columns].rename(columns={'ObjectiveValue': 'average_precision'})

Best training job: sagemaker-project1686435990-020-6351a9f5

Best hyperparameters:
 {'_tuning_objective_metric': 'average_precision', 'bagging_fraction': '1.0', 'bagging_freq': '7', 'boosting': '"gbdt"', 'early_stopping_rounds': '"30"', 'extra_trees': 'True', 'feature_fraction': '0.9968771503188556', 'feature_fraction_bynode': '"1.0"', 'is_unbalance': '"False"', 'lambda_l1': '"0.0"', 'lambda_l2': '"0.0"', 'learning_rate': '0.10899245357673483', 'max_bin': '"255"', 'max_delta_step': '"0.0"', 'max_depth': '7', 'metric': '"average_precision"', 'min_data_in_leaf': '2', 'min_gain_to_split': '"0.0"', 'num_boost_round': '"5000"', 'num_leaves': '50', 'num_threads': '"0"', 'sagemaker_container_log_level': '20', 'sagemaker_job_name': '"sagemaker-project1686435990"', 'sagemaker_program': '"transfer_learning.py"', 'sagemaker_region': '"us-east-1"', 'sagemaker_submit_directory': '"s3://jumpstart-cache-prod-us-east-1/source-directory-tarballs/lightgbm/transfer_learning/classification/v2.1.1/sourcedi

Unnamed: 0,TrainingJobName,Rank,average_precision,TrainingTime,learning_rate,num_leaves,feature_fraction,bagging_fraction,bagging_freq,max_depth,min_data_in_leaf,extra_trees
0,sagemaker-project1686435990-020-6351a9f5,1,0.666689,0 days 00:00:31.483000,0.1089924535767348,50,0.9968771503188556,1.0,7,7,2,True
5,sagemaker-project1686435990-015-473bb1eb,2,0.659316,0 days 00:01:21.710000,0.1237928834167027,50,0.5,0.507864004191913,7,9,27,True
7,sagemaker-project1686435990-013-64d362ca,3,0.657683,0 days 00:00:31.400000,0.0394238962528188,48,0.9979159734185477,0.9112738109037756,7,6,7,True
6,sagemaker-project1686435990-014-fc6a3618,4,0.657265,0 days 00:01:05.689000,0.0332669239458591,37,0.9978344046226648,0.9845435762796856,7,6,26,True
8,sagemaker-project1686435990-012-80bfeff5,5,0.651613,0 days 00:00:31.570000,0.0195315924877471,24,0.9966036393692054,0.845816976727964,7,9,26,True
3,sagemaker-project1686435990-017-f940ce51,6,0.649897,0 days 00:00:31.526000,0.0854556008405103,47,0.959993134375424,0.8422439782693089,6,4,6,True
2,sagemaker-project1686435990-018-d34cbdbe,7,0.640654,0 days 00:00:31.496000,0.086310790880164,18,0.8882788620221755,0.5322933996959476,7,7,30,True
9,sagemaker-project1686435990-011-d9d3d2b1,8,0.638438,0 days 00:00:31.409000,0.0122539072324035,22,0.8073935479228607,0.5,2,5,4,True
1,sagemaker-project1686435990-019-103cfcca,9,0.631317,0 days 00:00:31.612000,0.050208816505191,11,0.9728568813701572,0.5042905398962911,7,7,15,True
4,sagemaker-project1686435990-016-14f6f8cf,10,0.609758,0 days 00:00:31.507000,0.0515752902183996,50,0.9273525495365575,0.9553497745604602,6,1,28,True


Note: The depth and number of leaves of the first configuration looks reasonable in comparison with the others, and the average precision is significantly higher.

In [83]:
best_job['OutputDataConfig'].iloc[0]

{'KmsKeyId': '',
 'S3OutputPath': 's3://sagemaker-us-east-1-256735873794/sagemaker-project/output'}

In [None]:
# Best job model is found here
# s3://sagemaker-us-east-1-256735873794/sagemaker-project/output/sagemaker-project1686435990-020-6351a9f5/output/model.tar.gz

In [None]:
# # Deploy the best model from the hyperparameter tuning job
# best_model = tuner.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')