### Split the dataset into train, test and validation. 
- train set: train the model
- validation set: hyperparameter tuning
- test set: test the model performance 

In [65]:
sales_df = pd.read_csv('data/wish_sales_explore.csv')
sales_df.head(3)

Unnamed: 0,price,units_sold,rating,rating_count,badges_count,badge_product_quality,product_variation_inventory,merchant_rating_count,merchant_rating,merchant_has_profile_picture,...,rating_one_count,size_m,size_other,size_s,size_xs,log_units_sold,tag_summer,tag_women's fashion,tag_sexy,tag_tank
0,16.0,100,3.76,54,0,0,50,568,4.128521,0,...,9.0,1,0,0,0,4.60517,1,1,0,0
1,8.0,20000,3.45,6135,0,0,50,17752,3.899673,0,...,1077.0,0,0,0,1,9.903488,1,1,1,0
2,8.0,100,3.57,14,0,0,1,295,3.989831,0,...,3.0,0,0,0,1,4.60517,1,1,1,0


In [75]:
y = sales_df["log_units_sold"]
X = sales_df.drop(["units_sold","log_units_sold"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [76]:
scaler = MinMaxScaler()
num_col =[ 'price', 'rating', 'rating_count', 'badges_count',
       'product_variation_inventory','merchant_rating_count', 'merchant_rating','rating_five_count',
       'rating_four_count', 'rating_three_count', 'rating_two_count',
       'rating_one_count']
scaler.fit(X_train[num_col]) #fit the min_max scalar on the train dataset 

def minmax_on_dataset(scaler, df, num_col):
    num_scale = scaler.transform(df[num_col])

    num_scale_df = pd.DataFrame(num_scale, columns = num_col)
    cat_df = df.drop(num_col, axis=1)

    num_scale_df.reset_index(drop=True,inplace=True)
    cat_df.reset_index(drop=True,inplace=True)

    result_df = pd.concat([num_scale_df, cat_df], axis=1)
    return result_df

X_train_prep = minmax_on_dataset(scaler, X_train, num_col)
X_test_prep = minmax_on_dataset(scaler, X_test, num_col)
X_val_prep = minmax_on_dataset(scaler, X_val, num_col)

In [94]:
#save the splitted datasets to local
data_dir = 'data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    
pd.DataFrame(X_test_prep).to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

y_val.reset_index(drop=True,inplace=True)
y_train.reset_index(drop=True,inplace=True)
X_val_prep.reset_index(drop=True,inplace=True)
X_train_prep.reset_index(drop=True,inplace=True)

pd.concat([y_val, X_val_prep], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)
pd.concat([y_train, X_train_prep], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [69]:
X_train_prep = X_val_prep = y_train = y_val = None

### Upload training/valdiation data files to S3 bucket

In [96]:
session = sagemaker.Session() # Store the current SageMaker session
role = get_execution_role()

prefix = 'wish-xgboost'

test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

### Create a tuned XGBoost model

In [103]:
container = get_image_uri(session.boto_region_name, 'xgboost', repo_version='1.0-1')
xgb = sagemaker.estimator.Estimator(container,
                                   role,
                                   train_instance_count=1,
                                   train_instance_type='ml.m4.xlarge',
                                   output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                   sagemaker_session=session)
xgb.set_hyperparameters(max_depth=5,
                       eta=0.2,
                       gamma=4,
                       min_child_weight=6,
                       subsample=0.8,
                       objective='reg:squarederror',
                       early_stopping_rounds=10,
                       num_round=200)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


### Create and fit the hyperparameter tuner

In [104]:
xgb_hyperparameter_tuner = HyperparameterTuner(estimator=xgb,
                                              objective_metric_name='validation:rmse',
                                              objective_type='Minimize',
                                              max_jobs=10,
                                              max_parallel_jobs=2,
                                              hyperparameter_ranges={
                                                  'max_depth': IntegerParameter(3,12),
                                                  'eta': ContinuousParameter(0.05, 0.5),
                                                  'min_child_weight': IntegerParameter(2,8),
                                                  'subsample': ContinuousParameter(0.5, 0.9),
                                                  'gamma': ContinuousParameter(0,10)
                                              })

In [None]:
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})
xgb_hyperparameter_tuner.wait()

### Retrive the best-performed model and do batch-transform on the test set

In [108]:
xgb_attached = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


2020-09-28 20:41:28 Starting - Preparing the instances for training
2020-09-28 20:41:28 Downloading - Downloading input data
2020-09-28 20:41:28 Training - Training image download completed. Training in progress.
2020-09-28 20:41:28 Uploading - Uploading generated training model
2020-09-28 20:41:28 Completed - Training job completed[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter _tuning_objective_metric value validation:rmse to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter 

In [109]:
xgb_transformer = xgb_attached .transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


.............................[32m2020-09-28T20:54:16.725:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[34m[2020-09-28:20:54:14:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2020-09-28:20:54:14:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2020-09-28:20:54:14:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[35m[2020-09-28:20:54:14:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2020-09-28:20:54:14:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2020-09-28:20:54:14:INFO] nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[35merror_log  /dev/stderr;
[0m
[35mworker_rlimit_nofile 4096;
[0m
[35mevents {
  worker_connections 2048;[0m
[35m}

In [110]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

Completed 5.4 KiB/5.4 KiB (95.6 KiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-2-668015539882/sagemaker-xgboost-200928-2025-009-c5ae0-2020-09-28-20-49-34-958/test.csv.out to data/test.csv.out


In [111]:
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
np.sqrt(metrics.mean_squared_error(y_test, predictions.values))

0.6503359194361048

After the hyperparametering tuning, the XGBoost's accuracy increased

In [112]:
metrics.mean_absolute_error(np.exp(y_test), np.exp(predictions))

1350.4417627509126