# 1. Import Libraries

In [41]:
# General Data Manipulation Libraries
import numpy as np
import pandas as pd

# Model & Helper Libraries
import os
import io
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Plotting Tools
import matplotlib.pyplot as plt

# Sagemaker Unique Libraries
import sagemaker
import boto3
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
from sagemaker.inputs import TrainingInput

# 2. Configure Boto3 Clients and Sessions

In [4]:
region = boto3.Session().region_name
smclient = boto3.Session().client("sagemaker")

role = sagemaker.get_execution_role()

bucket = "sagemaker-santander"

print(f'AWS Region name : {region},\nSession : {smclient},\nRole : {role}')

AWS Region name : us-east-1,
Session : <botocore.client.SageMaker object at 0x7f763e015690>,
Role : arn:aws:iam::982563744386:role/service-role/AmazonSageMaker-ExecutionRole-20210825T133419


# 3. Load Data

In [15]:
# Print Files in Input Training Directory
s3client = boto3.client('s3') # S3 Client
subfolder = 'input-data' # Subfolder
contents = s3client.list_objects(Bucket=bucket, Prefix=subfolder)['Contents']
for f in contents:
    print(f['Key'])

input-data/
input-data/train.csv


In [27]:
input_file = 'input-data/train.csv'
response = s3client.get_object(Bucket=bucket, Key=input_file)
df = pd.read_csv(io.BytesIO(response['Body'].read()))

df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


# 4. Data Preperation

In [29]:
var_colums = [c for c in df.columns if c not in ['ID_code','target']]
X = df.loc[:, var_colums]
y = df.loc[:, 'target']

# We are performing a 80-20 split for Training and Validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((160000, 200), (40000, 200), (160000,), (40000,))

In [35]:
# Construct Train and Validation Dataframes
pd.concat([X_train, y_train], axis=1).to_csv(
    "train.csv", index=False
)
pd.concat([X_valid, y_valid], axis=1).to_csv(
    "validation.csv", index=False
)

In [36]:
# Copy the files to S3
boto3.Session().resource("s3").Bucket(bucket).Object(
    "train.csv"
).upload_file("train.csv")
boto3.Session().resource("s3").Bucket(bucket).Object(
    "validation.csv"
).upload_file("validation.csv")

In [37]:
pd.read_csv('train.csv')

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,target
0,10.4746,2.8536,9.2575,10.3119,8.8062,3.8746,4.4889,16.5655,1.7723,5.5230,...,6.5295,4.3429,7.6501,20.7617,-1.1457,10.3601,8.6280,10.3478,-23.6433,0
1,7.4244,-8.2477,9.8266,7.0070,12.8404,-2.8357,3.8981,17.0497,2.2556,8.7114,...,8.8321,1.7371,2.1110,14.5298,-0.9141,-4.4855,10.8376,16.3837,-14.3434,0
2,10.4971,1.3654,7.3707,10.4127,11.2232,2.6208,6.8985,14.6936,-0.1505,6.0784,...,7.5201,1.0410,-0.2928,20.9086,1.5671,12.9797,8.1749,15.0971,-14.3155,1
3,12.7748,-4.3193,12.1392,7.4885,11.8176,0.1426,6.3390,15.1036,6.5895,7.6974,...,5.9850,3.7650,3.5007,14.0953,-1.0699,8.9619,8.0200,7.9897,11.5895,0
4,9.3564,0.8452,12.8589,6.1526,12.6412,-8.5941,4.3823,15.5058,-6.8438,6.2642,...,10.0608,2.1277,0.6450,11.0313,-0.9648,-3.0415,9.0056,19.0639,-18.8837,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,9.5052,-6.6869,7.9066,6.3231,8.7790,6.0230,6.3153,16.5278,1.8784,6.1573,...,10.4833,0.6736,-3.1411,19.0313,-1.5687,-2.3488,9.5394,11.9308,-9.7107,0
159996,11.7732,1.6857,6.8726,7.7299,10.1775,-1.2316,6.4794,12.6513,-0.5254,5.1967,...,8.7401,-0.7065,1.1199,21.3090,-0.8770,8.0038,9.5741,19.0722,6.9097,0
159997,9.3690,-9.2239,13.2679,3.9702,10.7466,-7.5646,4.7061,22.1956,-0.9237,6.4399,...,3.2124,2.2692,-1.0356,13.7184,-1.6324,1.7869,10.6369,15.1946,6.2116,0
159998,15.0810,0.4257,17.0163,6.7387,11.5491,7.9170,7.3187,11.8116,0.3999,9.2455,...,7.9678,-0.5349,3.1303,16.6384,0.1540,7.1786,8.3089,20.4703,-0.1050,0


# 5. Model Setup

In [46]:
sess = sagemaker.Session()

container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, "latest")


xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path="s3://{}/output".format(bucket),
    sagemaker_session=sess,
)

xgb.set_hyperparameters(
    eval_metric="auc",
    objective="binary:logistic",
    num_round=100,
    rate_drop=0.3,
    tweedie_variance_power=1.4,
)

## 5.1 Hyper-paramter Tuning

In [47]:
hyperparameter_ranges = {
    "max_depth": IntegerParameter(1, 4),
}

objective_metric_name = "validation:auc"

tuner = HyperparameterTuner(
    xgb, objective_metric_name, hyperparameter_ranges, max_jobs=2, max_parallel_jobs=3
)

# 6. Model Training

In [49]:
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data="s3://{}/train".format(bucket),
    content_type="csv",
    input_mode="Pipe"
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data="s3://{}/validation".format(bucket), 
    content_type="csv",
    input_mode="Pipe"
)

tuner.fit({"train": s3_input_train, "validation": s3_input_validation}, include_cls_metadata=False)

............................

KeyboardInterrupt: 

# 7. Examine Results

In [None]:
results = sagemaker.analytics.HyperparameterTuningJobAnalytics(tuning_job_name)
results_df = results.dataframe()
results_df

In [None]:
best_training_job_summary = results.description()["BestTrainingJob"]
best_training_job_name = best_training_job_summary["TrainingJobName"]

%store best_training_job_name