In [2]:
import warnings, requests, zipfile, io
warnings.simplefilter('ignore')
import pandas as pd
from scipy.io import arff

import os
import boto3
import sagemaker
from sagemaker.image_uris import retrieve
from sklearn.model_selection import train_test_split

In [3]:
# Resolve the execution role, a SageMaker session and the AWS region once, up front.
# NOTE(review): in the cells visible here only `sess` is read again (to get the
# default bucket); the image-lookup and estimator cells call
# boto3.Session()/get_execution_role()/sagemaker.Session() afresh instead of
# reusing `region`, `role` and `sess`.
role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region = boto3.Session().region_name

In [4]:
# Bucket used by the upload, training-output and training-input cells:
# the session's default bucket.
data_bucket = sess.default_bucket()
# NOTE(review): data_prefix is not referenced by any cell visible here; the
# upload and training cells build their S3 paths from `prefix` instead.
data_prefix = "notebook/taxi/data-csv"
# Disabled alternative output location, left commented out.
#output_bucket = data_bucket
#output_prefix = "sagemaker/duration-regression"


In [5]:
# Load the dataset from train.csv (relative path, so it is resolved against
# the kernel's current working directory).
df = pd.read_csv("train.csv")

In [6]:
# Snapshot the column labels as a plain list so the next cell can pick and
# reorder them by position.
cols = df.columns.tolist()

In [7]:
# Select and reorder columns by position: column 17 goes first, followed by
# column 1, columns 4 through 16, and everything from column 18 onward.
# Columns 0, 2 and 3 are dropped.
# NOTE(review): column 17 is presumably the regression target, moved to the
# front because the CSVs are uploaded without a header row -- confirm.
# NOTE: this cell rewrites `cols` and `df` from their own previous values, so
# running it a second time reorders the already-reordered frame.
cols = [cols[17], cols[1], *cols[4:17], *cols[18:]]
df = df.loc[:, cols]

In [8]:
# 80/10/10 split: hold out 20% of the rows, then cut that hold-out in half
# to obtain the test and validation sets. The fixed random_state makes both
# splits repeatable.
train, test_and_validate = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
test, validate = train_test_split(
    test_and_validate,
    test_size=0.5,
    random_state=42,
)


In [9]:
# Top-level S3 key prefix: the upload, output-location and training-channel
# cells all build their paths underneath it.
prefix = 'NYC'

# Object names for the three dataset splits written to S3.
train_file='nyc_train.csv'
test_file='nyc_test.csv'
validate_file='nyc_validate.csv'

In [10]:
s3_resource = boto3.Session().resource('s3')
def upload_s3_csv(filename, folder, dataframe):
    """Serialize ``dataframe`` to CSV in memory and upload it to S3.

    The object is written to ``<prefix>/<folder>/<filename>`` in the
    module-level ``data_bucket`` through the module-level ``s3_resource``.

    Args:
        filename: Object name (last component of the S3 key).
        folder: Key component placed between ``prefix`` and ``filename``.
        dataframe: pandas DataFrame to upload; it is written without a
            header row and without the index column.

    Returns:
        None.
    """
    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, header=False, index=False)
    # S3 keys are always '/'-delimited. os.path.join would use the host OS
    # separator (a backslash on Windows) and produce a different key there.
    object_key = "/".join((prefix, folder, filename))
    s3_resource.Bucket(data_bucket).Object(object_key).put(Body=csv_buffer.getvalue())

In [11]:
# Push each split to its own folder under the S3 prefix, in the order
# train -> test -> validate.
for split_file, split_folder, split_frame in (
    (train_file, 'train', train),
    (test_file, 'test', test),
    (validate_file, 'validate', validate),
):
    upload_s3_csv(split_file, split_folder, split_frame)

In [12]:
# Look up the container image URI for the 'xgboost' framework, version
# '1.0-1', in the region of the current boto3 session.
container = retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='1.0-1',
)

In [13]:
# Hyperparameters handed to the training container.
hyperparams = {
    "num_round": "42",
    "eval_metric": "rmse",
}

# Training output is written under <prefix>/output/ in the data bucket.
s3_output_location = "s3://{}/{}/output/".format(data_bucket, prefix)

# Estimator for the retrieved container image: one ml.m4.xlarge training
# instance, using a freshly resolved execution role and session.
xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=s3_output_location,
    hyperparameters=hyperparams,
    sagemaker_session=sagemaker.Session(),
)

In [14]:
# Each channel points at an S3 prefix (the split's folder), not at a single
# object. The format strings only ever had two placeholders, so the file
# name that used to be passed as a third argument was silently ignored; it
# is removed here to stop suggesting the channel targets one file.
train_channel = sagemaker.inputs.TrainingInput(
    s3_data="s3://{}/{}/train/".format(data_bucket, prefix),
    content_type='text/csv')

validate_channel = sagemaker.inputs.TrainingInput(
    s3_data="s3://{}/{}/validate/".format(data_bucket, prefix),
    content_type='text/csv')

# Channel names as consumed by the estimator's fit() call.
data_channels = {'train': train_channel, 'validation': validate_channel}

In [None]:
# Launch the training job on the train/validation channels.
# NOTE(review): logs=False presumably suppresses the container's log stream
# and leaves only the job status lines captured in this cell's output -- confirm.
xgb_model.fit(inputs=data_channels, logs=False)

# Printed once fit() has returned.
print('ready for hosting!')


2022-12-03 22:14:10 Starting - Starting the training job......
2022-12-03 22:14:45 Starting - Preparing the instances for training..................
2022-12-03 22:16:21 Downloading - Downloading input data......
2022-12-03 22:16:56 Training - Downloading the training image..........
2022-12-03 22:17:52 Training - Training image download completed. Training in progress........
2022-12-03 22:18:33 Uploading - Uploading generated training model..

In [None]:
# Deploy the trained model to a real-time endpoint on one ml.c4.xlarge instance.
# The keyword was previously misspelled `intance_type`, so the instance type
# never reached deploy(). The endpoint name is read from the predictor that
# deploy() returns rather than from the estimator.
xgb_predictor = xgb_model.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
print('Endpoint name:', xgb_predictor.endpoint_name)