In [None]:
# S3 bucket that receives the train/validation CSVs and the training output.
# NOTE(review): 'S3BucketName' and 'S3Prefix' look like placeholders — confirm
# they are replaced with real values before running.
bucket = 'S3BucketName'
# Key prefix inside the bucket under which every object of this workflow is stored.
prefix = 'S3Prefix'

# import needed Python libraries
import boto3
import re
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date, datetime, timedelta
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
import sagemaker
from sagemaker.predictor import csv_serializer

# Define IAM role
# The same role is handed to the training Estimator and used as the
# ExecutionRoleArn when the model is created further down.
role = get_execution_role()

# Read in the sample customer data
# The path is relative to the current working directory, so the CSV must sit
# next to wherever this script/notebook is executed from.
churn = pd.read_csv('./ChurnSampleData.csv')

# Begin feature engineering
# Drop the EndpointID and EmailAddress columns before modelling
churn = churn.drop(['EndpointID','EmailAddress'], axis=1)

# Transform unix timestamp event columns into buckets
#   0 = more than 180 days ago
#   1 = more than 30 days ago
#   2 = less than or equal to 30 days ago
# All ages are measured relative to the fixed reference date bucket_start.
bucket_start = date(2019,10,25)
cutoff_180_days = bucket_start - timedelta(days=180)
cutoff_30_days = bucket_start - timedelta(days=30)


def _recency_bucket(unix_ts):
    """Return the recency bucket (0, 1 or 2) for a unix timestamp in seconds.

    The timestamp is converted to a local calendar date and compared against
    the 180-day and 30-day cutoffs that precede ``bucket_start``.
    """
    event_day = date.fromtimestamp(unix_ts)
    if event_day < cutoff_180_days:
        return 0
    if event_day < cutoff_30_days:
        return 1
    return 2


# Assign the bucketed values back to the DataFrame column. Writing into the
# `row` objects yielded by DataFrame.iterrows() does not work: each row is a
# copy, so the changes never reach `churn` and the raw timestamps would be
# fed to training unchanged.
# NOTE(review): a missing (NaN) timestamp makes date.fromtimestamp raise, as
# it did in the previous loop — confirm the columns are fully populated.
for column in ('LastEmailDelivered', 'LastEmailEngaged'):
    churn[column] = churn[column].map(_recency_bucket)

# format our model data with predictor variable in first column
# 'Churn?' (presumably the label being predicted) is moved to the front; the
# remaining columns keep their original order.
model_data = pd.concat([churn['Churn?'], churn.drop(['Churn?'], axis=1)], axis=1)

# split data into training (70%), validation (20%), and test (10%) sets
# Shuffle once with a fixed seed so the split is reproducible, then cut the
# shuffled frame by position. Plain .iloc slicing is used instead of
# np.split(), which on a DataFrame goes through the deprecated
# DataFrame.swapaxes(); the resulting partitions are the same.
shuffled = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(model_data))
validation_end = int(0.9 * len(model_data))
train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
# test_data is only kept in memory; nothing in this section writes or uploads it.
test_data = shuffled.iloc[validation_end:]

# Training and validation files are written without header or index column.
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)
# The complete, unshuffled dataset is saved with its header row.
model_data.to_csv('model_data.csv', header=True, index=False)

# upload to S3
# Each local CSV is stored under its own sub-folder of `prefix` in the bucket.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
uploads = [
    ('train/train.csv', 'train.csv'),
    ('validation/validation.csv', 'validation.csv'),
]
for relative_key, local_file in uploads:
    object_key = os.path.join(prefix, relative_key)
    s3_bucket.Object(object_key).upload_file(local_file)

# begin model training
# Look up the 'xgboost' image, version '0.90-1', for the region of the
# default boto3 session.
container = get_image_uri(boto3.Session().region_name, 'xgboost', '0.90-1')

# CSV input channels located under the bucket/prefix the splits were uploaded to.
# NOTE(review): the train URI has no trailing slash while the validation URI
# does — confirm the train prefix cannot match unrelated keys.
train_uri = 's3://{}/{}/train'.format(bucket, prefix)
validation_uri = 's3://{}/{}/validation/'.format(bucket, prefix)
s3_input_train = sagemaker.s3_input(s3_data=train_uri, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=validation_uri, content_type='csv')

sess = sagemaker.Session()

# Training infrastructure and the S3 location for the job's output.
estimator_settings = {
    'train_instance_count': 1,
    'train_instance_type': 'ml.m4.xlarge',
    'output_path': 's3://{}/{}/output2'.format(bucket, prefix),
    'sagemaker_session': sess,
}
xgb = sagemaker.estimator.Estimator(container, role, **estimator_settings)

# Hyperparameters for a binary:logistic objective trained for 100 rounds.
hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**hyperparameters)

# Launch the training job with both channels attached.
data_channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb.fit(data_channels)

# Locate the artifacts produced by the most recent training job
client = boto3.client('sagemaker')

training_job_name = xgb.latest_training_job.name
info = client.describe_training_job(TrainingJobName=training_job_name)
# S3 location of the trained model artifacts, as reported by the training job
model_data = info['ModelArtifacts']['S3ModelArtifacts']

# Pair the training image with the trained artifacts
primary_container = {
    'ModelDataUrl': model_data,
    'Image': container
}

# Create the SageMaker model from the container definition.
# NOTE(review): only create_model is called here; no endpoint is created in
# this section, and the model name is fixed — confirm re-runs are handled.
create_model_request = {
    'ModelName': 'deployed-xgboost-customer-churn',
    'ExecutionRoleArn': role,
    'PrimaryContainer': primary_container,
}
client.create_model(**create_model_request)