In [None]:
import sagemaker
import boto3
import numpy as np                                
import pandas as pd                               
import os                                         
from sagemaker import get_execution_role

# Key prefix under which this notebook stores its objects in the
# SageMaker session's default S3 bucket.
prefix = 'sagemaker/DEMO-xgboost-dm'
bucket = sagemaker.Session().default_bucket()

# Region of the default boto3 session, and the SageMaker execution role.
region = boto3.Session().region_name
role = get_execution_role()

In [None]:
# Region-pinned boto3 session, and a SageMaker API client created from it.
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client('sagemaker', region_name=region)

---

## Data

\[Moro et al., 2014\] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014


### Transformation

The transformation steps were performed with SageMaker Data Wrangler and Feature Store.

In [None]:
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

# Client for the 'sagemaker-featurestore-runtime' service, created from the
# shared boto3 session.
featurestore_runtime = boto_session.client(
    'sagemaker-featurestore-runtime',
    region_name=region,
)

# SageMaker session that bundles the boto3 session with both clients; the
# feature group handle below uses it for its calls.
feature_store_session = Session(
    sagemaker_featurestore_runtime_client=featurestore_runtime,
    sagemaker_client=sagemaker_client,
    boto_session=boto_session,
)

# Replace the placeholder with the name of the Feature Group you just created.
feature_group_name = "FEATURE_GROUP_NAME"
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=feature_store_session,
)

In [None]:
# Get an Athena query helper for the feature group and build a statement that
# selects every column from the table it reports.
fs_query = feature_group.athena_query()
fs_table = fs_query.table_name
query_string = f'SELECT * FROM "{fs_table}"'
print(f'Running {query_string}')

In [None]:
# Run the query with its results written under the project prefix in S3,
# wait for it to finish, then load the result into a pandas DataFrame.
query_output_location = f's3://{bucket}/{prefix}/fs_query_results/'
fs_query.run(query_string=query_string, output_location=query_output_location)
fs_query.wait()
model_data = fs_query.as_dataframe()

In [None]:
# Drop the columns that are not used for modelling.
# NOTE(review): these look like Feature Store identifier / timestamp / metadata
# columns -- confirm against the feature group definition.
model_data = model_data.drop(
    columns=['fs_id', 'fs_time', 'write_time', 'api_invocation_time', 'is_deleted']
)

In [None]:
# Bare last expression: the notebook renders model_data as this cell's output.
model_data

When building a model whose primary goal is to predict a target value on new data, it is important to understand overfitting. Supervised learning models are designed to minimize the error between their predictions of the target value and the actual values in the data they are given. This last part is key: in their quest for greater accuracy, machine learning models frequently bias themselves toward picking up on minor idiosyncrasies within the data they are shown. These idiosyncrasies then don't repeat themselves in subsequent data, so predictions on new data can actually become less accurate; that is the price paid for more accurate predictions in the training phase.

The most common way of preventing this is to build models with the concept that a model shouldn't be judged only on its fit to the data it was trained on, but also on "new" data. There are several different ways of operationalizing this: holdout validation, cross-validation, leave-one-out validation, etc. For our purposes, we'll simply randomly split the data into 3 uneven groups. The model will be trained on 70% of the data, it will then be evaluated on 20% of the data to give us an estimate of the accuracy we hope to have on "new" data, and the remaining 10% will be held back as a final testing dataset which will be used later on.

In [None]:
# Shuffle the rows with a fixed seed so the split is reproducible, then cut the
# shuffled frame at the 70% and 90% marks:
#   first 70% -> train, next 20% -> validation, last 10% -> test.
# Positional slicing is used instead of np.split: np.split on a DataFrame goes
# through DataFrame.swapaxes, which recent pandas releases deprecate.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.9 * len(shuffled_data))

train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

In [None]:
# Train and validation sets are written label-first and without a header row:
# the target 'y_yes' goes in column 0, followed by every remaining column
# except 'y_no'.
# NOTE(review): this matches the CSV layout the SageMaker built-in XGBoost
# algorithm expects; the training step is not shown here -- confirm.
for split_frame, csv_name in (
    (train_data, 'train.csv'),
    (validation_data, 'validation.csv'),
):
    label_first = pd.concat(
        [split_frame['y_yes'], split_frame.drop(['y_no', 'y_yes'], axis=1)],
        axis=1,
    )
    label_first.to_csv(csv_name, index=False, header=False)

# The held-out test set keeps its header row and all of its columns,
# including both target columns.
test_data.to_csv('test.csv', index=False, header=True)

In [None]:
# Upload each local CSV to s3://<bucket>/<prefix>/<split>/<split>.csv.
# Object keys are built with explicit '/' separators rather than os.path.join:
# S3 keys use '/', whereas os.path.join uses the local OS separator.
s3_resource = boto_session.resource('s3')

for split_name in ('train', 'validation', 'test'):
    csv_name = f'{split_name}.csv'
    object_key = f'{prefix}/{split_name}/{csv_name}'
    s3_resource.Bucket(bucket).Object(object_key).upload_file(csv_name)