In [1]:
# Location of the raw dataset. The notebook reads
# s3://<data_bucket>/<subfolder>/<dataset> and writes its processed splits and
# training output under the same bucket/subfolder prefix.
# S3 bucket name.
data_bucket = 'ml-automation'
# Key prefix ("folder") inside the bucket used by this notebook.
subfolder = 'katana02'
# File name of the input CSV under that prefix.
dataset = 'invoice_data_adjusted.csv'

In [2]:
import pandas as pd
import boto3
import sagemaker
import s3fs
from sklearn.model_selection import train_test_split

# Execution role of the current SageMaker environment; handed to the Estimator
# when the training job is configured.
role = sagemaker.get_execution_role()
# S3 filesystem handle used to write the processed CSV splits.
# anon=False: use credentials rather than anonymous access.
s3 = s3fs.S3FileSystem(anon=False)

In [3]:
# Build the full S3 URI of the raw CSV once, then load it into a DataFrame.
raw_data_uri = f's3://{data_bucket}/{subfolder}/{dataset}'
df = pd.read_csv(raw_data_uri)

# Preview the first rows as a sanity check on the load.
df.head()

Unnamed: 0,invoice_risk_decision,customer,invoice_date_year,invoice_date_quarter,invoice_date_month,invoice_date_week,invoice_date_day_year,invoice_date_day_month,invoice_date_day_week,payment_due_date_year,...,payment_due_date_day_month,payment_due_date_day_week,payment_date_year,payment_date_quarter,payment_date_month,payment_date_week,payment_date_day_year,payment_date_day_month,payment_date_day_week,grand_total
0,0,24_customer,2017,4,11,46,318,14,2,2017.0,...,16.0,4.0,2017.0,4.0,11.0,46.0,320.0,16.0,4.0,64.25
1,0,11_customer,2017,4,11,46,318,14,2,2017.0,...,17.0,5.0,2017.0,4.0,11.0,46.0,321.0,17.0,5.0,50.34
2,0,29_customer,2017,4,12,49,338,4,1,2017.0,...,8.0,5.0,2017.0,4.0,12.0,49.0,342.0,8.0,5.0,40.03
3,0,28_customer,2017,4,12,49,338,4,1,2017.0,...,8.0,5.0,2017.0,4.0,12.0,49.0,340.0,6.0,3.0,94.86
4,0,13_customer,2017,4,12,49,340,6,3,2017.0,...,8.0,5.0,2017.0,4.0,12.0,50.0,348.0,14.0,4.0,65.15


In [4]:
# Report the dataset size. The string must be an f-string; without the `f`
# prefix the placeholder is printed literally instead of the row count.
print(f'Number of rows in dataset: {df.shape[0]}')
# Class balance of the target, which is the first column of the frame
# (`invoice_risk_decision` according to the recorded output).
print(df[df.columns[0]].value_counts())

Number of rows in dataset: {df.shape[0]}
0    183
1    120
Name: invoice_risk_decision, dtype: int64


In [5]:
# One-hot encode the non-numeric columns so every feature is numeric. In the
# recorded preview, `customer` is expanded into `customer_<id>_customer`
# indicator columns.
encoded_data = pd.get_dummies(df)
# Preview the first rows of the encoded frame.
encoded_data.head()

Unnamed: 0,invoice_risk_decision,invoice_date_year,invoice_date_quarter,invoice_date_month,invoice_date_week,invoice_date_day_year,invoice_date_day_month,invoice_date_day_week,payment_due_date_year,payment_due_date_quarter,...,customer_46_customer,customer_47_customer,customer_48_customer,customer_49_customer,customer_4_customer,customer_50_customer,customer_6_customer,customer_7_customer,customer_8_customer,customer_9_customer
0,0,2017,4,11,46,318,14,2,2017.0,4.0,...,0,0,0,0,0,0,0,0,0,0
1,0,2017,4,11,46,318,14,2,2017.0,4.0,...,0,0,0,0,0,0,0,0,0,0
2,0,2017,4,12,49,338,4,1,2017.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,0,2017,4,12,49,338,4,1,2017.0,4.0,...,0,0,0,0,0,0,0,0,0,0
4,0,2017,4,12,49,340,6,3,2017.0,4.0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Absolute correlation of every encoded column with the target column.
correlation_matrix = encoded_data.corr()
abs_target_corr = correlation_matrix['invoice_risk_decision'].abs()

# Keep only the columns whose absolute correlation exceeds 0.1.
exceeds_threshold = abs_target_corr > .1
columns = abs_target_corr[exceeds_threshold].index
corrs = abs_target_corr.filter(items=columns)

# Display the retained correlations (the target itself appears with 1.0).
corrs

invoice_risk_decision        1.000000
invoice_date_year            0.204019
invoice_date_quarter         0.237271
invoice_date_month           0.234815
invoice_date_week            0.233312
invoice_date_day_year        0.235755
invoice_date_day_week        0.142722
payment_due_date_year        0.177947
payment_due_date_quarter     0.184797
payment_due_date_month       0.202106
payment_due_date_week        0.211010
payment_due_date_day_year    0.209967
payment_due_date_day_week    0.119327
payment_date_year            0.247079
payment_date_month           0.105959
payment_date_week            0.109242
payment_date_day_year        0.110940
grand_total                  0.237612
customer_11_customer         0.229656
customer_21_customer         0.111826
customer_43_customer         0.106990
customer_50_customer         0.164441
Name: invoice_risk_decision, dtype: float64

In [7]:
def _to_csv_bytes(frame, header):
    """Serialise `frame` to CSV (no index column) and return the encoded bytes."""
    return frame.to_csv(None, header=header, index=False).encode()


def _write_to_s3(key, payload):
    """Write the bytes `payload` to the S3 object `key` via the shared `s3` handle."""
    with s3.open(key, 'wb') as f:
        f.write(payload)


# Hold out 30 % of the rows, then split that hold-out again: 33.3 % of it
# becomes the test set and the remainder the validation set.
train_df, val_and_test_data = train_test_split(
    encoded_data, test_size=0.3, random_state=0
)
val_df, test_df = train_test_split(
    val_and_test_data, test_size=0.333, random_state=0
)

# Train and validation CSVs are written without a header row; the test CSV
# keeps its header.
# NOTE(review): presumably the training container needs header-less CSV while
# the test file is read back with column names later; confirm.
train_data = _to_csv_bytes(train_df, header=False)
val_data = _to_csv_bytes(val_df, header=False)
test_data = _to_csv_bytes(test_df, header=True)

# Upload the three splits under <bucket>/<subfolder>/processed/.
_write_to_s3(f'{data_bucket}/{subfolder}/processed/train.csv', train_data)
_write_to_s3(f'{data_bucket}/{subfolder}/processed/val.csv', val_data)
_write_to_s3(f'{data_bucket}/{subfolder}/processed/test.csv', test_data)

# Describe the uploaded train/validation files as CSV input channels for the
# training job.
train_input = sagemaker.s3_input(
    s3_data=f's3://{data_bucket}/{subfolder}/processed/train.csv',
    content_type='csv',
)
val_input = sagemaker.s3_input(
    s3_data=f's3://{data_bucket}/{subfolder}/processed/val.csv',
    content_type='csv',
)

In [8]:
sess = sagemaker.Session()

# XGBoost training image per AWS region. Only these four regions are listed,
# so running the notebook elsewhere raises KeyError at the lookup below.
containers = {
    'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
    'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
    'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
    'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest',
}

# Pick the image matching the region of the default boto3 session.
region = boto3.Session().region_name
training_image = containers[region]

# One ml.m5.large instance; output is written under <bucket>/<subfolder>/output.
estimator = sagemaker.estimator.Estimator(
    training_image,
    role,
    train_instance_count=1,
    train_instance_type='ml.m5.large',
    output_path=f's3://{data_bucket}/{subfolder}/output',
    sagemaker_session=sess,
)

# Binary classifier evaluated by AUC: up to 100 rounds, with early stopping
# set to 10 rounds.
hyperparameters = {
    'max_depth': 3,
    'subsample': 0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'num_round': 100,
    'early_stopping_rounds': 10,
}
estimator.set_hyperparameters(**hyperparameters)

# Launch the training job with the train and validation channels.
data_channels = {'train': train_input, 'validation': val_input}
estimator.fit(data_channels)

INFO:sagemaker:Creating training-job with name: xgboost-2018-11-29-20-52-30-186


2018-11-29 20:52:30 Starting - Starting the training job...
2018-11-29 20:52:32 Starting - Launching requested ML instances......
2018-11-29 20:53:37 Starting - Preparing the instances for training......
2018-11-29 20:54:51 Downloading - Downloading input data..
[31mArguments: train[0m
[31m[2018-11-29:20:55:06:INFO] Running standalone xgboost training.[0m
[31m[2018-11-29:20:55:06:INFO] File size need to be processed in the node: 0.05mb. Available memory size in the node: 159.51mb[0m
[31m[2018-11-29:20:55:06:INFO] Determined delimiter of CSV input is ','[0m
[31m[20:55:06] S3DistributionType set as FullyReplicated[0m
[31m[20:55:06] 212x62 matrix with 13144 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2018-11-29:20:55:06:INFO] Determined delimiter of CSV input is ','[0m
[31m[20:55:06] S3DistributionType set as FullyReplicated[0m
[31m[20:55:06] 60x62 matrix with 3720 entries loaded from /opt/ml/input/data/validation?format=csv&