# Initialize Notebook

In [None]:
import os
import boto3
import re
import sagemaker

# Where the training data and model artifacts live in S3.
bucket = 'slytherins-test'
prefix = 'xgboost'

# Execution role of this notebook and the region of the default boto3 session.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# HTTPS form of the bucket location, used later to build the job's S3 URIs.
bucket_path = f'https://s3-{region}.amazonaws.com/{bucket}'

# Download Data

In [None]:
import pandas as pd

# Read the raw training set from the configured bucket instead of repeating
# the bucket name as a second hard-coded literal.
input_data = 's3://{}/Train.csv'.format(bucket)
data = pd.read_csv(input_data)

# Preview the first ten rows (displayed as the cell output).
data.head(n=10)

# Process Data for Classification Column Generation

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --- Missing-value imputation -------------------------------------------------
# Fill each missing Item_Weight with the mean weight of the row's own
# Item_Type. transform('mean') yields one value per row, aligned on the index,
# so no explicit loop over item types is needed.
type_mean_weight = data.groupby('Item_Type')['Item_Weight'].transform('mean')
data['Item_Weight'] = data['Item_Weight'].fillna(type_mean_weight)
# Fallback for rows whose item type has no recorded weight at all (or whose
# Item_Type is itself missing): use the overall mean weight.
data['Item_Weight'] = data['Item_Weight'].fillna(data['Item_Weight'].mean())

# Split into categorical and numeric frames. Explicit copies guarantee the
# assignments below never write into a view of `data`.
cat_data = data.select_dtypes(object).copy()
num_data = data.select_dtypes(np.number).copy()

# Missing Outlet_Size is inferred from the outlet type; outlet types not
# listed here keep their missing value.
outlet_size_by_type = {
    'Grocery Store': 'Small',
    'Supermarket Type1': 'Small',
    'Supermarket Type2': 'Medium',
    'Supermarket Type3': 'Medium',
}
cat_data['Outlet_Size'] = cat_data['Outlet_Size'].fillna(
    cat_data['Outlet_Type'].map(outlet_size_by_type))

# Collapse the alternative spellings of the fat-content categories.
cat_data['Item_Fat_Content'] = cat_data['Item_Fat_Content'].replace(
    {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'})

# --- Encoding and scaling -----------------------------------------------------
# Label-encode every categorical column (the encoder is refit per column),
# then standardise both the numeric and the encoded categorical features.
le = LabelEncoder()
cat_data = cat_data.apply(le.fit_transform)
ss = StandardScaler()
num_data = pd.DataFrame(ss.fit_transform(num_data), columns=num_data.columns)
cat_data = pd.DataFrame(ss.fit_transform(cat_data), columns=cat_data.columns)
final_data = pd.concat([num_data, cat_data], axis=1)

print('Data after cleaning: {}'.format(final_data.shape))

# Features exclude the target; the target itself is taken unscaled from the
# original frame so it can be binned on its real value range.
X = final_data.drop(['Item_Outlet_Sales'], axis=1)
y = data[['Item_Outlet_Sales']]

In [None]:
y_binned = pd.cut(y['Item_Outlet_Sales'], 4, labels=['A', 'B', 'C', 'D'])

In [None]:
temp = le.fit(y_binned)

In [None]:
y_final = temp.transform(y_binned)

In [None]:
data_final = pd.concat([pd.Series(y_final), X], axis=1)

In [None]:
# Hold out 20% of the rows for validation so the validation metric is computed
# on data the model was not trained on. A fixed seed keeps the split
# reproducible between runs.
train_part = data_final.sample(frac=0.8, random_state=42)
validation_part = data_final.drop(train_part.index)

# Written without header or index: label in the first column, features after.
train_part.to_csv('train.csv', header=False, index=False)
validation_part.to_csv('validation.csv', header=False, index=False)

# Upload Data to S3

In [None]:
# Push both local CSV files to s3://<bucket>/<prefix>/<filename>.
s3_bucket = boto3.Session(region_name=region).resource('s3').Bucket(bucket)
for csv_name in ('train.csv', 'validation.csv'):
    s3_bucket.Object(prefix + '/' + csv_name).upload_file(csv_name)

# Initializing Container

In [None]:
# Resolve the XGBoost 1.0-1 training/inference image for this region.
# SageMaker Python SDK v2 exposes this as sagemaker.image_uris.retrieve; the
# older get_image_uri helper is kept as a fallback for SDK v1 installations.
try:
    from sagemaker import image_uris
except ImportError:
    image_uris = None
    from sagemaker.amazon.amazon_estimator import get_image_uri

if image_uris is not None:
    container = image_uris.retrieve(framework='xgboost', region=region, version='1.0-1')
else:
    container = get_image_uri(region, 'xgboost', '1.0-1')

# Training the Model

In [None]:
%%time
import boto3
from time import gmtime, strftime


def _csv_channel(channel_name, s3_uri):
    """Return an InputDataConfig entry for uncompressed CSV data at *s3_uri*."""
    return {
        "ChannelName": channel_name,
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": s3_uri,
                "S3DataDistributionType": "FullyReplicated"
            }
        },
        "ContentType": "csv",
        "CompressionType": "None"
    }


# Timestamp suffix keeps the job name unique for each run.
job_name = 'xgboost-classification-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

s3_data_root = bucket_path + "/" + prefix

create_training_params = {
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": s3_data_root + "/single-xgboost"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.xlarge",
        "VolumeSizeInGB": 5
    },
    "TrainingJobName": job_name,
    "HyperParameters": {
        "max_depth": "5",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "6",
        "subsample": "0.7",
        "silent": "0",
        "objective": "multi:softmax",
        "num_round": "50",
        "num_class": "4"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 3600
    },
    # Each channel is scoped to its own file. Pointing a channel at the bare
    # "<prefix>/" would hand it every object under the prefix: both CSV files
    # and anything written below the job's S3OutputPath, which shares the
    # same prefix.
    "InputDataConfig": [
        _csv_channel("train", s3_data_root + "/train.csv"),
        _csv_channel("validation", s3_data_root + "/validation.csv"),
    ]
}


client = boto3.client('sagemaker', region_name=region)
client.create_training_job(**create_training_params)

import time

# Poll once a minute while the job is still running. Checking for the
# in-flight states (rather than for specific end states) guarantees the loop
# also terminates when the job ends up 'Stopped'.
status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)
while status in ('InProgress', 'Stopping'):
    time.sleep(60)
    status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
    print(status)

# Fail loudly here instead of letting a later cell trip over missing model
# artifacts.
if status != 'Completed':
    failure_reason = client.describe_training_job(
        TrainingJobName=job_name).get('FailureReason', 'no failure reason reported')
    raise RuntimeError(
        'Training job {} ended with status {}: {}'.format(job_name, status, failure_reason))

# Creating Model

In [None]:
import boto3
from time import gmtime, strftime

model_name = "xgboost-bigmart"

# Look up where the finished training job stored its model artifacts.
info = client.describe_training_job(TrainingJobName=job_name)
model_data = info['ModelArtifacts']['S3ModelArtifacts']

# The model is served by the same image that trained it.
primary_container = dict(Image=container, ModelDataUrl=model_data)

create_model_response = client.create_model(
    ModelName=model_name,
    PrimaryContainer=primary_container,
    ExecutionRoleArn=role,
)

# Creating Endpoint Configuration

In [None]:
from time import gmtime, strftime

# Timestamp suffix keeps the configuration name unique for each run.
endpoint_config_name = 'DEMO-XGBoostEndpointConfig-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_config_name)

# A single variant on one ml.m5.xlarge instance, serving `model_name`.
production_variant = {
    'VariantName': 'AllTraffic',
    'ModelName': model_name,
    'InstanceType': 'ml.m5.xlarge',
    'InitialInstanceCount': 1,
    'InitialVariantWeight': 1,
}
create_endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])

# Create Endpoint

In [None]:
create_endpoint_response = client.create_endpoint(
    EndpointName="xgboost-bigmart-endpoint",
    EndpointConfigName="xgboost-bigmart-config")


# Define Runtime Client

In [None]:
runtime_client = boto3.client('runtime.sagemaker', region_name=region)

# Get the Predictions

In [None]:
response = runtime_client.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType='text/csv', 
                                   Body=test_data)

# Print the Labels Predicted

In [None]:
result = response['Body'].read()
result = result.decode("utf-8")
result = result.split(',')
result = [math.ceil(float(i)) for i in result]
label = payload.strip(' ').split()[0]
print ('Label: ',label,'\nPrediction: ', result[0])

# Delete Endpoints

In [None]:
# Delete the hosted endpoint through the SageMaker client.
# Expects `endpoint_name` to have been assigned by an earlier cell.
client.delete_endpoint(EndpointName=endpoint_name)