## Deliver Time
### v1 2019-05-28

In [1]:
import json
import numpy as np
from scipy import stats
from datetime import datetime

import analitico
import analitico.plugin
import s24.plugin
from analitico.pandas import *
from analitico.schema import apply_schema

# pass api token to create sdk
sdk = analitico.authorize("tok_s24_579E5hOWw7k8")
print(datetime.utcnow())

2019-05-28 09:54:21.722030


## Training

In [2]:
# load table that is already a result of joining and processing several tables
df1 = sdk.get_dataframe("ds_s24_order_time_pick_pay_deliver")
df1.sample(n=10)

Unnamed: 0,order_amount,order_volume,order_deliver_at_start.dayofweek,order_deliver_at_start.year,order_deliver_at_start.month,order_deliver_at_start.day,order_deliver_at_start.hour,order_deliver_at_start.minute,order_deliver_at_end,order_fulfillment_type,...,customer_province,customer_lat,customer_lng,customer_area,customer_ztl,customer_ref_id,customer_has_subscription,pick_time.min,pay_time.min,deliver_time.min
144947,37.29,63.74,3,2018,6,21,14,0,2018-06-21 15:00:00,1,...,VR,45.438678,11.027072,VR1,0,802081,0,25.333333,13.333333,25.116667
239210,48.64,102.03,0,2018,12,3,18,0,2018-12-03 19:00:00,1,...,RM,41.912905,12.525354,RM7,0,1202313,0,9.1,7.45,37.483333
139634,108.03,105.73,5,2018,6,9,19,0,2018-06-09 20:00:00,1,...,MI,45.441122,9.089437,MI6,0,716653,0,4.25,5.8,29.733333
125991,10.34,26.92,4,2018,5,11,11,0,2018-05-11 12:00:00,1,...,RM,41.899916,12.386181,RM11,0,964485,0,12.166667,21.283333,19.233333
94144,33.12,17.51,1,2018,2,13,14,0,2018-02-13 15:00:00,1,...,VR,45.422281,10.965955,VR1,0,321734,0,14.566667,12.066667,26.033333
112930,59.76,132.2,4,2018,4,6,15,0,2018-04-06 16:00:00,1,...,RM,41.89931,12.563156,RM6,0,931873,0,11.733333,8.3,24.55
261695,45.13,54.25,5,2019,1,5,18,0,2019-01-05 19:00:00,1,...,MN,45.157405,10.793183,MN1,20,1146029,0,15.333333,24.233333,41.216667
216657,35.77,18.69,0,2018,11,5,18,0,2018-11-05 19:00:00,1,...,RN,44.04993,12.57672,RN1,0,1105265,0,14.483333,20.35,9.233333
138639,33.47,47.2,3,2018,6,7,11,0,2018-06-07 12:00:00,1,...,RM,41.894204,12.555171,RM6,0,970847,0,8.816667,8.283333,25.633333
180832,30.2,17.85,0,2018,9,10,12,0,2018-09-10 13:00:00,1,...,VR,45.39068,10.91377,VR2,0,993539,0,11.716667,16.783333,4.316667


In [3]:
df1.describe()

Unnamed: 0,order_amount,order_volume,order_deliver_at_start.dayofweek,order_deliver_at_start.year,order_deliver_at_start.month,order_deliver_at_start.day,order_deliver_at_start.hour,order_deliver_at_start.minute,items_total,items_with_variable_weight,store_lat,store_lng,customer_lat,customer_lng,pick_time.min,pay_time.min,deliver_time.min
count,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,195425.0
mean,57.065407,75.075223,2.71074,2017.710915,6.988594,15.698448,14.351422,0.138959,22.008724,3.039925,44.14254,10.625622,44.142049,10.62958,25.914597,13.903314,27.464454
std,40.018748,65.127621,1.887745,0.71898,3.68417,8.593538,3.337425,2.046893,14.786909,3.702682,1.710798,1.961118,1.709494,1.960951,32.759049,10.613576,17.179451
min,0.01,0.0,0.0,2016.0,1.0,1.0,7.0,0.0,1.0,0.0,37.511822,7.40142,37.434983,7.35268,1.0,1.0,1.0
25%,30.73,30.21,1.0,2017.0,4.0,8.0,11.0,0.0,11.0,0.0,41.927061,9.121973,41.929204,9.131372,11.433333,7.883333,16.316667
50%,47.35,62.7,3.0,2018.0,7.0,16.0,14.0,0.0,19.0,2.0,45.063979,10.994936,45.06583,11.002461,20.133333,11.2,22.816667
75%,72.99,103.41,4.0,2018.0,10.0,23.0,17.0,0.0,29.0,5.0,45.456849,12.452055,45.448825,12.46364,33.033333,16.066667,32.933333
max,934.85,1214.98,6.0,2019.0,12.0,31.0,22.0,55.0,251.0,42.0,46.607851,15.169497,46.09946,26.97777,4368.883333,89.95,119.983333


In [4]:
# keep only the features we need, apply data types
df2 = apply_schema(df1, {
    "columns":[
        { "name":"order_amount", "type":"float" },
        { "name":"order_volume", "type":"float" },
        { "name":"order_deliver_at_start.dayofweek", "type":"integer" },
        { "name":"order_deliver_at_start.year", "type":"integer" },
        { "name":"order_deliver_at_start.month", "type":"integer" },
        { "name":"order_deliver_at_start.day", "type":"integer" },
        { "name":"order_deliver_at_start.hour", "type":"integer" },
        { "name":"order_deliver_at_start.minute", "type":"integer" },
        { "name":"order_fulfillment_type", "type":"category" },
        { "name":"items_total", "type":"integer" },
        { "name":"items_with_variable_weight", "type":"integer" },
        { "name":"store_name", "type":"category" },
        { "name":"store_province", "type":"category" },
        { "name":"store_lat", "type":"float" },
        { "name":"store_lng", "type":"float" },
        { "name":"store_area", "type":"category" },
        { "name":"store_ref_id", "type":"category" },
        { "name":"customer_province", "type":"category" },
        { "name":"customer_lat", "type":"float" },
        { "name":"customer_lng", "type":"float" },
        { "name":"customer_area", "type":"category" },
        { "name":"customer_ztl", "type":"category" },
        { "name":"customer_ref_id", "type":"category" },
        { "name":"customer_has_subscription", "type":"category" },
        { "name":"deliver_time.min", "type":"float" }
    ]
})
df2.samples(n=10)

AttributeError: 'DataFrame' object has no attribute 'samples'

In [None]:
# train catboost sdg model, save to disk, save model metadata
training = sdk.run_plugin(df2, action="train", settings = {
    "name": "analitico.plugin.CatBoostClassifierPlugin",
        "parameters": {
          "iterations": 50,
          "learning_rate": 1
        },
        "data": {
          "label": "deliver_time.min"
        }
    })

## Prediction

In [None]:
# define method used for serverless deployment of predictions
def handle(event, **kwargs):
    # convert records to pandas dataframe if needed
    if not isinstance(event, pd.DataFrame):
        event = pd.DataFrame.from_dict(event)
    
    # use trained model for inference
    return sdk.run_plugin(event, action="predict", settings = {
        "name": "analitico.plugin.CatBoostClassifierPlugin"
        })

## Testing

In [None]:
# extract some sample records used to test predictions
samples = df2.sample(n=3)
samples = samples.to_dict(orient="records")
print(json.dumps(samples, indent=2))

In [None]:
# run prediction test, print results
results = handle(samples)
print("Prediction results:\n")
print(json.dumps(results, indent=2))