## Deliver Time
### v1 2019-05-29

In [1]:
import json
import os
from datetime import datetime

import numpy as np
from scipy import stats

import analitico
import analitico.plugin
import s24.plugin
from analitico.pandas import *
from analitico.schema import apply_schema

# Create the SDK client. The API token is read from the ANALITICO_API_TOKEN
# environment variable so the credential is never stored in the notebook
# itself; a missing variable fails right here with a KeyError naming it.
sdk = analitico.authorize(os.environ["ANALITICO_API_TOKEN"])
# timestamp of this run (UTC), kept in the output as a record of when it ran
print(datetime.utcnow())

2019-05-29 14:56:55.708834


## Training

In [2]:
# Fetch the source dataset. Per the dataset name and the original note it is
# already the product of joining and processing several upstream tables.
df1 = sdk.get_dataframe("ds_s24_order_time_pick_pay_deliver")
# eyeball ten randomly chosen rows
df1.sample(10)

Unnamed: 0,order_amount,order_volume,order_deliver_at_start.dayofweek,order_deliver_at_start.year,order_deliver_at_start.month,order_deliver_at_start.day,order_deliver_at_start.hour,order_deliver_at_start.minute,order_deliver_at_end,order_fulfillment_type,...,customer_province,customer_lat,customer_lng,customer_area,customer_ztl,customer_ref_id,customer_has_subscription,pick_time.min,pay_time.min,deliver_time.min
172153,47.79,73.26,1,2018,8,21,16,0,2018-08-21 17:00:00,1,...,VA,45.72464,8.80588,VA1,0,237730,0,26.633333,8.4,37.5
186,65.89,0.0,1,2016,5,10,19,0,2016-05-10 20:00:00,1,...,RM,41.825095,12.454045,,0,324076,0,41.6,14.85,
106418,190.24,225.22,6,2018,3,18,14,0,2018-03-18 15:00:00,1,...,PD,45.428622,11.794958,PD1,0,543876,0,51.283333,18.95,22.133333
105772,29.64,34.92,4,2018,3,16,19,0,2018-03-16 20:00:00,1,...,PD,45.413828,11.861051,PD1,0,764781,0,8.983333,49.966667,8.533333
275242,53.62,70.39,0,2019,1,21,12,0,2019-01-21 13:00:00,1,...,RM,41.943276,12.375225,RM11,0,1379686,1,39.016667,11.8,41.783333
255506,37.19,79.03,3,2018,12,27,14,0,2018-12-27 15:00:00,1,...,MI,45.453324,9.140913,MI5,0,95694,0,9.85,13.883333,50.9
117239,68.65,105.84,1,2018,4,17,18,0,2018-04-17 19:00:00,1,...,RM,41.866729,12.446759,RM3,0,854339,0,16.483333,11.85,30.516667
211433,89.32,147.66,5,2018,10,27,19,0,2018-10-27 20:00:00,1,...,RM,41.872136,12.577698,RM6,0,1198657,0,24.783333,13.316667,46.866667
253469,84.13,121.94,5,2018,12,22,11,0,2018-12-22 12:00:00,1,...,TS,45.667712,13.768198,TS1,0,1104455,0,30.95,13.45,19.916667
209621,51.97,35.09,3,2018,10,25,16,0,2018-10-25 17:00:00,1,...,MN,45.153351,10.715323,MN1,0,1129905,0,12.866667,21.333333,14.4


In [3]:
# Summary statistics of the loaded table, shown as a quick sanity check of
# value ranges and of how many rows actually carry each measurement.
df1.describe()

Unnamed: 0,order_amount,order_volume,order_deliver_at_start.dayofweek,order_deliver_at_start.year,order_deliver_at_start.month,order_deliver_at_start.day,order_deliver_at_start.hour,order_deliver_at_start.minute,items_total,items_with_variable_weight,store_lat,store_lng,customer_lat,customer_lng,pick_time.min,pay_time.min,deliver_time.min
count,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,280025.0,195425.0
mean,57.065407,75.075223,2.71074,2017.710915,6.988594,15.698448,14.351422,0.138959,22.008724,3.039925,44.14254,10.625622,44.142049,10.62958,25.914597,13.903314,27.464454
std,40.018748,65.127621,1.887745,0.71898,3.68417,8.593538,3.337425,2.046893,14.786909,3.702682,1.710798,1.961118,1.709494,1.960951,32.759049,10.613576,17.179451
min,0.01,0.0,0.0,2016.0,1.0,1.0,7.0,0.0,1.0,0.0,37.511822,7.40142,37.434983,7.35268,1.0,1.0,1.0
25%,30.73,30.21,1.0,2017.0,4.0,8.0,11.0,0.0,11.0,0.0,41.927061,9.121973,41.929204,9.131372,11.433333,7.883333,16.316667
50%,47.35,62.7,3.0,2018.0,7.0,16.0,14.0,0.0,19.0,2.0,45.063979,10.994936,45.06583,11.002461,20.133333,11.2,22.816667
75%,72.99,103.41,4.0,2018.0,10.0,23.0,17.0,0.0,29.0,5.0,45.456849,12.452055,45.448825,12.46364,33.033333,16.066667,32.933333
max,934.85,1214.98,6.0,2019.0,12.0,31.0,22.0,55.0,251.0,42.0,46.607851,15.169497,46.09946,26.97777,4368.883333,89.95,119.983333


In [4]:
# Restrict the table to the model's features plus the label, and coerce each
# column to its declared type. The schema is spelled as (name, type) pairs and
# expanded into the {"name": ..., "type": ...} entries that apply_schema takes.
df2 = apply_schema(df1, {
    "columns": [
        {"name": column_name, "type": column_type}
        for column_name, column_type in (
            ("order_amount", "float"),
            ("order_volume", "float"),
            ("order_deliver_at_start.dayofweek", "integer"),
            ("order_deliver_at_start.year", "integer"),
            ("order_deliver_at_start.month", "integer"),
            ("order_deliver_at_start.day", "integer"),
            ("order_deliver_at_start.hour", "integer"),
            ("order_deliver_at_start.minute", "integer"),
            ("order_fulfillment_type", "category"),
            ("items_total", "integer"),
            ("items_with_variable_weight", "integer"),
            ("store_name", "category"),
            ("store_province", "category"),
            ("store_lat", "float"),
            ("store_lng", "float"),
            ("store_area", "category"),
            ("store_ref_id", "category"),
            ("customer_province", "category"),
            ("customer_lat", "float"),
            ("customer_lng", "float"),
            ("customer_area", "category"),
            ("customer_ztl", "category"),
            ("customer_ref_id", "category"),
            ("customer_has_subscription", "category"),
            # label column: the value the model is trained to predict
            ("deliver_time.min", "float"),
        )
    ]
})
# show ten random rows of the typed frame
df2.sample(n=10)

Unnamed: 0,order_amount,order_volume,order_deliver_at_start.dayofweek,order_deliver_at_start.year,order_deliver_at_start.month,order_deliver_at_start.day,order_deliver_at_start.hour,order_deliver_at_start.minute,order_fulfillment_type,items_total,...,store_area,store_ref_id,customer_province,customer_lat,customer_lng,customer_area,customer_ztl,customer_ref_id,customer_has_subscription,deliver_time.min
265486,42.67,157.08,2,2019,1,9,19,0,1,20,...,TO4,5211,TO,45.075857,7.568194,TO4,0,965629,0,17.616667
52047,71.59,52.73,3,2017,7,20,15,0,1,24,...,VE1,234,VE,45.49704,12.265001,,0,414732,0,
225202,37.15,18.34,2,2018,11,21,11,0,1,14,...,TO2,1199,TO,45.076748,7.650125,TO2,0,1296373,0,14.8
240860,55.58,42.81,2,2018,12,5,14,0,1,17,...,PD1,2430,PD,45.351595,11.833764,PD1,0,1106329,0,20.066667
58150,36.74,56.65,0,2017,9,11,19,0,1,21,...,TO2,1295,TO,45.078278,7.633697,,0,647398,0,
173993,8.24,8.59,0,2018,8,27,11,0,1,7,...,PD1,5057,PD,45.403922,11.879334,PD1,20,1010377,1,45.35
196127,21.0,17.93,4,2018,10,5,12,0,1,5,...,RM1,2900,RM,41.904686,12.486069,RM1,20,611544,0,17.216667
195356,52.35,57.59,3,2018,10,4,16,0,1,14,...,MI3,4506,MI,45.497396,9.221339,MI3,0,274798,0,20.066667
126029,104.89,132.01,5,2018,5,12,12,0,1,48,...,VR1,2769,VR,45.443741,10.952243,VR1,0,975375,0,18.15
27771,101.54,0.0,5,2017,2,18,15,0,1,21,...,,3121,RM,41.926892,12.461872,,0,279904,0,


In [5]:
# Train a CatBoost regressor on the typed frame, using "deliver_time.min" as
# the label. Per the original note, the plugin also saves the trained model
# and its metadata to disk — NOTE(review): confirm where they are stored.
training = sdk.run_plugin(
    df2,
    action="train",
    settings={
        "name": "analitico.plugin.CatBoostRegressorPlugin",
        "parameters": {
            "iterations": 50,
            "learning_rate": 1,
        },
        "data": {
            "label": "deliver_time.min",
        },
    },
)

Training data has 84600 rows without 'deliver_time.min' label


0:	learn: 16.6681369	test: 16.6192745	best: 16.6192745 (0)	total: 184ms	remaining: 9.04s
1:	learn: 16.4044878	test: 16.3162144	best: 16.3162144 (1)	total: 314ms	remaining: 7.54s
2:	learn: 16.3258964	test: 16.2403896	best: 16.2403896 (2)	total: 425ms	remaining: 6.66s
3:	learn: 16.2573933	test: 16.1742247	best: 16.1742247 (3)	total: 575ms	remaining: 6.62s
4:	learn: 16.2108795	test: 16.1424583	best: 16.1424583 (4)	total: 713ms	remaining: 6.42s
5:	learn: 16.1662287	test: 16.1255901	best: 16.1255901 (5)	total: 846ms	remaining: 6.2s
6:	learn: 16.1484497	test: 16.1198040	best: 16.1198040 (6)	total: 978ms	remaining: 6.01s
7:	learn: 16.1266113	test: 16.1203949	best: 16.1198040 (6)	total: 1.11s	remaining: 5.82s
8:	learn: 16.0924191	test: 16.1031651	best: 16.1031651 (8)	total: 1.26s	remaining: 5.75s
9:	learn: 16.0730451	test: 16.0947635	best: 16.0947635 (9)	total: 1.41s	remaining: 5.64s
10:	learn: 16.0545717	test: 16.0881385	best: 16.0881385 (10)	total: 1.54s	remaining: 5.48s
11:	learn: 16.010456

## Prediction

In [6]:
# NOTE(review): imports are repeated in this cell, presumably so that it can be
# deployed on its own together with handle() — confirm.
import os

import analitico
import analitico.pandas
import pandas as pd

# The API token is read from the ANALITICO_API_TOKEN environment variable so
# the credential is not stored in the notebook or in the deployed code; a
# missing variable fails right here with a KeyError naming it.
sdk = analitico.authorize(os.environ["ANALITICO_API_TOKEN"])

# define method used for serverless deployment of predictions
def handle(event, **kwargs):
    """Run the trained delivery-time model on one or more order records.

    Args:
        event: a pandas DataFrame, or any value accepted by
            ``pd.DataFrame.from_dict`` (for example a list of record dicts).
        **kwargs: accepted but not used by this function.

    Returns:
        The value returned by ``sdk.run_plugin`` for the "predict" action.
    """
    # convert records to pandas dataframe if needed
    if not isinstance(event, pd.DataFrame):
        event = pd.DataFrame.from_dict(event)

    # Only records that carry a raw "order_deliver_at_start" column go through
    # augment_dates (presumably it derives the ".year", ".month", ... columns
    # the model was trained on — confirm). Other records are passed as they are.
    if "order_deliver_at_start" in event.columns:
        event = analitico.pandas.augment_dates(event, column="order_deliver_at_start")

    # Use the trained model for inference. The model is trained with the
    # CatBoost *regressor* plugin on a continuous label, so prediction must name
    # the same plugin rather than the classifier one.
    return sdk.run_plugin(event, action="predict", settings={
        "name": "analitico.plugin.CatBoostRegressorPlugin"
    })

## Testing

In [7]:
# Extract a few records to test predictions with. The draw is seeded so the
# same records (and therefore the same test output) come back on every run,
# and the conversion is chained so `samples` is only ever a list of dicts.
samples = df2.sample(n=3, random_state=0).to_dict(orient="records")
print(json.dumps(samples, indent=2))

[
  {
    "order_amount": 71.51,
    "order_volume": 74.55,
    "order_deliver_at_start.dayofweek": 4,
    "order_deliver_at_start.year": 2017,
    "order_deliver_at_start.month": 12,
    "order_deliver_at_start.day": 15,
    "order_deliver_at_start.hour": 11,
    "order_deliver_at_start.minute": 0,
    "order_fulfillment_type": "1",
    "items_total": 24,
    "items_with_variable_weight": 4,
    "store_name": "conad superstore",
    "store_province": "PD",
    "store_lat": 45.416906,
    "store_lng": 11.863762,
    "store_area": "PD1",
    "store_ref_id": "5057",
    "customer_province": "PD",
    "customer_lat": 45.408612,
    "customer_lng": 11.94867,
    "customer_area": "PD1",
    "customer_ztl": "0",
    "customer_ref_id": "205034",
    "customer_has_subscription": "0",
    "deliver_time.min": 12.15
  },
  {
    "order_amount": 93.09,
    "order_volume": 119.46,
    "order_deliver_at_start.dayofweek": 0,
    "order_deliver_at_start.year": 2018,
    "order_deliver_at_start.month":

In [8]:
# Run the prediction handler on the records held in `samples` and
# pretty-print the response as indented JSON.
results = handle(samples)
print("Prediction results:\n")
print(json.dumps(results, indent=2))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Prediction results:

{
  "type": "analitico/prediction",
  "performance": {
    "cpu_count": 12,
    "loading_ms": 9,
    "total_ms": 383
  },
  "records": [
    {
      "order_amount": 71.51,
      "order_volume": 74.55,
      "order_deliver_at_start.dayofweek": 4,
      "order_deliver_at_start.year": 2017,
      "order_deliver_at_start.month": 12,
      "order_deliver_at_start.day": 15,
      "order_deliver_at_start.hour": 11,
      "order_deliver_at_start.minute": 0,
      "order_fulfillment_type": "1",
      "items_total": 24,
      "items_with_variable_weight": 4,
      "store_name": "conad superstore",
      "store_province": "PD",
      "store_lat": 45.416906,
      "store_lng": 11.863762,
      "store_area": "PD1",
      "store_ref_id": "5057",
      "customer_province": "PD",
      "customer_lat": 45.408612,
      "customer_lng": 11.94867,
      "customer_area": "PD1",
      "customer_ztl": "0",
      "customer_ref_id": "205034",
      "customer_has_subscription": "0",
      "

## Testing with fixed data

In [9]:
# Hand-written order records for a repeatable prediction test.
# NOTE(review): the timestamp key is spelled "order_deliver_at_startOFF", which
# does not match the "order_deliver_at_start" column that handle() checks for,
# so handle() will not call augment_dates for these records. The "OFF" suffix
# looks like a deliberate switch — confirm whether the date features are really
# meant to be absent here.
# The records also carry the "deliver_time.min" label, presumably as the known
# value to compare the prediction against — confirm.
samples = [
  {
    "order_amount": 77.91,
    "order_volume": 111.3,
    "order_deliver_at_startOFF": "2019-05-30T13:30:45",
    "order_fulfillment_type": "1",
    "items_total": 19,
    "items_with_variable_weight": 0,
    "store_name": "auchan",
    "store_province": "PD",
    "store_lat": 45.410906,
    "store_lng": 11.906148,
    "store_area": "PD1",
    "store_ref_id": "233",
    "customer_province": "PD",
    "customer_lat": 45.3968237,
    "customer_lng": 11.866805000000001,
    "customer_area": "PD1",
    "customer_ztl": "0",
    "customer_ref_id": "519563",
    "customer_has_subscription": "0",
    "deliver_time.min": 27.466666666666665
  },
  {
    "order_amount": 97.35,
    "order_volume": 146.3,
    "order_deliver_at_startOFF": "2019-05-30T08:30:45",
    "order_fulfillment_type": "1",
    "items_total": 28,
    "items_with_variable_weight": 13,
    "store_name": "esselunga",
    "store_province": "BO",
    "store_lat": 44.50543752,
    "store_lng": 11.30574892,
    "store_area": "BO1",
    "store_ref_id": "3736",
    "customer_province": "BO",
    "customer_lat": 44.4890757,
    "customer_lng": 11.3513898,
    "customer_area": "BO1",
    "customer_ztl": "20",
    "customer_ref_id": "912049",
    "customer_has_subscription": "0",
    "deliver_time.min": 53.18333333333333
  }
]

In [10]:
# Run the prediction handler on the hand-written records now held in
# `samples` and pretty-print the response as indented JSON.
results = handle(samples)
print("Prediction results:\n")
print(json.dumps(results, indent=2))

Prediction results:

{
  "type": "analitico/prediction",
  "performance": {
    "cpu_count": 12,
    "loading_ms": 7,
    "total_ms": 385
  },
  "records": [
    {
      "order_amount": 77.91,
      "order_volume": 111.3,
      "order_deliver_at_start.dayofweek": 0,
      "order_deliver_at_start.year": 0,
      "order_deliver_at_start.month": 0,
      "order_deliver_at_start.day": 0,
      "order_deliver_at_start.hour": 0,
      "order_deliver_at_start.minute": 0,
      "order_fulfillment_type": "1",
      "items_total": 19,
      "items_with_variable_weight": 0,
      "store_name": "auchan",
      "store_province": "PD",
      "store_lat": 45.410906,
      "store_lng": 11.906148,
      "store_area": "PD1",
      "store_ref_id": "233",
      "customer_province": "PD",
      "customer_lat": 45.396824,
      "customer_lng": 11.866805,
      "customer_area": "PD1",
      "customer_ztl": "0",
      "customer_ref_id": "519563",
      "customer_has_subscription": "0",
      "deliver_time.min