In [None]:
#1
import analitico
import analitico.plugin
import s24.plugin

from analitico.pandas import *

import missingno as msno
%matplotlib inline

import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_rows = 40

# Create the analitico factory from an API token.
# The token can be supplied through the ANALITICO_API_TOKEN environment variable so
# that a real credential never has to be committed with the notebook; the original
# demo token is kept as a fallback so the notebook still runs unchanged.
import os
factory = analitico.authorize(os.environ.get("ANALITICO_API_TOKEN", "tok_demo2_xaffg23443d1"))

def sample(df, n=4):
    """Return up to ``n`` randomly chosen rows of ``df`` for a quick preview.

    Args:
        df: the DataFrame to preview.
        n: maximum number of rows to return (defaults to 4).

    Returns:
        A DataFrame with ``min(n, len(df))`` rows drawn at random from ``df``.
    """
    # DataFrame.sample raises when asked for more rows than exist (without
    # replacement), so clamp the request to the number of rows available.
    return df.sample(n=min(n, len(df)))

# Print the current local time so the rendered notebook shows when this cell was run.
import datetime
print(datetime.datetime.now())

In [None]:
# Display the artifacts directory reported by the factory.
factory.get_artifacts_directory()

In [None]:
# Action name passed as `action=` to every plugin run in this notebook.
action = "train"

## Load orders

In [None]:
# Fetch the raw orders dataset through the dataset source plugin.
orders_source = {
    "name": "analitico.plugin.DatasetSourcePlugin",
    "dataset_id": "ds_s24_order",
}
orders = factory.run_plugin(action=action, settings=orders_source)

In [None]:
# Preview a few random rows of the raw orders.
sample(orders)

In [None]:
# Histogram of the order status values, with bin edges every 50 from 0 to 650.
hist = orders[["status"]].hist(bins=range(0,700,50))

In [None]:
print("Total orders: %d" % len(orders))

# keep only orders whose status is 400 or above and that have not been deleted
is_valid_order = (orders["status"] >= 400) & (orders["deleted"] == 0)
orders = orders.loc[is_valid_order].copy()

print("Valid orders: %d" % len(orders))
sample(orders)

In [None]:
# Plot missingno's nullity matrix to see where the filtered orders have missing values.
msno.matrix(orders)

In [None]:
# rename fields and keep only the columns we need
# NOTE: courier_id previously listed "type" twice ("integer" then "category"). In a
# Python dict literal the last duplicate key wins, so "category" was the value that
# actually reached the plugin; only that one is kept here.
orders = factory.run_plugin(orders, action=action, settings = {
    "name": "analitico.plugin.TransformDataframePlugin",
    "schema": {
        "columns": [
            { "name": "id", "rename": "order_id", "index": True },
            { "name": "amount", "rename": "order_amount" },
            { "name": "volume", "rename": "order_volume" },
            { "name": "deliver_at_start", "rename": "order_deliver_at_start", "type": "datetime" },
            { "name": "deliver_at_end", "rename": "order_deliver_at_end", "type": "datetime" },
            { "name": "paid_at", "rename": "order_paid_at" },
            { "name": "delivered_at", "rename": "order_delivered_at" },
            { "name": "fulfillment_type", "rename": "order_fulfillment_type", "type": "category" },
            { "name": "store_id", "type": "category" },
            { "name": "courier_id", "type": "category" },
            { "name": "picker_id", "type": "category" },
            { "name": "customer_id", "type": "category" }
        ]
    }
})

In [None]:
# Preview the orders after the rename/retype step.
sample(orders)

## Load order details

In [None]:
# Fetch the raw order detail rows through the dataset source plugin.
details_source = {
    "name": "analitico.plugin.DatasetSourcePlugin",
    "dataset_id": "ds_s24_order_detail",
}
details = factory.run_plugin(action=action, settings=details_source)

In [None]:
# Preview a few random rows of the raw order details.
sample(details)

In [None]:
# Report how many order detail rows were loaded, then summarise their nulls.
# pd_print_nulls is not defined in this notebook; presumably it comes from the
# `from analitico.pandas import *` wildcard import.
print("Total order details: %d" % len(details))
pd_print_nulls(details)

In [None]:
# Plot missingno's nullity matrix to see where the order details have missing values.
msno.matrix(details)

In [None]:
# Column schema for order details: every kept field except order_id is renamed
# to odt_xxx, and columns not listed here are not needed downstream.
details_columns = [
    {"name": "order_id"},
    {"name": "id", "rename": "odt_id", "type": "category"},
    {"name": "ean", "rename": "odt_ean", "type": "category"},
    {"name": "name", "rename": "odt_name", "type": "category"},
    {"name": "category_id", "rename": "odt_category_id", "type": "category"},
    {"name": "replaceable", "rename": "odt_replaceable", "type": "category"},
    {"name": "variable_weight", "rename": "odt_variable_weight"},
    {"name": "price", "rename": "odt_price"},
    {"name": "price_per_type", "rename": "odt_price_per_type"},
    {"name": "surcharge_fixed", "rename": "odt_surcharge_fixed"},
    {"name": "touched_at", "rename": "odt_touched_at"},
    {"name": "status", "rename": "odt_status", "type": "category"},
]
details_transform = {
    "name": "analitico.plugin.TransformDataframePlugin",
    "schema": {"columns": details_columns},
}
details = factory.run_plugin(details, action=action, settings=details_transform)

In [None]:
# Preview the order details after the rename/retype step.
sample(details)

In [None]:
# List the column names left on the order details frame.
details.columns

## Aggregate order details

In [None]:
# Number of detail rows (items) per order, as a one-column frame indexed by order_id.
items_per_order = details["order_id"].value_counts()
dt1 = items_per_order.rename_axis("order_id").to_frame("odt_items_total")

In [None]:
# Per-order sum of the variable-weight flag, i.e. how many items have variable weight.
details_by_order = details.groupby(['order_id'])
dt2 = (
    details_by_order['odt_variable_weight']
    .sum()
    .astype("int")
    .to_frame('odt_items_with_variable_weight')
)

In [None]:
# Earliest and latest touch timestamp per order; rows that were never touched are ignored.
details_touched = details.dropna(subset=["odt_touched_at"])
details_grouped = details_touched.groupby(['order_id'])
touched_at_by_order = details_grouped['odt_touched_at']

dt_first = touched_at_by_order.min().to_frame('odt_first_touched_at')
dt_last = touched_at_by_order.max().to_frame('odt_last_touched_at')

dt3 = pd.merge(dt_first, dt_last, on="order_id", how="outer")

In [None]:
# Merge the per-order aggregates (item count, variable-weight count, first/last touch)
# into a single frame. Outer joins keep orders that are missing from any one of them.
aggregates = pd.merge(dt1, dt2, on="order_id", how="outer")
aggregates = pd.merge(aggregates, dt3, on="order_id", how="outer")
aggregates = aggregates.sort_values(by='order_id', ascending=True)
print("order detail aggregates: %d" % len(aggregates))

# The preview has to be the last expression of the cell, otherwise it is not displayed.
sample(aggregates)

In [None]:
# Histogram of the item count per order, one bin per count with edges from 0 to 49.
print("Number of items per order")
hist = aggregates[["odt_items_total"]].hist(bins=range(0,50,1))

In [None]:
# Histogram of the variable-weight item count per order, with bin edges from 0 to 19.
print("Number of variable weight items per order")
hist = aggregates[["odt_items_with_variable_weight"]].hist(bins=range(0,20,1))

## Load stores

In [None]:
# Load the stores dataset and, in the same plugin pipeline, rename its fields to store_xxx.
stores_columns = [
    {"name": "id", "rename": "store_id", "type": "category"},
    {"name": "name", "rename": "store_name", "type": "category"},
    {"name": "area", "rename": "store_area", "type": "category"},
    {"name": "province", "rename": "store_province", "type": "category"},
    {"name": "lat", "rename": "store_lat"},
    {"name": "lng", "rename": "store_lng"},
    {"name": "ref_id", "rename": "store_ref_id", "type": "category"},
]
stores_pipeline = [
    {
        "name": "analitico.plugin.DatasetSourcePlugin",
        "dataset_id": "ds_s24_store",
    },
    {
        "name": "analitico.plugin.TransformDataframePlugin",
        "schema": {"columns": stores_columns},
    },
]
stores = factory.run_plugin(action=action, settings=stores_pipeline)

In [None]:
# Preview a few random stores.
sample(stores)

## Load couriers

In [None]:
# Load the couriers dataset and, in the same plugin pipeline, rename its fields to courier_xxx.
couriers_columns = [
    {"name": "id", "rename": "courier_id", "index": True, "type": "category"},
    {"name": "soldo_enabled", "rename": "courier_soldo_enabled", "type": "category"},
    {"name": "area", "rename": "courier_area", "type": "category"},
    {"name": "orders_taken", "rename": "courier_orders_taken"},
    {"name": "orders_sent", "rename": "courier_orders_sent"},
    {"name": "created_at", "rename": "courier_created_at", "type": "datetime"},
]
couriers_pipeline = [
    {
        "name": "analitico.plugin.DatasetSourcePlugin",
        "dataset_id": "ds_s24_courier",
    },
    {
        "name": "analitico.plugin.TransformDataframePlugin",
        "schema": {"columns": couriers_columns},
    },
]
couriers = factory.run_plugin(action=action, settings=couriers_pipeline)

In [None]:
# Preview a few random couriers.
sample(couriers)

## Load customers

In [None]:
# Fetch the raw customers dataset through the dataset source plugin.
customers_source = {
    "name": "analitico.plugin.DatasetSourcePlugin",
    "dataset_id": "ds_s24_customer",
}
customers = factory.run_plugin(action=action, settings=customers_source)

In [None]:
# Preview a few random rows of the raw customers.
sample(customers)

In [None]:
# Rename customer fields to customer_xxx and keep only the columns listed here.
customers_columns = [
    {"name": "id", "rename": "customer_id", "type": "category", "index": True},
    {"name": "province", "rename": "customer_province", "type": "category"},
    {"name": "lat", "rename": "customer_lat"},
    {"name": "lng", "rename": "customer_lng"},
    {"name": "area", "rename": "customer_area", "type": "category"},
    {"name": "ztl", "rename": "customer_ztl", "type": "category"},
    {"name": "ref_id", "rename": "customer_ref_id", "type": "category"},
    {"name": "created_at", "rename": "customer_created_at", "type": "datetime"},
]
customers_transform = {
    "name": "analitico.plugin.TransformDataframePlugin",
    "schema": {"columns": customers_columns},
}
customers = factory.run_plugin(customers, action=action, settings=customers_transform)

In [None]:
# Preview the customers after the rename/retype step.
sample(customers)

## Merge it all together

In [None]:
# Number of valid orders before merging, as a baseline for the row counts printed below.
len(orders)

In [None]:
# Join orders with their per-order aggregates, stores and customers. These are inner
# joins, so unmatched orders are dropped; the row count is printed after each step
# to show how many survive.
merged = pd.merge(orders, aggregates, on="order_id", how="inner")
print("merged details: %d" % len(merged))

merged = pd.merge(merged, stores, on="store_id", how="inner")
print("merged stores: %d" % len(merged))

merged = pd.merge(merged, customers, on="customer_id", how="inner")
print("merged customers: %d" % len(merged))

# add courier information using ad hoc plugin (instead of a plain merge with `couriers`)
plugin = s24.plugin.AugmentCouriersPlugin(factory=factory)
merged = plugin.run(merged, action=action)

# index by order and sort chronologically by order id
merged = merged.set_index("order_id").sort_values(by='order_id', ascending=True)

len(merged)

In [None]:
# Preview a few random rows of the merged table.
sample(merged)
#merged.tail(100)

## Calculate pick, pay and delivery times

In [None]:
def pd_cast_datetime(df, column):
    """Convert ``df[column]`` to datetimes in place.

    Values that cannot be parsed become ``NaT`` (``errors='coerce'``) instead of
    raising. The frame is modified in place and nothing is returned.

    Args:
        df: DataFrame holding the column to convert.
        column: name of the column to convert.
    """
    # `infer_datetime_format` is deprecated since pandas 2.0 (format inference is the
    # default there) and only triggers a warning, so it is no longer passed.
    df[column] = pd.to_datetime(df[column], errors='coerce')

def pd_timediff_min(df, column_start, column_end, column_diff):
    """Store in ``df[column_diff]`` the minutes elapsed from ``column_start`` to ``column_end``.

    Both endpoint columns are first converted to datetimes in place with
    ``pd_cast_datetime``. The frame is modified in place and nothing is returned.
    """
    for endpoint in (column_start, column_end):
        pd_cast_datetime(df, endpoint)

    elapsed = df[column_end] - df[column_start]
    df[column_diff] = elapsed.dt.total_seconds() / 60.0

In [None]:
# Make sure every timestamp involved is a real datetime column
# (order_delivered_at used to be cast twice; once is enough).
pd_cast_datetime(merged, "odt_first_touched_at")
pd_cast_datetime(merged, "odt_last_touched_at")
pd_cast_datetime(merged, "order_paid_at")
pd_cast_datetime(merged, "order_delivered_at")

# calculate time for picking, paying and delivering order, in minutes
pd_timediff_min(merged, "odt_first_touched_at", "odt_last_touched_at", "pick_time.min")
pd_timediff_min(merged, "odt_last_touched_at", "order_paid_at", "pay_time.min")
pd_timediff_min(merged, "order_paid_at", "order_delivered_at", "deliver_time.min")

In [None]:
# Preview the merged table with the new pick/pay/deliver time columns.
sample(merged)

## Save data, schema and samples

In [None]:
# Save the merged table to data.csv through analitico's CSV helper.
# NOTE(review): schema=True / samples=500 presumably also write a schema and a
# 500-row sample alongside the data — confirm against analitico.pandas.pd_to_csv.
analitico.pandas.pd_to_csv(merged, "data.csv", schema=True, samples=500)

In [None]:
# Number of rows in the saved table.
len(merged)

In [None]:
# Plot missingno's nullity matrix to see where the merged table has missing values.
msno.matrix(merged)

In [None]:
# Show the dtype of every column in the merged table.
merged.dtypes

In [None]:
# Summarise the nulls in the merged table.
# pd_print_nulls is not defined in this notebook; presumably it comes from the
# `from analitico.pandas import *` wildcard import.
pd_print_nulls(merged)