# Install libraries

In [None]:
%pip install xgboost==0.81 pandas==0.24.0

In [None]:
%pip install --upgrade google-cloud-core

In [None]:
%pip install --upgrade google-api-core

Restart kernel!

# Get the data in google cloud storage

<p> 
    1. Create a bucket in <a href="https://console.cloud.google.com/storage/">cloud storage</a> to store the raw data in. Remember the name of the bucket.
    
</p>

<p> 
    2. Create a folder named 'instacart' in the bucket
</p>

<p> 
    3. Download the <a href=https://s3.amazonaws.com/instacart-datasets/instacart_online_grocery_shopping_2017_05_01.tar.gz>instacart</a> dataset, unzip it and upload each csv file to the folder created above
</p>

![cloud_storage](img/instacart_cloud_storage.PNG)

# Set up some configs we need

In [31]:
BUCKET = 'avaus-academy-bucket' # The name of the bucket created above
PROJECT = 'avaus-academy' # Your project name here
REGION = 'us-central1'

In [32]:
import os
os.environ['BUCKET'] = BUCKET
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION

# Read CSV and put into bigquery

Create a bigquery client to work with

In [3]:
from google.cloud import bigquery
client = bigquery.Client(project=PROJECT)

Create a dataset to place the CSV files in

In [None]:
dataset_name = 'instacart'
dataset_ref = bigquery.dataset.DatasetReference(project=PROJECT, dataset_id=dataset_name)
dataset = bigquery.Dataset(dataset_ref)

#client.delete_dataset(dataset, delete_contents=True)
dataset = client.create_dataset(dataset)

Create a config to load the CSV files with

In [None]:
job_config = bigquery.LoadJobConfig()
job_config.autodetect = True
files = [
    'aisles.csv',
    'departments.csv',
    'order_products__prior.csv',
    'order_products__train.csv',
    'orders.csv',
    'products.csv',
]

In [None]:
for file in files:
    # Build input path and destination table
    input_path = "gs://{BUCKET}/instacart/{FILE}".format(BUCKET=BUCKET, FILE=file)
    table_name = file.split('.')[0] # Take the name before '.csv' as the name of the table
    table = dataset.table(table_name)

    # Create a job for loading 
    load_job = client.load_table_from_uri(
        source_uris=input_path, 
        destination=table, 
        job_config=job_config
    )
    print("Starting job for loading {FILE} with id={JOB_ID}".format(FILE=file, JOB_ID=load_job.job_id))

    load_job.result()  # Waits for table load to complete.
    print("Job finished.")

    destination_table = client.get_table(dataset_ref.table(table_name))
    print("Loaded {} rows.".format(destination_table.num_rows))
    print("")
    
print("Finished loading all tables!")

![bigquery](img/instacart_bigquery.PNG)

# Create a dataset for ML

Problem: Given a user and their "latest" order, predict how long it will be until the next order

In [None]:
query="""
    CREATE TABLE instacart.feature_set AS
    WITH 
    user_features AS (
        SELECT
            user_id,
            COUNT(order_id) AS nr_orders,
            SUM(days_since_prior_order) AS user_lifetime,
            COALESCE(COUNT(order_id) / NULLIF(SUM(days_since_prior_order), 0), 1) AS nr_orders_per_day,
            AVG(days_since_prior_order) AS avg_nr_days_between_orders,
            COUNT(CASE WHEN order_dow = 0 THEN order_id END) AS nr_orders_saturday,
            COUNT(CASE WHEN order_dow = 1 THEN order_id END) AS nr_orders_sunday,
            COUNT(CASE WHEN order_dow = 2 THEN order_id END) AS nr_orders_monday,
            COUNT(CASE WHEN order_dow = 3 THEN order_id END) AS nr_orders_tuesday,
            COUNT(CASE WHEN order_dow = 4 THEN order_id END) AS nr_orders_wednesday,
            COUNT(CASE WHEN order_dow = 5 THEN order_id END) AS nr_orders_thursday,
            COUNT(CASE WHEN order_dow = 6 THEN order_id END) AS nr_orders_friday,
            COUNT(CASE WHEN order_hour_of_day BETWEEN 5 AND 11 THEN order_id END) AS nr_orders_morning,
            COUNT(CASE WHEN order_hour_of_day BETWEEN 12 AND 17 THEN order_id END) AS nr_orders_afternoon,
            COUNT(CASE WHEN order_hour_of_day BETWEEN 18 AND 22 THEN order_id END) AS nr_orders_evening,
            COUNT(CASE WHEN order_hour_of_day > 22 OR order_hour_of_day < 5 THEN order_id END) AS nr_orders_night 
        FROM instacart.orders
        WHERE eval_set = 'prior'
        GROUP BY
            user_id
    ),
    last_tx AS (
        SELECT
            user_id,
            eval_set,
            CASE WHEN order_dow = 0 THEN 1 ELSE 0 END AS is_saturday_order,
            CASE WHEN order_dow = 1 THEN 1 ELSE 0 END AS is_sunday_order,
            CASE WHEN order_dow = 2 THEN 1 ELSE 0 END AS is_monday_order,
            CASE WHEN order_dow = 3 THEN 1 ELSE 0 END AS is_tuesday_order,
            CASE WHEN order_dow = 4 THEN 1 ELSE 0 END AS is_wednesday_order,
            CASE WHEN order_dow = 5 THEN 1 ELSE 0 END AS is_thursday_order,
            CASE WHEN order_dow = 6 THEN 1 ELSE 0 END AS is_friday_order,
            CASE WHEN order_hour_of_day BETWEEN 5 AND 11 THEN 1 ELSE 0 END AS is_morning_order,
            CASE WHEN order_hour_of_day BETWEEN 12 AND 17 THEN 1 ELSE 0 END AS is_afternoon_order,
            CASE WHEN order_hour_of_day BETWEEN 18 AND 22 THEN 1 ELSE 0 END AS is_evening_order,
            CASE WHEN order_hour_of_day > 22 OR order_hour_of_day < 5 THEN 1 ELSE 0 END AS is_night_order,
            days_since_prior_order,
            ROW_NUMBER() OVER(PARTITION BY user_id, eval_set ORDER BY order_number DESC) AS order_rank
        FROM instacart.orders
    )
    SELECT
        -- Observation key
        lt.user_id,
        
        -- Features about last order
        lt.is_saturday_order,
        lt.is_sunday_order,
        lt.is_monday_order,
        lt.is_tuesday_order,
        lt.is_wednesday_order,
        lt.is_thursday_order,
        lt.is_friday_order,
        lt.is_morning_order,
        lt.is_afternoon_order,
        lt.is_evening_order,
        lt.is_night_order,
        lt.days_since_prior_order,
        
        -- Features about user
        uf.nr_orders,
        uf.user_lifetime,
        uf.nr_orders_per_day,
        uf.avg_nr_days_between_orders,
        uf.nr_orders_saturday,
        uf.nr_orders_sunday,
        uf.nr_orders_monday,
        uf.nr_orders_tuesday,
        uf.nr_orders_wednesday,
        uf.nr_orders_thursday,
        uf.nr_orders_friday,
        uf.nr_orders_morning,
        uf.nr_orders_afternoon,
        uf.nr_orders_evening,
        uf.nr_orders_night,
        
        -- Target label
        target.days_since_prior_order AS days_to_next_order,
        
        -- Train vs test
        RAND() <= 0.8 AS is_train
    FROM last_tx lt
    INNER JOIN user_features uf ON uf.user_id = lt.user_id
    INNER JOIN last_tx target ON target.user_id = lt.user_id AND target.eval_set = 'train' 
    WHERE lt.eval_set = 'prior' AND lt.order_rank = 1 -- Take last transaction in prior set
"""

In [None]:
# Drop if exists
query_drop = client.query("DROP TABLE IF EXISTS instacart.feature_set")
query_drop.result()
print("Dropped table")

# Create table
query_create = client.query(query)
results = query_create.result()
print("Created table")

![feature_set](img/instacart_feature_set.PNG)

# Train a ML model

In [14]:
# Variables
drop_features = ['user_id']
target_label = 'days_to_next_order'
train_sql = """
SELECT
    *
FROM instacart.feature_set
WHERE is_train
"""

In [15]:
# Read data
client = bigquery.Client(project=PROJECT)
df = client.query(train_sql).to_dataframe()

In [16]:
# Create training dataset
import xgboost as xgb
features = df.drop(drop_features + [target_label], axis=1)
labels = df[target_label]
dtrain = xgb.DMatrix(features, labels)

In [17]:
# train model
bst = xgb.train({}, dtrain, 25)

[13:37:57] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[13:37:57] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[13:37:57] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[13:37:58] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[13:37:58] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[13:37:58] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[13:37:58] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[13:37:58] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=6
[13:37:5

In [18]:
# Save model
from google.cloud import storage
model_name = 'model.bst'
bst.save_model(model_name)

In [28]:
# Create folder for the model in GCS
storage_client = storage.Client(project=PROJECT)
bucket = storage_client.bucket(bucket_name=BUCKET)

folder = bucket.blob('instacart/model/')
folder.upload_from_string('')

In [29]:
# Upload the model to GCS
blob = bucket.blob('instacart/model/{MODEL_NAME}'.format(MODEL_NAME=model_name))
blob.upload_from_filename(model_name)

![model](img/instacart_model.PNG)

# Deploy model

In [37]:
%%bash
# Variables
MODEL_PATH="gs://$BUCKET/instacart/model"
VERSION_NAME="xgboost_model_v1_0_0"
MODEL_NAME="xgboost_model"
FRAMEWORK="xgboost"

# Create placeholder for model
gcloud ai-platform models create $MODEL_NAME --regions $REGION

# Create a version of the model
gcloud ai-platform versions create $VERSION_NAME \
  --model $MODEL_NAME \
  --origin $MODEL_PATH \
  --runtime-version=1.14 \
  --framework $FRAMEWORK \
  --python-version=3.5

# Check model
gcloud ai-platform versions describe $VERSION_NAME \
  --model $MODEL_NAME

createTime: '2019-09-16T13:42:35Z'
deploymentUri: gs://avaus-academy-bucket/instacart/model
etag: _frc7oW9h0k=
framework: XGBOOST
isDefault: true
machineType: mls1-c1-m2
name: projects/avaus-academy/models/xgboost_model/versions/xgboost_model_v1_0_0
pythonVersion: '3.5'
runtimeVersion: '1.14'
state: READY


ERROR: (gcloud.ai-platform.models.create) Resource in project [avaus-academy] is the subject of a conflict: Field: model.name Error: A model with the same name already exists.
- '@type': type.googleapis.com/google.rpc.BadRequest
  fieldViolations:
  - description: A model with the same name already exists.
    field: model.name
ERROR: (gcloud.ai-platform.versions.create) ALREADY_EXISTS: Field: version.name Error: A version with the same name already exists.
- '@type': type.googleapis.com/google.rpc.BadRequest
  fieldViolations:
  - description: A version with the same name already exists.
    field: version.name


![model](img/instacart_ai_platform.PNG)

# Predict using API

In [63]:
from oauth2client.client import GoogleCredentials
import requests
import json

MODEL_NAME = 'xgboost_model'
MODEL_VERSION = 'xgboost_model_v1_0_0'

token = GoogleCredentials.get_application_default().get_access_token().access_token
api = "https://ml.googleapis.com/v1/projects/{PROJECT}/models/{MODEL_NAME}/versions/{MODEL_VERSION}:predict"\
    .format(PROJECT=PROJECT, MODEL_NAME=MODEL_NAME, MODEL_VERSION=MODEL_VERSION)

headers = {'Authorization': 'Bearer ' + token }
data = {
  'instances': [
      # "days_to_next_order": 8.0
      [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 8.0, 20, 126.0, 0.15873015873015872, 6.631578947368422, 2, 1, 1, 2, 6, 3, 5, 6, 14, 0, 0],
      
      # "days_to_next_order": 18.0
      [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 30.0, 3, 60.0, 0.05, 30.0, 2, 0, 0, 1, 0, 0, 0, 1, 0, 2, 0]
  ]
}
response = requests.post(api, json=data, headers=headers)
print(response.content)

b'{"predictions": [9.828922271728516, 24.23944854736328]}'
