## Druid 26.0 release notebook

You'll need the following dependencies:

pandas, requests

In [11]:
# What's the current version of Druid?
import requests

# Base URL of the Druid service; the later cells build their endpoints on it.
druid_host = "http://localhost:8888"
# Session object reused by the rest of the notebook's requests.
session = requests.Session()
endpoint = druid_host + '/status'
response = session.get(endpoint)
# Keep the decoded payload under its own name: binding it to `json` would
# shadow the standard-library module that a later cell imports.
status_info = response.json()
print("Running on Druid version: "+ status_info["version"])

Running on Druid version: 26.0.0


## Schema auto-discovery

### What would happen in the past if we just load this data?


In [24]:
import json
from IPython.display import JSON, display

# Batch ingestion spec for the sample Wikipedia data set. No dimensions are
# listed explicitly; `useSchemaDiscovery` is switched on instead so that the
# column types are inferred during ingestion.
ingestion_spec = {
  "type": "index_parallel",
  "spec": {
    "ioConfig": {
      "type": "index_parallel",
      "inputSource": {
        "type": "http",
        "uris": ["https://druid.apache.org/data/wikipedia.json.gz"],
        "filter": "*"
      },
      "inputFormat": {
        "type": "json"
      }
    },
    "tuningConfig": {
      "type": "index_parallel",
      "partitionsSpec": {
        "type": "dynamic"
      },
      "indexSpec": {
        "stringDictionaryEncoding": {
          "type": "frontCoded",
          "bucketSize": 16
        }
      }
    },
    "dataSchema": {
      "dataSource": "wikipedia",
      # Only a fallback value is given here, no column name.
      # NOTE(review): presumably applied to rows lacking a timestamp - confirm.
      "timestampSpec": {
        "missingValue": "2010-01-01T00:00:00Z"
      },
      "dimensionsSpec": {
        "dimensions": [],
        "dimensionExclusions": [],
        "spatialDimensions": [],
        "useSchemaDiscovery": True
      },
      "granularitySpec": {
        "queryGranularity": "none",
        "rollup": False
      }
    }
  }
}

# A bare expression is rendered only when it is the last line of a cell, so
# the spec has to be displayed explicitly to show up in the output.
display(JSON(ingestion_spec, expanded=True))

# Submit the spec to the task endpoint; `response` holds the server's reply.
endpoint = druid_host + '/druid/indexer/v1/task/'
response = session.post(endpoint,json = ingestion_spec)


Note that because we've set `"useSchemaDiscovery": True` in the ingestion spec, even though we didn't specify any data types for the columns, they are correctly inferred. Look at the code example below:

In [28]:
import pandas as pd

# List the columns Druid reports for the `wikipedia` table, so the data
# types chosen during ingestion can be inspected.
sql = '''
SELECT *
FROM "INFORMATION_SCHEMA"."COLUMNS"
WHERE  "TABLE_NAME" = 'wikipedia'
'''
endpoint = f"{druid_host}/druid/v2/sql"
sql_request = dict(query=sql)
http_reply = session.post(endpoint, json=sql_request)
json_data = http_reply.json()
# Turn the decoded rows into a DataFrame and preview the first few.
result_df = pd.json_normalize(json_data)
result_df.head()

Unnamed: 0,TABLE_CATALOG,TABLE_SCHEMA,TABLE_NAME,COLUMN_NAME,ORDINAL_POSITION,COLUMN_DEFAULT,IS_NULLABLE,DATA_TYPE,CHARACTER_MAXIMUM_LENGTH,CHARACTER_OCTET_LENGTH,NUMERIC_PRECISION,NUMERIC_PRECISION_RADIX,NUMERIC_SCALE,DATETIME_PRECISION,CHARACTER_SET_NAME,COLLATION_NAME,JDBC_TYPE
0,druid,druid,wikipedia,__time,1,,NO,TIMESTAMP,,,,,,3.0,,,93
1,druid,druid,wikipedia,isRobot,2,,NO,BIGINT,,,19.0,10.0,0.0,,,,-5
2,druid,druid,wikipedia,channel,3,,YES,VARCHAR,,,,,,,UTF-16LE,UTF-16LE$en_US$primary,12
3,druid,druid,wikipedia,flags,4,,YES,VARCHAR,,,,,,,UTF-16LE,UTF-16LE$en_US$primary,12
4,druid,druid,wikipedia,isUnpatrolled,5,,NO,BIGINT,,,19.0,10.0,0.0,,,,-5


As you can see in the `DATA_TYPE` column, the different data types are correctly detected, and not everything is stored as a string.

# Shuffle join

### Make it really easy to denormalize data as part of ingestion
Before shuffle joins were supported, you needed to use another tool to prepare the data and then ingest it into Druid. With shuffle join support, you can do the same transformation with one query.

In [None]:
# SQL statement that reads three external Parquet inputs (users, transactions,
# products), joins them, and writes the denormalized rows into the
# "order_transaction3" table.
# All three inputs are read through the `local` input source with a `baseDir`.
# The "users" input previously declared an `http` source with a filesystem
# path in `uris`, which did not match the other two inputs.
# NOTE(review): the paths are absolute and machine-specific - adjust them to
# wherever the sample files live before running.
query = '''REPLACE INTO "order_transaction3" OVERWRITE ALL
WITH "users" AS (SELECT *
FROM TABLE(
  EXTERN(
    '{"type":"local","baseDir":"/Users/will.xu/projects/data_gen/users.parquet.gzip","filter":"*"}',
    '{"type":"parquet"}',
    '[{"name":"user_id","type":"long"},{"name":"name","type":"string"},{"name":"email","type":"string"},{"name":"username","type":"string"}]'
  )
)),
"transactions" AS (SELECT *
FROM TABLE(
  EXTERN(
    '{"type":"local","baseDir":"/Users/will.xu/projects/data_gen/transaction.parquet.gzip","filter":"*"}',
    '{"type":"parquet"}',
    '[{"name":"quantity","type":"long"},{"name":"total_price","type":"double"},{"name":"user_id","type":"long"},{"name":"product_id","type":"long"},{"name":"unit_price","type":"double"},{"name":"timestamp","type":"long"}]'
  )
)),
"products" AS (SELECT *
FROM TABLE(
  EXTERN(
    '{"type":"local","baseDir":"/Users/will.xu/projects/data_gen/products.parquet.gzip","filter":"*"}',
    '{"type":"parquet"}',
    '[{"name":"unit_price","type":"double"},{"name":"product_name","type":"string"},{"name":"prod_id","type":"long"}]'
  )
))

SELECT
  MILLIS_TO_TIMESTAMP("timestamp" * 1000) AS "__time",
  "products"."product_name",
  "users"."name",
  "users"."email",
  "quantity",
  "total_price",
  "transactions"."user_id",
  "product_id",
  "products"."unit_price"
FROM "transactions"
LEFT JOIN "products" ON
"products"."prod_id" = "transactions"."product_id"
LEFT JOIN "users" ON
"users"."user_id" = "transactions"."user_id"
PARTITIONED BY DAY
'''

### Let's watch the ingestion task running...

In [32]:
# Post the statement held in `query` to Druid's SQL task endpoint; the reply
# is kept in `response` for the status-polling cell that follows.
sql_request = dict(query=query)
endpoint = f"{druid_host}/druid/v2/sql/task"
response = session.post(endpoint, json=sql_request)

NameError: name 'query' is not defined

In [33]:
import time

# The submit reply carries the id of the task whose status is polled below.
ingestion_taskId = response.json()['taskId']
endpoint = druid_host + f"/druid/indexer/v1/task/{ingestion_taskId}/status"

# Keep the decoded payload under its own name rather than rebinding `json`,
# which would shadow the standard-library module.
task_status = session.get(endpoint).json()
ingestion_status = task_status['status']['status']

if ingestion_status == "RUNNING":
    print("The ingestion is running...")

# Poll once per second until the task reaches a terminal state. Waiting only
# for "SUCCESS" would spin forever on a failed task and leave the failure
# branch below unreachable.
while ingestion_status not in ("SUCCESS", "FAILED"):
    time.sleep(1)
    task_status = session.get(endpoint).json()
    ingestion_status = task_status['status']['status']
    print('.', end='')

if ingestion_status == "SUCCESS":
    print("\nThe ingestion is complete")
else:
    # Show the whole status payload so the cause of the failure is visible.
    print("\nThe ingestion task failed:", task_status)


KeyError: 'taskId'

### Note that I didn't use any other tools; this was all done within Druid. There is no need to use Spark/Presto for data prep.

## UNNEST and Arrays

UNNEST is useful for dealing with array data: it lets you "explode" an array into individual rows.

In [30]:
import pandas as pd

# Select a single row whose "Tags" column is an array literal, to show what
# the data looks like before it is unnested.
sql = '''
SELECT 'post_id_123' AS "POST_ID", ARRAY['almond','blue_berry','muffin'] as "Tags"
'''
endpoint = f"{druid_host}/druid/v2/sql"
sql_request = dict(query=sql)
http_reply = session.post(endpoint, json=sql_request)
json_data = http_reply.json()
# Turn the decoded rows into a DataFrame and preview them.
result_df = pd.json_normalize(json_data)
result_df.head()

Unnamed: 0,POST_ID,Tags
0,post_id_123,"[""almond"",""blue_berry"",""muffin""]"


In [31]:
import pandas as pd
from IPython.display import JSON, display

# Run the same array through UNNEST so each element comes back as its own row.
endpoint = druid_host + '/druid/v2/sql'
sql = '''SELECT 'post_id_123' as "POST_ID", * FROM UNNEST(ARRAY['almond','blue_berry','muffin']) 
'''
# The query context sets the `enableUnnest` flag for this request.
sql_request = {'query': sql, 'context':{'enableUnnest': 'true'}}
json_data = session.post(endpoint, json=sql_request).json()
# A bare `JSON(...)` expression in the middle of a cell renders nothing, so
# the raw reply is displayed explicitly before the tabular view.
display(JSON(json_data))
result_df = pd.json_normalize(json_data)
result_df.head()

Unnamed: 0,POST_ID,EXPR$0
0,post_id_123,almond
1,post_id_123,blue_berry
2,post_id_123,muffin
