## Druid 26.0 release notebook

## Mixing code and content

In [None]:
# What's the current version of Druid?
import requests

# Base URL of the Druid service; later cells build their endpoints from it.
druid_host = "http://localhost:8888"
# Shared HTTP session that the later cells in this notebook also use.
session = requests.Session()
endpoint = druid_host + '/status'
response = session.get(endpoint)
# Bound to `status_info` rather than `json` so the name of the standard-library
# `json` module (imported further down) is never occupied by a plain dict.
status_info = response.json()
print("Running on Druid version: "+ status_info["version"])

## Schema auto-discovery

### What would have happened in the past if we had just loaded this data?


In [None]:
# Sample file /Users/will.xu/projects/data_gen
import pandas as pd

# Read the products sample file and preview its first rows.
products_path = '/Users/will.xu/projects/data_gen/products.parquet.gzip'
df = pd.read_parquet(products_path)
df.head()

#### "unit_price" would be loaded as a string, and this makes things like SUM(unit_price) extra slow

In [None]:
import json
from IPython.display import JSON

# Parse ingest_01.json from the working directory into a Python object.
with open('ingest_01.json','r') as spec_file:
    json_data = json.loads(spec_file.read())

# Only comments follow, so this is the cell's last expression and its
# rich JSON view (fully expanded) becomes the cell output.
JSON(json_data,expanded=True)

#endpoint = druid_host + '/druid/indexer/v1/task/'
#response = session.post(endpoint,json = json_data)

In [None]:
import pandas as pd

# Ask Druid which columns it has registered for the 'product_test' table.
endpoint = f"{druid_host}/druid/v2/sql"
sql = '''
SELECT *
FROM "INFORMATION_SCHEMA"."COLUMNS"
WHERE  "TABLE_NAME" = 'product_test'
'''
sql_request = {'query': sql}
sql_response = session.post(endpoint, json=sql_request)
json_data = sql_response.json()

# Flatten the JSON rows into a DataFrame and preview the first rows.
result_df = pd.json_normalize(json_data)
result_df.head()

## String dictionary front-coded compression

How to make Druid data size substantially smaller

In [None]:
import matplotlib.pyplot as plt

# Hard-coded data sizes (GB, per the y-axis label) for each method being compared.
data_size = {"auto": 4.75, "auto with front-coded": 4.13, "fixed":3.75, "fixed with front-coded":3.35}
method = list(data_size.keys())
values = list(data_size.values())

# The x positions and heights are passed directly as lists. The former
# `data=data_size` argument only matters when x/height are given as string
# keys to look up in `data`, so it had no effect and is dropped.
plt.bar(method, values, color = "lightblue")
plt.xlabel("Methods")
plt.ylabel("Data size (GB)")
plt.title("Dictionary compression vs. various formats")
# Fixed y-range so all four bars share a 0-5 GB scale.
plt.ylim([0, 5])

plt.show()

## Shuffle join

### Make it really easy to denormalize data as part of ingestion

In [None]:
# Sample file /Users/will.xu/projects/data_gen
import pandas as pd

# Read the users sample file and preview its first rows.
users_path = '/Users/will.xu/projects/data_gen/users.parquet.gzip'
df = pd.read_parquet(users_path)
df.head()

In [None]:
# Sample file /Users/will.xu/projects/data_gen
import pandas as pd

# Read the transactions sample file and preview its first rows.
transactions_path = '/Users/will.xu/projects/data_gen/transaction.parquet.gzip'
df = pd.read_parquet(transactions_path)
df.head()

#### Before shuffle join support, you needed another tool to prepare the data before ingesting it into Druid
#### With shuffle join support, you only need to run one query!

In [None]:
# SQL text for the ingestion submitted in a later cell.
# - Target: REPLACE INTO "order_transaction3" OVERWRITE ALL, PARTITIONED BY DAY.
# - Inputs: three EXTERN sources ("users", "transactions", "products"), each a
#   local parquet input with an explicit column signature.
# - Shape: every transaction row is LEFT JOINed to its product (prod_id =
#   product_id) and to its user (user_id), so the output is denormalized.
# - "__time" is derived from "timestamp" * 1000 via MILLIS_TO_TIMESTAMP, which
#   presumably means the source column holds epoch seconds (TODO confirm).
query = '''REPLACE INTO "order_transaction3" OVERWRITE ALL
WITH "users" AS (SELECT *
FROM TABLE(
  EXTERN(
    '{"type":"local","baseDir":"/Users/will.xu/projects/data_gen/users.parquet.gzip","filter":"*"}',
    '{"type":"parquet"}',
    '[{"name":"user_id","type":"long"},{"name":"name","type":"string"},{"name":"email","type":"string"},{"name":"username","type":"string"}]'
  )
)),
"transactions" AS (SELECT *
FROM TABLE(
  EXTERN(
    '{"type":"local","baseDir":"/Users/will.xu/projects/data_gen/transaction.parquet.gzip","filter":"*"}',
    '{"type":"parquet"}',
    '[{"name":"quantity","type":"long"},{"name":"total_price","type":"double"},{"name":"user_id","type":"long"},{"name":"product_id","type":"long"},{"name":"unit_price","type":"double"},{"name":"timestamp","type":"long"}]'
  )
)),
"products" AS (SELECT *
FROM TABLE(
  EXTERN(
    '{"type":"local","baseDir":"/Users/will.xu/projects/data_gen/products.parquet.gzip","filter":"*"}',
    '{"type":"parquet"}',
    '[{"name":"unit_price","type":"double"},{"name":"product_name","type":"string"},{"name":"prod_id","type":"long"}]'
  )
))

SELECT
  MILLIS_TO_TIMESTAMP("timestamp" * 1000) AS "__time",
  "products"."product_name",
  "users"."name",
  "users"."email",
  "quantity",
  "total_price",
  "transactions"."user_id",
  "product_id",
  "products"."unit_price"
FROM "transactions"
LEFT JOIN "products" ON
"products"."prod_id" = "transactions"."product_id"
LEFT JOIN "users" ON
"users"."user_id" = "transactions"."user_id"
PARTITIONED BY DAY
'''

### Let's watch the ingestion task running...

In [None]:
# Post the SQL held in `query` to the SQL task endpoint; the response is
# kept in `response` for the following cell.
endpoint = f"{druid_host}/druid/v2/sql/task"
sql_request = {'query': query}
response = session.post(endpoint, json=sql_request)

In [None]:
import time

# Task id returned by the submission request; used to build the status URL.
ingestion_taskId = response.json()['taskId']
endpoint = druid_host + f"/druid/indexer/v1/task/{ingestion_taskId}/status"

# States after which the task will not change any more. Polling must stop on
# either of them; waiting only for "SUCCESS" would spin forever on a failed
# task and make the failure branch below unreachable.
terminal_states = ("SUCCESS", "FAILED")

# Bound to `task_status` rather than `json` so the standard-library `json`
# module name is not replaced by a dict.
task_status = session.get(endpoint).json()
ingestion_status = task_status['status']['status']

if ingestion_status == "RUNNING":
    print("The ingestion is running...")

# Poll once per second, printing a dot per poll as a progress indicator.
while ingestion_status not in terminal_states:
    time.sleep(1)
    task_status = session.get(endpoint).json()
    ingestion_status = task_status['status']['status']
    print('.', end='')

if ingestion_status == "SUCCESS":
    print("\nThe ingestion is complete")
else:
    # Include the full status payload so the failure reason is visible.
    print("\nThe ingestion task failed:", task_status)


### Note that I didn't use any other tools; this is all done within Druid. There is no need to use Spark or Presto for data preparation.

## UNNEST and Arrays

In [None]:
import pandas as pd

# Select a scalar next to an ARRAY literal to show how the array comes back.
endpoint = f"{druid_host}/druid/v2/sql"
sql = '''
SELECT 'post_id_123' AS "POST_ID", ARRAY['almond','blue_berry','muffin'] as "Tags"
'''
sql_request = {'query': sql}
sql_response = session.post(endpoint, json=sql_request)
json_data = sql_response.json()

# Flatten the JSON rows into a DataFrame and preview the first rows.
result_df = pd.json_normalize(json_data)
result_df.head()

In [None]:
import pandas as pd
from IPython.display import JSON, display

endpoint = druid_host + '/druid/v2/sql'
sql = '''SELECT 'post_id_123' as "POST_ID", * FROM UNNEST(ARRAY['almond','blue_berry','muffin']) 
'''
# The request carries an `enableUnnest` flag in its query context alongside the SQL.
sql_request = {'query': sql, 'context':{'enableUnnest': 'true'}}
json_data = session.post(endpoint, json=sql_request).json()

# A bare expression in the middle of a cell produces no output, so the raw
# response is rendered explicitly before the tabular view below.
display(JSON(json_data))

# Flatten the JSON rows into a DataFrame and preview the first rows.
result_df = pd.json_normalize(json_data)
result_df.head()