## Environment Setup
Connecting to backend execution cores:

In [1]:
# Connect to the ipyparallel cluster that was started under the 'mpi' profile.
import ipyparallel as ipp
c = ipp.Client(profile='mpi')
# Slicing the client with [:] yields a view over every engine it can see.
view = c[:]
# Make that view the target of the %%px cell magics used in the cells below.
view.activate()

## TPCxBB_Q26 Example Code
Retail analytics example from [TPCxBB](http://www.tpc.org/tpcx-bb/default.asp): cluster customers into book buddies/club groups based on their in-store book purchasing histories.

In [2]:
%%px --block
# Everything in this cell runs on the ipyparallel engines (%%px), not in the
# local kernel.
import numpy as np
import pandas as pd
import daal4py
# Initialise daal4py before any distributed algorithm is used. The argument is
# presumably the per-process thread count -- TODO confirm against the daal4py
# docs. The value returned by my_procid() is discarded here.
daal4py.daalinit(1); daal4py.my_procid()
import bodo
import sys
import time


@bodo.jit(distributed={'A'})
def q26(ss_file, i_file, category, item_count):
    """Build the per-customer feature matrix for TPCx-BB query 26.

    Reads the store-sales and item tables, keeps only sales of items whose
    ``i_category`` equals ``category``, and counts for every customer how
    many of those sales fall into each item class 1..15, plus the total
    number of matching sales.  Customers whose total is not strictly greater
    than ``item_count`` are dropped.

    Parameters
    ----------
    ss_file : path of the '|'-separated store_sales file; columns 2 and 3
        are read as ``ss_item_sk`` and ``ss_customer_sk``.
    i_file : path of the '|'-separated item file; columns 0, 9 and 12 are
        read as ``i_item_sk``, ``i_class_id`` and ``i_category``.
    category : item category to keep.
    item_count : threshold on the number of matching sales per customer.

    Returns
    -------
    ``A``: a float64 array with 16 columns (the 15 class counts followed by
    the total count) and one row per remaining customer.

    Side effect: prints the elapsed wall-clock time of the call.

    NOTE(review): ``distributed={'A'}`` on the decorator presumably tells Bodo
    to leave the returned array partitioned across processes rather than
    gathering it -- confirm against the Bodo documentation.
    """
    t1 = time.time()
    # Only the item and customer keys of store_sales are needed.
    ss_dtype = {'ss_item_sk': np.int64, 'ss_customer_sk': np.int64}
    store_sales = pd.read_csv(ss_file, sep='|', usecols=[2,3],
        names=ss_dtype.keys(), dtype=ss_dtype)

    i_dtype = {
        'i_item_sk': np.int64, 'i_class_id': np.int32, 'i_category': str}
    item = pd.read_csv(i_file, sep='|', usecols=[0, 9, 12],
        names=i_dtype.keys(), dtype=i_dtype)

    # Restrict to the requested category, then attach the class id of each
    # sold item (inner join: sales of other categories disappear here).
    item2 = item[item['i_category']==category]
    sale_items = pd.merge(
        store_sales, item2, left_on='ss_item_sk', right_on='i_item_sk')

    # Total matching sales per customer, and the per-customer class ids that
    # the aggregations below operate on.
    count1 = sale_items.groupby('ss_customer_sk')['ss_item_sk'].count()
    gp1 = sale_items.groupby('ss_customer_sk')['i_class_id']

    # One user-defined aggregate per item class: idK counts the sales whose
    # class id equals K.  The function names become the result column names.
    def id1(x): return (x==1).sum()
    def id2(x): return (x==2).sum()
    def id3(x): return (x==3).sum()
    def id4(x): return (x==4).sum()
    def id5(x): return (x==5).sum()
    def id6(x): return (x==6).sum()
    def id7(x): return (x==7).sum()
    def id8(x): return (x==8).sum()
    def id9(x): return (x==9).sum()
    def id10(x): return (x==10).sum()
    def id11(x): return (x==11).sum()
    def id12(x): return (x==12).sum()
    def id13(x): return (x==13).sum()
    def id14(x): return (x==14).sum()
    def id15(x): return (x==15).sum()

    customer_i_class = gp1.agg((id1, id2, id3, id4, id5, id6, id7, id8, id9,
        id10, id11, id12, id13, id14, id15))

    # Append the total as the 16th column so it can be used as a filter and
    # is also part of the returned features.
    customer_i_class['ss_item_count'] = count1

    # Keep customers with strictly more than item_count matching sales.
    customer_i_class = customer_i_class[
        customer_i_class.ss_item_count > item_count]
    A = customer_i_class.values.astype(np.float64)
    print("exec time", time.time()-t1)
    return A

In [3]:
%%px --block
import os

# Directory that holds the TPCx-BB tables. Set TPCXBB_DATA_DIR to run the
# notebook on another machine; the fallback is the directory the recorded
# run used, so behaviour is unchanged when the variable is not set.
data_dir = os.environ.get("TPCXBB_DATA_DIR", "/Users/ehsan/dev/sw/data")
ss_file = os.path.join(data_dir, "store_sales_10.dat")
i_file = os.path.join(data_dir, "item_10.dat")
# Query parameters: item category to keep, and the number of purchases a
# customer must exceed to be included.
q26_i_category_IN = 'Books'
q26_count_ss_item_sk = 5
# A is the feature matrix consumed by the k-means cell that follows.
A = q26(ss_file, i_file, q26_i_category_IN, q26_count_ss_item_sk)

[stdout:0] exec time 8.02197103598155


In [4]:
%%px --block
# Number of customer clusters to fit.
num_centroids = 5
# Choose the starting centroids from the feature matrix A returned by q26,
# using daal4py's 'plusPlusDense' initialisation in distributed mode.
centroids = daal4py.kmeans_init(
    num_centroids, method='plusPlusDense', distributed=True).compute(
        A).centroids
# Run distributed k-means from those starting points. The second positional
# argument (30) is presumably the maximum number of iterations -- TODO confirm
# against the daal4py.kmeans signature. Note that `model` ends up holding the
# `.centroids` attribute of the result, not the whole result object.
model = daal4py.kmeans(num_centroids, 30, distributed=True).compute(
    A, centroids).centroids
# Counterpart of the daalinit() call made when daal4py was imported;
# presumably shuts the distributed engine down -- TODO confirm.
daal4py.daalfini()

KeyboardInterrupt: 

## Python Version
The same computation in regular Python takes a long time, especially since user-defined aggregate functions are used.

In [9]:
import bodo
import numpy as np
import pandas as pd
import sys
import time


def q26(ss_file, i_file, category, item_count):
    """Run TPCx-BB query 26 with plain pandas and print a checksum.

    Loads the store-sales and item tables, keeps sales of items whose
    ``i_category`` equals ``category``, and builds one row per customer with
    the number of sales in each item class 1..15 followed by the total number
    of matching sales.  Customers whose total is not strictly greater than
    ``item_count`` are discarded.

    Parameters
    ----------
    ss_file : path of the '|'-separated store_sales file (columns 2 and 3
        are read as ``ss_item_sk`` and ``ss_customer_sk``).
    i_file : path of the '|'-separated item file (columns 0, 9 and 12 are
        read as ``i_item_sk``, ``i_class_id`` and ``i_category``).
    category : item category to keep.
    item_count : threshold on the number of matching sales per customer.

    Returns
    -------
    None.  Prints ``checksum <sum of all feature values>`` and
    ``exec time <seconds>``.
    """
    started = time.time()

    # Only the item and customer keys of store_sales are needed.
    sales_schema = {'ss_item_sk': np.int64, 'ss_customer_sk': np.int64}
    sales = pd.read_csv(
        ss_file, sep='|', usecols=[2, 3],
        names=sales_schema.keys(), dtype=sales_schema)

    item_schema = {
        'i_item_sk': np.int64, 'i_class_id': np.int32, 'i_category': str}
    items = pd.read_csv(
        i_file, sep='|', usecols=[0, 9, 12],
        names=item_schema.keys(), dtype=item_schema)

    # Inner join against the category-filtered items: sales of any other
    # category drop out here.
    wanted_items = items[items['i_category'] == category]
    purchases = pd.merge(
        sales, wanted_items, left_on='ss_item_sk', right_on='i_item_sk')

    per_customer = purchases.groupby('ss_customer_sk')
    purchase_totals = per_customer['ss_item_sk'].count()

    def class_counter(class_id):
        """Return an aggregate named ``id<class_id>`` counting that class."""
        def counter(x):
            return (x == class_id).sum()
        # pandas labels each result column after the function's __name__, and
        # requires the names in one agg() call to be distinct.
        counter.__name__ = 'id%d' % class_id
        return counter

    # Same fifteen user-defined aggregates as id1..id15, in the same order.
    counters = tuple(class_counter(k) for k in range(1, 16))
    features = per_customer['i_class_id'].agg(counters)

    # The total becomes the 16th column: it drives the filter below and is
    # part of the checksum.
    features['ss_item_count'] = purchase_totals
    frequent = features[features.ss_item_count > item_count]

    res = frequent.values.astype(np.float64).sum()
    print("checksum", res)
    print("exec time", time.time() - started)

In [10]:
import os

# Directory that holds the TPCx-BB tables. Set TPCXBB_DATA_DIR to run the
# notebook on another machine; the fallback is the directory the recorded
# run used, so behaviour is unchanged when the variable is not set.
data_dir = os.environ.get("TPCXBB_DATA_DIR", "/Users/ehsan/dev/sw/data")
ss_file = os.path.join(data_dir, "store_sales_10.dat")
i_file = os.path.join(data_dir, "item_10.dat")
# Query parameters: item category to keep, and the number of purchases a
# customer must exceed to be included.
q26_i_category_IN = 'Books'
q26_count_ss_item_sk = 5
# Prints the checksum and the elapsed time; nothing is returned.
q26(ss_file, i_file, q26_i_category_IN, q26_count_ss_item_sk)

checksum 3380326.0
exec time 715.5162229537964


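This computation takes 715.5 seconds in regular Python but only 3.27 seconds with Bodo (a roughly 220x speedup on 8 cores). Note that the Bodo run recorded above reports an execution time of 8.02 seconds, which does not match the 3.27-second figure; re-run both cells on the same machine to confirm the speedup.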

## Spark/Scala/SQL Version
The Spark/Scala/SQL version of the same code is much more complicated, and is also much slower.

In [None]:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
import scala.language.existentials

import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.joins._

import org.apache.spark.ml.clustering.{KMeansModel, KMeans}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import scala.language.reflectiveCalls
import java.lang.management.ManagementFactory
import scala.collection.JavaConversions._

/**
 * Spark SQL version of TPCx-BB query 26: for every customer, count the
 * purchased "Books" items per item class (1..15) and assemble the counts into
 * a feature vector.
 */
object Query26 {
  /**
   * Runs the query and prints the elapsed wall-clock time in seconds.
   *
   * @param args args(0) is the path of the '|'-separated store_sales file,
   *             args(1) is the path of the '|'-separated item file
   * @throws IllegalArgumentException if fewer than two paths are given
   */
  def main(args: Array[String]): Unit = {
    // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
    require(args.length >= 2, "usage: Query26 <store_sales path> <item path>")

    // Starting time
    val t0 = System.currentTimeMillis
    val spark = SparkSession
      .builder()
      .appName("Q26")
      // -1 disables automatic broadcast joins.
      .config("spark.sql.autoBroadcastJoinThreshold", "-1")
      .getOrCreate()

    // Release the session even when reading the inputs or the query fails.
    try {
      import spark.implicits._
      val table_store_sales_path = args(0)
      val table_item_path = args(1)

      val schema_store_sales = StructType(Array(
        StructField("ss_sold_date_sk", LongType, true),
        StructField("ss_sold_time_sk", LongType, true),
        StructField("ss_item_sk", LongType, true),
        StructField("ss_customer_sk", LongType, true),
        StructField("ss_cdemo_sk", LongType, true),
        StructField("ss_hdemo_sk", LongType, true),
        StructField("ss_addr_sk", LongType, true),
        StructField("ss_store_sk", LongType, true),
        StructField("ss_promo_sk", LongType, true),
        StructField("ss_ticket_number", LongType, true),
        StructField("ss_quantity", IntegerType, true),
        StructField("ss_wholesale_cost", FloatType, true),
        StructField("ss_list_price", FloatType, true),
        StructField("ss_sales_price", FloatType, true),
        StructField("ss_ext_discount_amt", FloatType, true),
        StructField("ss_ext_sales_price", FloatType, true),
        StructField("ss_ext_wholesale_cost", FloatType, true),
        StructField("ss_ext_list_price", FloatType, true),
        StructField("ss_ext_tax", FloatType, true),
        StructField("ss_coupon_amt", FloatType, true),
        StructField("ss_net_paid", FloatType, true),
        StructField("ss_net_paid_inc_tax", FloatType, true),
        StructField("ss_net_profit", FloatType, true)
        ))
      val df_store_sales = spark.read.schema(schema_store_sales).format("csv").option("sep", "|").load(table_store_sales_path)

      val schema_item = StructType(Array(
        StructField("i_item_sk", LongType, true),
        StructField("i_item_id", StringType, true),
        StructField("i_rec_start_date", StringType, true),
        StructField("i_rec_end_date", StringType, true),
        StructField("i_item_desc", StringType, true),
        StructField("i_current_price", FloatType, true),
        StructField("i_wholesale_cost", FloatType, true),
        StructField("i_brand_id", IntegerType, true),
        StructField("i_brand", StringType, true),
        StructField("i_class_id", IntegerType, true),
        StructField("i_class", StringType, true),
        StructField("i_category_id", IntegerType, true),
        StructField("i_category", StringType, true),
        StructField("i_manufact_id", IntegerType, true),
        StructField("i_manufact", StringType, true),
        StructField("i_size", StringType, true),
        StructField("i_formulation", StringType, true),
        StructField("i_color", StringType, true),
        StructField("i_units", StringType, true),
        StructField("i_container", StringType, true),
        StructField("i_manager_id", IntegerType, true),
        StructField("i_product_name", StringType, true)
        ))

      val df_item = spark.read.schema(schema_item).format("csv").option("sep", "|").load(table_item_path)

      // createOrReplaceTempView replaces registerTempTable, which has been
      // deprecated since Spark 2.0.
      df_store_sales.createOrReplaceTempView("store_sales_table")
      df_item.createOrReplaceTempView("item_table")

      // count(CASE ... ELSE NULL END) counts only the rows of one class,
      // because count() skips NULLs. The category is a single-quoted literal
      // so it can never be parsed as an identifier.
      val fin  = spark.sql("""
    SELECT
  ss.ss_customer_sk AS cid,
  count(CASE WHEN i.i_class_id=1  THEN 1 ELSE NULL END) AS id1,
  count(CASE WHEN i.i_class_id=2  THEN 1 ELSE NULL END) AS id2,
  count(CASE WHEN i.i_class_id=3  THEN 1 ELSE NULL END) AS id3,
  count(CASE WHEN i.i_class_id=4  THEN 1 ELSE NULL END) AS id4,
  count(CASE WHEN i.i_class_id=5  THEN 1 ELSE NULL END) AS id5,
  count(CASE WHEN i.i_class_id=6  THEN 1 ELSE NULL END) AS id6,
  count(CASE WHEN i.i_class_id=7  THEN 1 ELSE NULL END) AS id7,
  count(CASE WHEN i.i_class_id=8  THEN 1 ELSE NULL END) AS id8,
  count(CASE WHEN i.i_class_id=9  THEN 1 ELSE NULL END) AS id9,
  count(CASE WHEN i.i_class_id=10 THEN 1 ELSE NULL END) AS id10,
  count(CASE WHEN i.i_class_id=11 THEN 1 ELSE NULL END) AS id11,
  count(CASE WHEN i.i_class_id=12 THEN 1 ELSE NULL END) AS id12,
  count(CASE WHEN i.i_class_id=13 THEN 1 ELSE NULL END) AS id13,
  count(CASE WHEN i.i_class_id=14 THEN 1 ELSE NULL END) AS id14,
  count(CASE WHEN i.i_class_id=15 THEN 1 ELSE NULL END) AS id15
FROM store_sales_table ss
INNER JOIN item_table i
  ON (ss.ss_item_sk = i.i_item_sk AND i.i_category = 'Books'
  AND ss.ss_customer_sk IS NOT NULL
)
GROUP BY ss.ss_customer_sk
HAVING count(ss.ss_item_sk) > 5
    """)
      // Pack the fifteen per-class counts into a single "features" vector column.
      val assembler = new VectorAssembler().setInputCols(
        Array("id1", "id2", "id3","id4","id5","id6","id7",
        "id8","id9","id10","id11","id12","id13","id14","id15")).setOutputCol("features")
      val ds = assembler.transform(fin)
      // Spark is lazy: first() on the cached dataset forces the query to run
      // so that the time measured below includes it.
      ds.cache.first
      val t1 = System.currentTimeMillis

      // Measure time
      println("Query 26 time(s) took: " + (t1 - t0).toFloat / 1000)
    } finally {
      spark.stop()
    }
  }
}
