## Retail analytics using TPCx-BB Q26
Retail analytics example from [TPCx-BB](http://www.tpc.org/tpcx-bb): in this example, customers are clustered into book-buddy/club groups based on their in-store book purchasing histories, using Bodo's pandas support. The code below builds the per-customer table of purchase counts per item class and prints a checksum of it; it does not run a clustering step itself. For information on acquiring the dataset, see the [TPC website](https://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp).

In [2]:
import numpy as np
import pandas as pd
import bodo
import time

@bodo.jit
def q26_bodo(ss_file, i_file, category, item_count):
    """Build the Q26 per-customer class-count table and print a checksum.

    Reads the store-sales and item tables from pipe-delimited files, keeps the
    items whose ``i_category`` equals ``category`` and joins them to the sales
    rows. For every customer it counts the joined purchases falling into each
    of the item class ids 1 through 15, keeps the customers with more than
    ``item_count`` joined purchases, and prints the sum of the resulting
    table's values together with the time elapsed inside the function.

    Args:
        ss_file: Location of the store-sales file; columns 2 and 3 are read
            as ``ss_item_sk`` and ``ss_customer_sk``.
        i_file: Location of the item file; columns 0, 9 and 12 are read as
            ``i_item_sk``, ``i_class_id`` and ``i_category``.
        category: Value ``i_category`` must equal for an item to be kept.
        item_count: Exclusive lower bound on a customer's purchase count.

    Returns:
        None. The checksum and the execution time are reported with ``print``.
    """
    start = time.time()

    # Load only the two sales columns the query needs.
    sales_schema = {"ss_item_sk": np.int64, "ss_customer_sk": np.int64}
    sales = pd.read_csv(
        ss_file,
        sep="|",
        usecols=[2, 3],
        names=sales_schema.keys(),
        dtype=sales_schema,
    )

    # Load the item key, its class id and its category.
    item_schema = {"i_item_sk": np.int64, "i_class_id": np.int32, "i_category": str}
    items = pd.read_csv(
        i_file,
        sep="|",
        usecols=[0, 9, 12],
        names=item_schema.keys(),
        dtype=item_schema,
    )

    # Restrict to the requested category, then attach item info to each sale.
    selected_items = items[items['i_category'] == category]
    joined = pd.merge(
        sales, selected_items, left_on='ss_item_sk', right_on='i_item_sk'
    )

    class_ids_by_customer = joined.groupby('ss_customer_sk')['i_class_id']
    purchases_per_customer = joined.groupby('ss_customer_sk')['ss_item_sk'].count()

    # One counter per class id 1..15: how many values of a customer's
    # i_class_id group equal that id.
    def id1(class_ids):
        return (class_ids == 1).sum()

    def id2(class_ids):
        return (class_ids == 2).sum()

    def id3(class_ids):
        return (class_ids == 3).sum()

    def id4(class_ids):
        return (class_ids == 4).sum()

    def id5(class_ids):
        return (class_ids == 5).sum()

    def id6(class_ids):
        return (class_ids == 6).sum()

    def id7(class_ids):
        return (class_ids == 7).sum()

    def id8(class_ids):
        return (class_ids == 8).sum()

    def id9(class_ids):
        return (class_ids == 9).sum()

    def id10(class_ids):
        return (class_ids == 10).sum()

    def id11(class_ids):
        return (class_ids == 11).sum()

    def id12(class_ids):
        return (class_ids == 12).sum()

    def id13(class_ids):
        return (class_ids == 13).sum()

    def id14(class_ids):
        return (class_ids == 14).sum()

    def id15(class_ids):
        return (class_ids == 15).sum()

    class_counters = (
        id1, id2, id3, id4, id5,
        id6, id7, id8, id9, id10,
        id11, id12, id13, id14, id15,
    )
    class_counts = class_ids_by_customer.agg(class_counters)

    # Add the total purchase count as an extra column of the table.
    class_counts['ss_item_count'] = purchases_per_customer

    # Keep only customers with strictly more than item_count purchases.
    frequent_buyers = class_counts[class_counts['ss_item_count'] > item_count]

    # Collapse the whole table (including ss_item_count) into one number.
    checksum = frequent_buyers.values.astype(np.float64).sum()
    print("checksum", checksum)
    print("Exec time", time.time() - start)

In [3]:
# Input tables on S3, passed to q26_bodo as the store-sales and item files.
store_sales_file = "s3://bodo-example-data/tpcxbb/SF10/store_sales/store_sales_100.dat"
item_file = "s3://bodo-example-data/tpcxbb/SF10/item/item_100.dat"
# Query parameters: the item category to keep, and the purchase-count
# threshold (a customer is kept only with strictly more purchases than this).
q26_i_category_IN = 'Books'
q26_count_ss_item_sk = 5
# Run the query; q26_bodo prints its checksum and execution time itself.
q26_bodo(store_sales_file, item_file, q26_i_category_IN, q26_count_ss_item_sk)

checksum 0.0
Exec time 2.42650599999979
