### Multi-source queries
* single subquery, multi-source joining done in combiner
* multi-source joining and aggregation done in combiner
* union of single source
* joining tables from 3 sources
* union of multi-source & aggregation

In [1]:
import logging
import os
import sys

# The engine module is imported from the parent directory of this notebook.
sys.path.append('../')
from hybrid_engine import HybridEngine

# Connection settings for the three backends. Every value can be overridden
# through an environment variable so hosts and credentials do not have to be
# edited (or committed) in the notebook; the defaults reproduce the original
# local development setup.
POSTGRES_CONFIG = {
    'server': os.environ.get('HYBRID_PG_HOST', 'localhost'),
    'port': int(os.environ.get('HYBRID_PG_PORT', '5432')),
    'database': os.environ.get('HYBRID_PG_DATABASE', 'SQLBook'),
    'user': os.environ.get('HYBRID_PG_USER', 'postgres'),
    'password': os.environ.get('HYBRID_PG_PASSWORD', ''),
}
ASTERIX_CONFIG = {
    'server': os.environ.get('HYBRID_ASTERIX_HOST', 'localhost'),
    'port': int(os.environ.get('HYBRID_ASTERIX_PORT', '19002')),
    'dataverse': os.environ.get('HYBRID_ASTERIX_DATAVERSE', 'TinySocial'),
}
SOLR_CONFIG = {
    'server': os.environ.get('HYBRID_SOLR_HOST', 'localhost'),
    'port': int(os.environ.get('HYBRID_SOLR_PORT', '8983')),
    'core': os.environ.get('HYBRID_SOLR_CORE', 'bookstore'),
}

# Single engine instance shared by every query cell below.
# NOTE(review): the original cell also carried a commented-out
# `qe = HybridEngine()`; presumably the constructor has built-in defaults —
# confirm before relying on the no-argument form.
qe = HybridEngine(
    postgres=POSTGRES_CONFIG,
    asterix=ASTERIX_CONFIG,
    solr=SOLR_CONFIG,
)

In [4]:
# Single subquery whose join spans two sources: three postgres tables
# (orders, orderlines, products) and asterix.categoryflat. Per the notebook
# outline, the cross-source join is done in the combiner.
# The rule body also filters on pid, date and category and requests `limit 100`.
# NOTE(review): the Asterix sample output shows `category` as a list-like
# value, while the filter compares it with a ';'-joined string — confirm the
# filter matches what is intended.
datalog = '''
Ans(pid, nunits, date, asin, category) <-
  postgres.orders(oid, _, _, _, _, _, _, _, _, _, _),
  postgres.orderlines(olid, oid, pid, _, date, _, nunits, _),
  postgres.products(pid, _, _, _, _, _, asin, nodeid),
  asterix.categoryflat(nodeid, category),
  pid > 1000, date > '2015-01-01', category = 'Education;Children & Teens',
  limit 100
'''
# DEBUG logging makes the engine print the parsed query, the parser structure
# and per-engine sample results.
result = qe.queryDatalogRaw(datalog, loglevel=logging.DEBUG)
# Preview the first five rows, then show the (rows, columns) shape of the result.
print(result[:5])
result.shape

<qe.HybridEngine> query datalog:
[{'condition': ['pid > 1000',
                " date > '2015-01-01'",
                " category = 'Education;Children & Teens'"],
  'limit': '100',
  'result': 'Ans(pid,nunits,date,asin,category)',
  'table': ['postgres.orders(oid, _, _, _, _, _, _, _, _, _, _)',
            'postgres.orderlines(olid, oid, pid, _, date, _, nunits, _)',
            'postgres.products(pid, _, _, _, _, _, asin, nodeid)',
            'asterix.categoryflat(nodeid, category)']}]

<qe.DatalogParser> parser structure:
{'_result': 'Ans(pid,nunits,date,asin,category)',
 'aggregation': {},
 'column_to_table': {'_': {'postgres': 'products'},
                     'asin': {'postgres': 'products'},
                     'category': {'asterix': 'categoryflat'},
                     'date': {'postgres': 'orderlines'},
                     'nodeid': {'asterix': 'categoryflat',
                                'postgres': 'products'},
                     'nunits': {'postgres': 'orderlines

<qe.AsterixEngine> query sample result:
                                            category      nodeid
0  [Christian Books & Bibles, Education, Children...  7288554011
1  [Religion & Spirituality, Christian Books & Bi...  7288554011
(2, 2)
<qe.HybridEngine> arbiter submit query to postgres
{'column_idx': {'orderlines': {'_': 7,
                               'date': 4,
                               'nunits': 6,
                               'oid': 1,
                               'olid': 0,
                               'pid': 2},
                'orders': {'_': 10, 'oid': 0},
                'products': {'_': 5, 'asin': 6, 'nodeid': 7, 'pid': 0}},
 'columns': {'categoryflat': ['nodeid', 'category'],
             'orderlines': ['olid',
                            'oid',
                            'pid',
                            '_',
                            'date',
                            '_',
                            'nunits',
                            '_'],
    

     pid  nunits        date        asin  \
0  11754       1  2015-10-20  0006499791   
1  11754       1  2015-10-23  0006499791   
2  11754       0  2016-07-12  0006499791   
3  11754       1  2015-01-21  0006499791   
4  11754       1  2016-04-23  0006499791   

                                            category  
0  [Christian Books & Bibles, Education, Children...  
1  [Christian Books & Bibles, Education, Children...  
2  [Christian Books & Bibles, Education, Children...  
3  [Christian Books & Bibles, Education, Children...  
4  [Christian Books & Bibles, Education, Children...  


(10, 5)

In [5]:
# Multi-source aggregation (aggregation done in the combiner).
# Rule 1 defines the view product_order_view over three postgres tables and
# asterix.categorylevel; the engine log reports the cross-source join on `nodeid`.
# Rule 2 groups the view by `lvl1`, counting `oid` into total_order and summing
# `price` into total_value.
datalog = '''
product_order_view(pid, oid, lvl1, price) <-
  postgres.orders(oid, _, _, _, _, _, _, _, _, _, _),
  postgres.orderlines(olid, oid, pid, _, date, _, nunits, price),
  postgres.products(pid, _, _, _, _, _, asin, nodeid),
  asterix.categorylevel(nodeid, lvl1, _, _, _, _)
ans(lvl1, total_order, total_value) <-
  setof({pid}, {oid}, {price}, view.product_order_view(pid, oid, lvl1, price), S), count(oid, total_order), sum(price, total_value)
'''
# DEBUG logging prints the parsed query, generated SQL and combiner steps.
result = qe.queryDatalogRaw(datalog, loglevel=logging.DEBUG)
# Preview the first five aggregated rows, then show the (rows, columns) shape.
print(result[:5])
result.shape

<qe.HybridEngine> query datalog:
[{'condition': [],
  'groupby': {'aggregation': ['count(oid,total_order)',
                              'sum(price,total_value)'],
              'key': 'lvl1'},
  'result': 'ans(lvl1,total_order,total_value)',
  'table': ['postgres.orders(oid, _, _, _, _, _, _, _, _, _, _)',
            'postgres.orderlines(olid, oid, pid, _, date, _, nunits, price)',
            'postgres.products(pid, _, _, _, _, _, asin, nodeid)',
            'asterix.categorylevel(nodeid, lvl1, _, _, _, _)']}]

<qe.DatalogParser> parser structure:
{'_result': 'ans(lvl1,total_order,total_value)',
 'aggregation': {'total_order': ('count', 'oid'),
                 'total_value': ('sum', 'price')},
 'column_to_table': {'_': {'asterix': 'categorylevel', 'postgres': 'products'},
                     'asin': {'postgres': 'products'},
                     'date': {'postgres': 'orderlines'},
                     'lvl1': {'asterix': 'categorylevel'},
                     'nodeid': {'asterix'

<qe.PostgresEngine> query sql cmd:
SELECT orderlines.orderid as oid, orderlines.totalprice as price, products.nodeid
FROM orders, orderlines, products
WHERE orderlines.productid=products.productid AND orders.orderid=orderlines.orderid

<qe.PostgresEngine> query sample result:
       oid   price      nodeid
0  1008493   $0.00  8883853011
1  1010191  $10.00        2946
2  1008576  $22.50  8944264011
3  1010227   $7.50      882340
4  1009124  $22.50  8944264011
(286017, 3)
<qe.Combiner> combining result for single subquery
<qe.Combiner> join result on:
{'nodeid': ['asterix.categorylevel', 'postgres.products']}

<qe.Combiner> group and aggregate result:
{'column': 'lvl1', 'source': 'asterix', 'table': 'categorylevel'}
{'total_order': ('count', 'oid'), 'total_value': ('sum', 'price')}



                       lvl1  total_order  total_value
0        Arts & Photography        20188  220576901.0
1     Biographies & Memoirs         2279   57533260.0
2          Business & Money         6742  191967915.0
3          Children's Books       167789  563631783.0
4  Christian Books & Bibles         3207  109730203.0


(29, 3)

In [3]:
# Union of single source: two rules share the head Ans(pid, nodeid) and both
# read postgres.products, with overlapping pid ranges; the combiner log
# reports a union of the two result frames.
# NOTE(review): the logged SQL for each rule shows only the upper pid bound in
# its WHERE clause — verify that the lower bounds (pid > 10000, pid > 10015)
# are actually applied by the engine.
datalog = '''
Ans(pid, nodeid) <-
  postgres.products(pid, _, _, _, _, _, asin, nodeid),
  pid > 10000, pid < 10020
Ans(pid, nodeid) <-
  postgres.products(pid, _, _, _, _, _, asin, nodeid),
  pid > 10015, pid < 10025
'''
# INFO logging is used here instead of DEBUG.
result = qe.queryDatalogRaw(datalog, loglevel=logging.INFO)
# Display the full unioned result (the log reports an output shape of (24, 2)).
result

<qe.HybridEngine> query datalog:
[{'condition': ['pid > 10000', ' pid < 10020'],
  'result': 'Ans(pid,nodeid)',
  'table': ['postgres.products(pid, _, _, _, _, _, asin, nodeid)']},
 {'condition': ['pid > 10015', ' pid < 10025'],
  'result': 'Ans(pid,nodeid)',
  'table': ['postgres.products(pid, _, _, _, _, _, asin, nodeid)']}]

<qe.PostgresEngine> query sql cmd:
SELECT products.productid as pid, products.nodeid
FROM products
WHERE products.productid < '10020'

<qe.PostgresEngine> query sql cmd:
SELECT products.productid as pid, products.nodeid
FROM products
WHERE products.productid < '10025'

<qe.Combiner> union 2 dataframes with shape: [(19, 2), (24, 2)], output shape: (24, 2)



Unnamed: 0,pid,nodeid
0,10001,1
1,10002,173508
2,10003,266162
3,10004,720870
4,10005,3564986011
5,10006,882340
6,10007,1007
7,10008,1002
8,10009,4539344011
9,10010,266160


In [2]:
# Join tables from three sources: postgres (orderlines, products), asterix
# (categorylevel) and solr (review_text).
# Rule 1 defines the view product_cat_view; rule 2 queries that view in the
# combiner, grouping by `cat` and summing `price` into total_cat_value.
# The last body atom of rule 1 carries no trailing comma, consistent with the
# other multi-rule queries in this notebook.
datalog = '''
product_cat_view(pid, cat, nunits, price, length) <-
  postgres.orderlines(olid, oid, pid, _, date, _, nunits, price),
  postgres.products(pid, _, _, _, _, _, asin, nodeid),
  asterix.categorylevel(nodeid, cat, _, _, _, _),
  solr.review_text(reviewid, asin, length, avg_word_length, number_word_capital, number_exlamation_question, avg_sentence_length, tfidf, reviewText)
Ans(cat, total_cat_value) <-
  setof({pid}, {nunits}, {price}, {length}, view.product_cat_view(pid, cat, nunits, price, length), S), sum(price, total_cat_value)
'''
# INFO logging is used here instead of DEBUG.
result = qe.queryDatalogRaw(datalog, loglevel=logging.INFO)
# Display the aggregated result.
result

<qe.HybridEngine> query datalog:
[{'condition': [],
  'groupby': {'aggregation': ['sum(price,total_cat_value)'], 'key': 'cat'},
  'result': 'Ans(cat,total_cat_value)',
  'table': ['postgres.orderlines(olid, oid, pid, _, date, _, nunits, price)',
            'postgres.products(pid, _, _, _, _, _, asin, nodeid)',
            'asterix.categorylevel(nodeid, cat, _, _, _, _)',
            'solr.review_text(reviewid, asin, length, avg_word_length, '
            'number_word_capital, number_exlamation_question, '
            'avg_sentence_length, tfidf, reviewText)']}]

<qe.AsterixEngine> query sql++ cmd:
use TinySocial;
WITH categorylevel as (
select c.nodeID AS nodeid, 
c.category.nested.level_1 , 
c.category.nested.nested.level_2, c.category.nested.nested.nested.level_3,
c.category.nested.nested.nested.nested.level_4,
c.category.nested.nested.nested.nested.nested.level_5
from ClassificationInfo c 
)
SELECT categorylevel.level_1 as cat, categorylevel.nodeid FROM categorylevel

<qe.PostgresE

solr url: http://localhost:8983/solr/bookstore/tvrh/?tv=true&wt=json&tv.fl=reviewText&fl=id%2Casin%2CreviewerID%2CreviewText&tv.df=true&tv.tf=true&q=%2A%3A%2A&tv.tf_idf=true&indent=true


<qe.Combiner> combining result for single subquery


Unnamed: 0,cat,total_cat_value
0,Arts & Photography,3008791.0


In [6]:
# Union of multi-source aggregations.
# Two views (cat_sum1, cat_sum2) join postgres.orderlines, postgres.products
# and asterix.categorylevel with different pid/date filters. Each view is
# followed by an `ans` rule with the same head that groups by `lvl1` and sums
# `nunits` and `price`.
# NOTE(review): in the displayed output the row index restarts at 0 and some
# lvl1 values appear twice, so the two aggregated results look concatenated
# rather than re-aggregated — confirm this is the intended union semantics.
datalog = '''
cat_sum1(lvl1, oid, nunits, price) <-
  postgres.orderlines(olid, oid, pid, _, date, _, nunits, price),
  postgres.products(pid, _, _, _, _, _, asin, nodeid),
  asterix.categorylevel(nodeid, lvl1, _, _, _, _),
  pid < 10100, date > '2015-01-01'
ans(lvl1, total_units, total_value) <-
  setof({oid}, {nunits}, {price}, view.cat_sum1(lvl1, oid, nunits, price), S), sum(nunits, total_units),sum(price,total_value)
cat_sum2(lvl1, oid, nunits, price) <-
  postgres.orderlines(olid, oid, pid, _, date, _, nunits, price),
  postgres.products(pid, _, _, _, _, _, asin, nodeid),
  asterix.categorylevel(nodeid, lvl1, _, _, _, _),
  pid > 10050, pid < 10200
ans(lvl1, total_units, total_value) <-
  setof({oid}, {nunits}, {price}, view.cat_sum2(lvl1, oid, nunits, price), S), sum(nunits, total_units),sum(price,total_value)
'''

# DEBUG logging prints the parsed query and the per-source sub-queries.
result = qe.queryDatalogRaw(datalog, loglevel=logging.DEBUG)
# Display the full combined result.
result

<qe.HybridEngine> query datalog:
[{'condition': ['pid < 10100', " date > '2015-01-01'"],
  'groupby': {'aggregation': ['sum(nunits,total_units)',
                              'sum(price,total_value)'],
              'key': 'lvl1'},
  'result': 'ans(lvl1,total_units,total_value)',
  'table': ['postgres.orderlines(olid, oid, pid, _, date, _, nunits, price)',
            'postgres.products(pid, _, _, _, _, _, asin, nodeid)',
            'asterix.categorylevel(nodeid, lvl1, _, _, _, _)']},
 {'condition': ['pid > 10050', ' pid < 10200'],
  'groupby': {'aggregation': ['sum(nunits,total_units)',
                              'sum(price,total_value)'],
              'key': 'lvl1'},
  'result': 'ans(lvl1,total_units,total_value)',
  'table': ['postgres.orderlines(olid, oid, pid, _, date, _, nunits, price)',
            'postgres.products(pid, _, _, _, _, _, asin, nodeid)',
            'asterix.categorylevel(nodeid, lvl1, _, _, _, _)']}]

<qe.DatalogParser> parser structure:
{'_result': 'ans(lv

<qe.HybridEngine> arbiter submit query to asterix
{'column_idx': {'categorylevel': {'_': 5, 'lvl1': 1, 'nodeid': 0}},
 'columns': {'categorylevel': ['nodeid', 'lvl1', '_', '_', '_', '_'],
             'orderlines': ['olid',
                            'oid',
                            'pid',
                            '_',
                            'date',
                            '_',
                            'nunits',
                            'price'],
             'products': ['pid', '_', '_', '_', '_', '_', 'asin', 'nodeid']},
 'conditions': None,
 'groupby': None,
 'join': None,
 'limit': None,
 'orderby': None,
 'return': [{'alias': None,
             'column': 'lvl1',
             'func': None,
             'table': 'categorylevel'},
            {'alias': None,
             'column': 'nodeid',
             'func': None,
             'table': 'categorylevel'}],
 'tables': ['categorylevel'],
 'view': None}

<qe.AsterixEngine> query sql++ cmd:
use TinySocial;
WITH cate

Unnamed: 0,lvl1,total_units,total_value
0,Arts & Photography,483.0,4350967.0
1,Comics & Graphic Novels,18.0,530070.0
2,"Crafts, Hobbies & Home",18.0,649600.0
3,Humor & Entertainment,10.0,358800.0
0,Arts & Photography,16509.0,117841902.0
1,Biographies & Memoirs,303.0,7740700.0
2,Christian Books & Bibles,39.0,956542.0
3,Comics & Graphic Novels,99.0,2186038.0
4,"Crafts, Hobbies & Home",3218.0,11052955.0
5,Humor & Entertainment,231.0,4569520.0
