In [1]:
import pandas as pd
import sys
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import numpy as np

ServerName = "localhost"
Database = "SQLBook"

# Create the connection
engine = create_engine('postgresql://postgres:@' + ServerName + ':5432/' + Database)

### queries on AsterixDB
Connecting to AsterixDB(through http on localhost:19002) with lib by Jared

In [2]:
# %load asterixdb_python.py
from urllib import parse, request
from json import loads

class QueryResponse:
    def __init__(self, raw_response):
        self._json = loads(str(raw_response))

        self.requestID = self._json['requestID'] if 'requestID' in self._json else None
        self.clientContextID = self._json['clientContextID'] if 'clientContextID' in self._json else None
        self.signature = self._json['signature'] if 'signature' in self._json else None
        self.results = self._json['results'] if 'results' in self. _json else None
        self.metrics = self._json['metrics'] if 'metrics' in self._json else None

class AsterixConnection:
    def __init__(self, server = 'http://localhost', port = 19002):
        self._server = server
        self._port = port
        self._url_base = self._server +':'+ str(port)

    def query(self, statement, pretty=False, client_context_id=None):
        endpoint = '/query/service'

        url = self._url_base + endpoint

        payload = {
            'statement': statement,
            'pretty': pretty
        }

        if client_context_id:
            payload['client_context_id'] = client_context_id

        data = parse.urlencode(payload).encode("utf-8")
        req = request.Request(url, data)
        resource = request.urlopen(req)
        #decode = resource.headers.get_content_charset()
        decode = 'iso-8859-1'
        str_data = resource.read().decode(decode)
        return QueryResponse(str_data)


if __name__ == '__main__':
    asterix_conn = AsterixConnection()
    response = asterix_conn.query('''
        use TinySocial;
        SELECT VALUE ds FROM Metadata.`Dataset` ds;
        SELECT VALUE ix FROM Metadata.`Index` ix;''')

    print(response.results)

[{'GroupName': 'MetadataGroup', 'Hints': [], 'DatasetName': 'CompactionPolicy', 'DatatypeDataverseName': 'Metadata', 'PendingOp': 0, 'DatasetType': 'INTERNAL', 'DataverseName': 'Metadata', 'CompactionPolicy': 'prefix', 'CompactionPolicyProperties': [{'Name': 'max-mergable-component-size', 'Value': '1073741824'}, {'Name': 'max-tolerance-component-count', 'Value': '5'}], 'DatasetId': 13, 'InternalDetails': {'PartitioningStrategy': 'HASH', 'PrimaryKey': [['DataverseName'], ['CompactionPolicy']], 'FileStructure': 'BTREE', 'PartitioningKey': [['DataverseName'], ['CompactionPolicy']], 'Autogenerated': False}, 'Timestamp': 'Sat Sep 30 15:41:19 PDT 2017', 'DatatypeName': 'CompactionPolicyRecordType'}, {'GroupName': 'MetadataGroup', 'Hints': [], 'DatasetName': 'Dataset', 'DatatypeDataverseName': 'Metadata', 'PendingOp': 0, 'DatasetType': 'INTERNAL', 'DataverseName': 'Metadata', 'CompactionPolicy': 'prefix', 'CompactionPolicyProperties': [{'Name': 'max-mergable-component-size', 'Value': '1073741

### wrap ClassificationInfo dataset with a class

In [25]:
# %load classification_info.py
# Wrapper of ClassificationInfo dataset
from asterixdb_python import AsterixConnection 

class ClassificationInfo:
    def __init__(self, server = 'http://localhost', port = 19002, stat=False):
        self.asterix_conn = AsterixConnection(server, port)
        self.init_stat = stat
        if stat:
            self.get_stat()

    def get_stat(self):
        cmd = '''
use TinySocial;
select (select distinct value c.category.nested.level_1 
from ClassificationInfo c ) as level_1,
(select distinct value c.category.nested.nested.level_2
from ClassificationInfo c ) as level_2,
(select distinct value c.category.nested.nested.nested.level_3
from ClassificationInfo c ) as level_3,
(select distinct value c.category.nested.nested.nested.nested.level_4
from ClassificationInfo c ) as level_4,
(select distinct value c.category.nested.nested.nested.nested.nested.level_5
from ClassificationInfo c ) as level_5,
(select distinct value c.category.nested.nested.nested.nested.nested.nested.level_6
from ClassificationInfo c ) as level_6,
(select distinct value c.category.nested.nested.nested.nested.nested.nested.nested.level_7
from ClassificationInfo c ) as level_7,
(select distinct value c.category.nested.nested.nested.nested.nested.nested.nested.nested.level_8
from ClassificationInfo c ) as level_8
;
'''
        response = self.asterix_conn.query(cmd)
        levels = response.results[0]
        self.levels = {}
        for l in levels:
            if levels[l][0] is not None:
                self.levels[l] = levels[l]
        return self.levels

    def print_level_stat(self):
        if self.init_stat == False:
            print("Please call get_stat() first\n")
            return
        level_counts = {}
        for l in levels:
            count = len(self.levels[l])
            level_counts[l] = count
            print(l,':', count)
        print("total:",sum(level_counts.values()))
        
    def get_flat_records(self):
        cmd = '''
use TinySocial;
select c.nodeID, 
c.category.nested.level_1 , 
c.category.nested.nested.level_2, c.category.nested.nested.nested.level_3,
c.category.nested.nested.nested.nested.level_4,
c.category.nested.nested.nested.nested.nested.level_5
from ClassificationInfo c 
'''
        return self.asterix_conn.query(cmd).results
    
    def get_nodes_by_level(self, levels, **kwargs):
        ''' return all the nodes that match the levels in their hierarchy'''
        condition = ' and '.join(["m.{} = \"{}\"".format(l, levels[l]) for l in levels])
        cmd = '''
use TinySocial;
With node_map AS (
select c.nodeID AS nodeid, 
c.category.nested.level_1 , 
c.category.nested.nested.level_2, c.category.nested.nested.nested.level_3,
c.category.nested.nested.nested.nested.level_4,
c.category.nested.nested.nested.nested.nested.level_5
from ClassificationInfo c 
)
'''
        if id in kwargs:  # return nodeid as list
            cmd += '''
select value m.nodeid
from node_map m
'''
        else:  # return structure with both nodeid and level info
            cmd += '''
select value m
from node_map m
'''
        cmd += "where {};".format(condition)
        #print(cmd)
        return self.asterix_conn.query(cmd).results
    
    def get_nodes_by_category(self, categories, **kwargs):
        ''' return all the nodes that satisfy the categories in their hierarchy'''
        condition = ' and '.join(["\"{}\" in m.level".format(cat) for cat in categories])
        cmd = '''
use TinySocial;
With  node_map AS (
select value {'nodeid': c.nodeID, 'level' : [
c.category.nested.level_1,
c.category.nested.nested.level_2, c.category.nested.nested.nested.level_3,
c.category.nested.nested.nested.nested.level_4,
c.category.nested.nested.nested.nested.nested.level_5]
}
from ClassificationInfo c 
)
'''
        if id in kwargs:  # return nodeid as list
            cmd += '''
select value m.nodeid
from node_map m
'''
        else:  # return structure with both nodeid and level info
            cmd += '''
select m.nodeid, m.level
from node_map m
'''
        cmd += "where {};".format(condition)
        print(cmd)
        return self.asterix_conn.query(cmd).results
    


In [26]:
classinfo = ClassificationInfo()
levels = classinfo.get_stat()

In [6]:
# total number of categories of each level
classinfo.print_level_stat()

level_4 : 1883
level_1 : 31
level_5 : 430
level_3 : 2498
level_2 : 419
total: 5261


In [75]:
# check intra level duplicate
for l in levels:
    print(len(set(levels[l])), len(levels[l]))
# check inter level duplicate
merged = []
for l in levels:
    merged = levels[l] + merged
    
print(len(merged), len(set(merged)))

430 430
2498 2498
31 31
1883 1883
419 419
5261 4064


In [78]:
# flatten nested level
records = classinfo.get_flat_records()

In [79]:
print('total nodeID:', len(records))
records[:3]

total nodeID: 8121


[{'level_1': 'Arts & Photography',
  'level_2': 'N/A',
  'level_3': 'N/A',
  'level_4': 'N/A',
  'level_5': 'N/A',
  'nodeID': 1},
 {'level_1': 'Arts & Photography',
  'level_2': 'Architecture',
  'level_3': 'N/A',
  'level_4': 'N/A',
  'level_5': 'N/A',
  'nodeID': 173508},
 {'level_1': 'Arts & Photography',
  'level_2': 'Architecture',
  'level_3': 'Buildings',
  'level_4': 'N/A',
  'level_5': 'N/A',
  'nodeID': 266162}]

In [102]:
classinfo.get_nodes_by_category(['Architecture', "Security Design"])

[{'level': ['Arts & Photography',
   'Architecture',
   'Security Design',
   'N/A',
   'N/A'],
  'nodeid': 7743006011}]

In [10]:
# count node numbers that belongs to each of 31 level_1, the total should match 8121
total = 0
for l in levels['level_1']:
    nodes = classinfo.get_nodes_by_level({'level_1':l})
    total += len(nodes)
    print(l, ':', len(nodes))
print('total :', total)

Arts & Photography : 389
Biographies & Memoirs : 123
Business & Money : 253
Children's Books : 793
Christian Books & Bibles : 288
Comics & Graphic Novels : 60
Computers & Technology : 435
Cookbooks, Food & Wine : 226
Crafts, Hobbies & Home : 347
Education & Teaching : 147
Engineering & Transportation : 272
Gay & Lesbian : 39
Health, Fitness & Dieting : 258
History : 384
Humor & Entertainment : 184
Law : 144
Literature & Fiction : 428
Medical Books : 288
Mystery, Thriller & Suspense : 59
Parenting & Relationships : 47
Politics & Social Sciences : 215
Reference : 170
Religion & Spirituality : 534
Romance : 81
Science & Math : 432
Science Fiction & Fantasy : 60
Self-Help : 47
Sports & Outdoors : 233
Teen & Young Adult : 492
Test Preparation : 42
Travel : 651
total : 8121


In [128]:
nodes[:3]

[{'level_1': 'Travel',
  'level_2': 'N/A',
  'level_3': 'N/A',
  'level_4': 'N/A',
  'level_5': 'N/A',
  'nodeid': 27},
 {'level_1': 'Travel',
  'level_2': 'Adventure Travel',
  'level_3': 'N/A',
  'level_4': 'N/A',
  'level_5': 'N/A',
  'nodeid': 17207},
 {'level_1': 'Travel',
  'level_2': 'Adventure Travel',
  'level_3': 'Canoeing',
  'level_4': 'N/A',
  'level_5': 'N/A',
  'nodeid': 290115}]

In [9]:
# return nodeid only
nodes = classinfo.get_nodes_by_category(['Architecture'], id=True)
nodes[:3]

[173508, 266162, 720870]

In [27]:
nodes = classinfo.get_nodes_by_category(['History', 'Architecture'])



use TinySocial;
With  node_map AS (
select value {'nodeid': c.nodeID, 'level' : [
c.category.nested.level_1,
c.category.nested.nested.level_2, c.category.nested.nested.nested.level_3,
c.category.nested.nested.nested.nested.level_4,
c.category.nested.nested.nested.nested.nested.level_5]
}
from ClassificationInfo c 
)

select m.nodeid, m.level
from node_map m
where "History" in m.level and "Architecture" in m.level;


In [24]:
nodes

[{'level': ['Arts & Photography', 'Architecture', 'History', 'N/A', 'N/A'],
  'nodeid': 15762881},
 {'level': ['Arts & Photography',
   'Architecture',
   'History',
   'Baroque & Rococo',
   'N/A'],
  'nodeid': 3564987011},
 {'level': ['Arts & Photography', 'Architecture', 'History', 'General', 'N/A'],
  'nodeid': 15762891},
 {'level': ['Arts & Photography',
   'Architecture',
   'History',
   'Modern & Contemporary',
   'N/A'],
  'nodeid': 3564988011},
 {'level': ['Arts & Photography',
   'Architecture',
   'History',
   'Modern & Contemporary',
   'Contemporary'],
  'nodeid': 8929882011},
 {'level': ['Arts & Photography',
   'Architecture',
   'History',
   'Modern & Contemporary',
   'Modern'],
  'nodeid': 8929883011},
 {'level': ['Arts & Photography',
   'Architecture',
   'History',
   'Prehistoric & Primitive',
   'N/A'],
  'nodeid': 3564989011},
 {'level': ['Arts & Photography',
   'Architecture',
   'History',
   'Romanticism',
   'N/A'],
  'nodeid': 3564990011}]