In [2]:
import numpy as np
import pandas as pd

In [3]:
!pip install pyTigerGraph

Collecting pyTigerGraph
  Downloading pyTigerGraph-0.0.9.9.2-py3-none-any.whl (22 kB)
Collecting validators
  Downloading validators-0.18.2-py3-none-any.whl (19 kB)
Collecting pyTigerDriver
  Downloading pyTigerDriver-1.0.14-py3-none-any.whl (8.7 kB)
Installing collected packages: validators, pyTigerDriver, pyTigerGraph
Successfully installed pyTigerDriver-1.0.14 pyTigerGraph-0.0.9.9.2 validators-0.18.2


In [4]:
import pyTigerGraph as tg

In [5]:
questions = ['who all bought Agilent Technologies Inc',
             'who sold Agilent Technologies Inc',
             'who bought Microsoft Corporation in the year 2018',
             'how are Microsoft Corporation and Facebook Inc related',
             'how many bought Microsoft Corporation in the year 2021',
             'what all beat the market expectations in the year 2018',
             'what was happen with Warren Buffett 5 years ago']

In [6]:
question = questions[0]

In [7]:
import spacy
nlp_pipeline = spacy.load("en_core_web_sm")

In [8]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 8.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 68.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 46.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml


In [9]:
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

In [10]:
class EntityExtractor : 
    """This class is to extract Entities from a textual content using BERT."""
    
    def __init__(self) -> None:
        """
        constructor method. Config and Logger instances have to be passed on from the caller.
        """        
        self.tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        self.model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        
        self.nlp_pipeline = pipeline("ner", model=self.model, tokenizer=self.tokenizer)
        self.confidence_score = 0.7 # 0.7


    def get_entities_bert(self, input_content):
        """
        uses BERT to extract the entities from the given input_content.
        input_content : a line of string. 
        """
        ner_results = self.nlp_pipeline(input_content)

        entityList = []
        current_token = ''
        last_index = 0
        last_token_apostrophe = False

        #filter entites with less confidence
        confidence_score =  0.7 if self.confidence_score == None else self.confidence_score
        filtered_results = list(filter(lambda x: x['score'] > confidence_score, ner_results))

        for entity in filtered_results: 

            if entity['word'].startswith('##'): #bert specific prefix
                #appending to the last one
                current_token += entity['word'][2:]
                
                existing_entity = entityList[-1]
                existing_entity['token'] = current_token
                existing_entity['entity'] = entity['entity']
                existing_entity['index'] = entity['start']

            elif entity['word'] == "'": #apostrophe
                last_token_apostrophe = True
                current_token += entity['word'] 
                # #appending to last token
                existing_entity = entityList[-1]
                existing_entity['token'] = current_token
                existing_entity['entity'] = entity['entity']
                existing_entity['index'] = entity['start']

            elif last_token_apostrophe == True:
                current_token += entity['word'] 
                #appending to last token
                existing_entity = entityList[-1]
                existing_entity['token'] = current_token
                existing_entity['entity'] = entity['entity']
                existing_entity['index'] = entity['start']
                last_token_apostrophe = False

            elif ((entity['index'] - last_index) <= 1 and (last_index != 0)):
                current_token += ' '+ entity['word']  
                #appending to last token
                existing_entity = entityList[-1]
                existing_entity['token'] = current_token
                existing_entity['entity'] = entity['entity']
                existing_entity['index'] = entity['start']

            else:
                current_token = entity['word']
                entityList.append({ 'token' : current_token, 'entity' : entity['entity'], 'index': entity['start'], 'startIndex': entity['start'] })

            last_index = entity['index']  
            
        filter_one_letter_tokens = filter(lambda x: len(x['token']) > 1, entityList)

        return list(filter_one_letter_tokens)


In [11]:
entity_extractor = EntityExtractor()

In [12]:
question_content = "who all bought Microsoft" + " " + "who all bought Microsoft" + " " + "who all bought Microsoft"
entities = entity_extractor.get_entities_bert(question_content)
print(entities)

[{'token': 'Microsoft', 'entity': 'B-ORG', 'index': 15, 'startIndex': 15}, {'token': 'Microsoft', 'entity': 'B-ORG', 'index': 40, 'startIndex': 40}, {'token': 'Microsoft', 'entity': 'B-ORG', 'index': 65, 'startIndex': 65}]


In [13]:
#question_content = question + " " + question + " " + question
question_content = "who all bought Microsoft" + " " + "who all bought Amazon" + " " + "who all bought Facebook when William Gates and Warren Buffett was in New York and Tokyo"
entities = entity_extractor.get_entities_bert(question_content)

params_dict = {}

orgs_count = 0
objs_count = 0
locs_count = 0
pers_count = 0

organization_suffix = 'ORGANIZATION'
object_suffix = 'OBJECT'
person_suffix = 'PERSON'
location_suffix = 'LOCATION'

for entity in entities:
  entity_type = entity['entity']
  if entity_type == 'I-ORG' or entity_type == 'B-ORG':
    orgs_count += 1
  elif entity_type == 'I-PER' or entity_type == 'B-PER':
    pers_count += 1
  elif entity_type == 'I-LOC' or entity_type == 'B-LOC':
    locs_count += 1
  elif entity_type == 'I-MISC' or entity_type == 'B-MISC':
    objs_count += 1

org_index = 0
loc_index = 0
pers_index = 0

index_offset = 0

for entity in entities:
  entity_type = entity['entity']
  replace_token = ''
  if entity_type == 'I-ORG' or entity_type == 'B-ORG':
    if orgs_count > 1:
      org_index += 1
      replace_token = str.format("{{{} {}}}", organization_suffix, org_index)
    else:
      replace_token = str.format("{{{}}}", organization_suffix)

  elif entity_type == 'I-PER' or entity_type == 'B-PER':
    if pers_count > 1:
      pers_index += 1
      replace_token = str.format("{{{} {}}}", person_suffix, pers_index)
    else:
      replace_token = str.format("{{{}}}", person_suffix)

  elif entity_type == 'I-LOC' or entity_type == 'B-LOC':
    if locs_count > 1:
      loc_index += 1
      replace_token = str.format("{{{} {}}}", location_suffix, loc_index)
    else:
      replace_token = str.format("{{{}}}", location_suffix)

  start_index = entity['startIndex'] - index_offset
  end_index = entity['startIndex'] - index_offset + len(entity['token'])
  question_content = question_content[:start_index] + replace_token + question_content[end_index:]
  index_offset = index_offset + (len(entity['token']) - len(replace_token))
  params_dict[replace_token] = entity['token']
    

print(question_content)

who all bought {ORGANIZATION 1} who all bought {ORGANIZATION 2} who all bought {ORGANIZATION 3} when {PERSON 1} and {PERSON 2} was in {LOCATION 1} and {LOCATION 2}


In [14]:
params_dict

{'{LOCATION 1}': 'New York',
 '{LOCATION 2}': 'Tokyo',
 '{ORGANIZATION 1}': 'Microsoft',
 '{ORGANIZATION 2}': 'Amazon',
 '{ORGANIZATION 3}': 'Facebook',
 '{PERSON 1}': 'William Gates',
 '{PERSON 2}': 'Warren Buffett'}

In [15]:
def get_pos_sentence(input_content):
    """
    returns the POS tags of a given sentence.
    output format: objects of {'token' : '', 'pos' : '', 'lemma': ''', 'index': 0}
    """
    posList = []

    nlp_doc = nlp_pipeline(input_content)
    for token in nlp_doc: 
        posList.append({ 'token' : token.text, 'pos' : token.pos_, 'lemma': token.lemma_, 'index': token.idx })
    return posList

In [16]:
question_content1 = 'who all bought {ORGANIZATION 1} who all {ORGANIZATION 2} who all {ORGANIZATION 3} when {PERSON 1} and {PERSON 2} was in {LOCATION 1} and {LOCATION 2}'#question_content
pos_tokens = get_pos_sentence(question_content1)

In [17]:
print(pos_tokens)

[{'token': 'who', 'pos': 'PRON', 'lemma': 'who', 'index': 0}, {'token': 'all', 'pos': 'DET', 'lemma': 'all', 'index': 4}, {'token': 'bought', 'pos': 'VERB', 'lemma': 'buy', 'index': 8}, {'token': '{', 'pos': 'PUNCT', 'lemma': '{', 'index': 15}, {'token': 'ORGANIZATION', 'pos': 'PROPN', 'lemma': 'ORGANIZATION', 'index': 16}, {'token': '1', 'pos': 'NUM', 'lemma': '1', 'index': 29}, {'token': '}', 'pos': 'PUNCT', 'lemma': '}', 'index': 30}, {'token': 'who', 'pos': 'PRON', 'lemma': 'who', 'index': 32}, {'token': 'all', 'pos': 'DET', 'lemma': 'all', 'index': 36}, {'token': '{', 'pos': 'PUNCT', 'lemma': '{', 'index': 40}, {'token': 'ORGANIZATION', 'pos': 'PROPN', 'lemma': 'ORGANIZATION', 'index': 41}, {'token': '2', 'pos': 'NUM', 'lemma': '2', 'index': 54}, {'token': '}', 'pos': 'PUNCT', 'lemma': '}', 'index': 55}, {'token': 'who', 'pos': 'PRON', 'lemma': 'who', 'index': 57}, {'token': 'all', 'pos': 'DET', 'lemma': 'all', 'index': 61}, {'token': '{', 'pos': 'PUNCT', 'lemma': '{', 'index': 65

In [18]:
verbs_count = 0
verb_suffix = 'VERB'
verb_index = 0

index_offset = 0

for pos in pos_tokens:
  if pos['pos'] == 'VERB':
    verbs_count += 1

for pos in pos_tokens:
  replace_token = ''
  if pos['pos'] == 'VERB':
    if verbs_count > 1:
      verb_index += 1
      replace_token = str.format("{{{} {}}}", verb_suffix, verb_index)
    else:
      replace_token = str.format("{{{}}}", verb_suffix)

    start_index = pos['index'] - index_offset
    end_index = pos['index'] - index_offset + len(pos['token'])
    question_content1 = question_content1[:start_index] + replace_token + question_content1[end_index:]
    index_offset = index_offset + (len(pos['token']) - len(replace_token))
    params_dict[replace_token] = pos['lemma'] # this is important --> lemma

In [19]:
print(question_content1)

who all {VERB} {ORGANIZATION 1} who all {ORGANIZATION 2} who all {ORGANIZATION 3} when {PERSON 1} and {PERSON 2} was in {LOCATION 1} and {LOCATION 2}


In [20]:
params_dict

{'{LOCATION 1}': 'New York',
 '{LOCATION 2}': 'Tokyo',
 '{ORGANIZATION 1}': 'Microsoft',
 '{ORGANIZATION 2}': 'Amazon',
 '{ORGANIZATION 3}': 'Facebook',
 '{PERSON 1}': 'William Gates',
 '{PERSON 2}': 'Warren Buffett',
 '{VERB}': 'buy'}

In [21]:
question_content2 = 'who all {VERB} {ORGANIZATION 1} who all {ORGANIZATION 2} who all {ORGANIZATION 3} when {PERSON 1} and {PERSON 2} was in {LOCATION 1} and {LOCATION 2} in the year 2009 and the year 2022 and 2033 between the date 2022-11-11 and 2005-11-10' #question_content


In [22]:
year_regex = "(?:before)?(?:after)?(?:in)?(?:between)?(?:Year)?(?:YEAR)?(?:year)?(?:s)?\d\d\d\d\s"
date_regex = "\d\d\d\d[/-]\d\d[/-]\d\d"

In [23]:
import re 

In [24]:
date_matches = re.split(date_regex, question_content2)
date_values = re.findall(date_regex, question_content2)
print(date_matches)
print(date_values)

['who all {VERB} {ORGANIZATION 1} who all {ORGANIZATION 2} who all {ORGANIZATION 3} when {PERSON 1} and {PERSON 2} was in {LOCATION 1} and {LOCATION 2} in the year 2009 and the year 2022 and 2033 between the date ', ' and ', '']
['2022-11-11', '2005-11-10']


In [25]:
question_content4 = date_matches[0]
date_index = 0
date_count = len(date_values)
date_suffix = 'DATE'

for index in range(len(date_matches) -1):
  replace_token = ''
  if date_count > 1:
    date_index += 1
    replace_token = str.format("{{{} {}}}", date_suffix, date_index)
  else:
    replace_token = str.format("{{{}}}", date_suffix)

  question_content4 +=  replace_token + date_matches[index+1]
  if index == len(date_matches) -1:
    break
  params_dict[replace_token] = date_values[index]
  

In [26]:
question_content4

'who all {VERB} {ORGANIZATION 1} who all {ORGANIZATION 2} who all {ORGANIZATION 3} when {PERSON 1} and {PERSON 2} was in {LOCATION 1} and {LOCATION 2} in the year 2009 and the year 2022 and 2033 between the date {DATE 1} and {DATE 2}'

In [27]:
params_dict

{'{DATE 1}': '2022-11-11',
 '{DATE 2}': '2005-11-10',
 '{LOCATION 1}': 'New York',
 '{LOCATION 2}': 'Tokyo',
 '{ORGANIZATION 1}': 'Microsoft',
 '{ORGANIZATION 2}': 'Amazon',
 '{ORGANIZATION 3}': 'Facebook',
 '{PERSON 1}': 'William Gates',
 '{PERSON 2}': 'Warren Buffett',
 '{VERB}': 'buy'}

In [28]:
year_matches = re.split(year_regex, question_content4)
year_values = re.findall(year_regex, question_content4)
print(year_matches)
print(year_values)
print(len(year_matches))

['who all {VERB} {ORGANIZATION 1} who all {ORGANIZATION 2} who all {ORGANIZATION 3} when {PERSON 1} and {PERSON 2} was in {LOCATION 1} and {LOCATION 2} in the year ', 'and the year ', 'and ', 'between the date {DATE 1} and {DATE 2}']
['2009 ', '2022 ', '2033 ']
4


In [29]:
question_content5 = year_matches[0]
year_index = 0
year_count = len(year_values)
year_suffix = 'YEAR'

for index in range(len(year_matches) -1):
  replace_token = ''
  if year_count > 1:
    year_index += 1
    replace_token = str.format("{{{} {}}}", year_suffix, year_index)
  else:
    replace_token = str.format("{{{}}}", year_suffix)

  next_match_string = year_matches[index+1] if year_matches[index+1].startswith(' ') else ' ' + year_matches[index+1]
  question_content5 +=  replace_token + next_match_string
  if index == len(year_matches) -1:
    break
  params_dict[replace_token] = str.strip(year_values[index])

In [30]:
question_content5

'who all {VERB} {ORGANIZATION 1} who all {ORGANIZATION 2} who all {ORGANIZATION 3} when {PERSON 1} and {PERSON 2} was in {LOCATION 1} and {LOCATION 2} in the year {YEAR 1} and the year {YEAR 2} and {YEAR 3} between the date {DATE 1} and {DATE 2}'

In [31]:
params_dict

{'{DATE 1}': '2022-11-11',
 '{DATE 2}': '2005-11-10',
 '{LOCATION 1}': 'New York',
 '{LOCATION 2}': 'Tokyo',
 '{ORGANIZATION 1}': 'Microsoft',
 '{ORGANIZATION 2}': 'Amazon',
 '{ORGANIZATION 3}': 'Facebook',
 '{PERSON 1}': 'William Gates',
 '{PERSON 2}': 'Warren Buffett',
 '{VERB}': 'buy',
 '{YEAR 1}': '2009',
 '{YEAR 2}': '2022',
 '{YEAR 3}': '2033'}

In [32]:
intermediary_language = 'VERTEX Person | CONDITION name = {PERSON} | EDGE any | CONDITION year >= {YEAR 1} AND year <= {YEAR 2} | VERTEX any'

In [33]:
il_tokens = intermediary_language.split('|')
print(il_tokens)

['VERTEX Person ', ' CONDITION name = {PERSON} ', ' EDGE any ', ' CONDITION year >= {YEAR 1} AND year <= {YEAR 2} ', ' VERTEX any']


In [34]:
general_format_gsql = '''
INTERPRET QUERY () FOR GRAPH {{GRAPH_NAME}} { 
  ListAccum <EDGE> @@edgeList;
  seed =  SELECT p FROM Person:p WHERE p.name=="Warren Buffett";
  targets = SELECT tgt
            FROM seed:s-(:e)-:tgt
            WHERE e.year >= 2018 AND e.year <= 2020
            ACCUM @@edgeList += e;
  PRINT @@edgeList;
  PRINT targets;
  }
'''

In [35]:
general_format_2 = '''
INTERPRET QUERY () FOR GRAPH athenagraph1304 { 
  ListAccum <EDGE> @@edges;
  t1 = SELECT tgt
            FROM :s-(r_buy:e)-:tgt
            WHERE e.year >= 2014 AND e.year <= 2020 AND s.name like "%Warren Buffett%"
            ACCUM @@edges += e;
  PRINT @@edges;
  PRINT t1;
  }'''

In [36]:
intermediary_language1 = 'VERTEX any | CONDITION any | EDGE {VERB} | CONDITION any | VERTEX Organization | CONDITION name = {ORGANIZATION}'

In [100]:
SELECT_FORMAT_DEGREE1 = """
INTERPRET QUERY () FOR GRAPH {} {{
ListAccum <EDGE> @@edges;
t1 = {} 
    ACCUM @@edges += {};
PRINT @@edges;
PRINT t1;
}}
"""

In [99]:
SELECT_COUNT_FORMAT_DEGREE1 = """
INTERPRET QUERY () FOR GRAPH {} {{
SumAccum <EDGE> @@edges;
t1 = {} 
    ACCUM @@edges += 1;
PRINT @@edges;
PRINT t1;
}}
"""

In [101]:
SELECT_FORMAT_DEGREE2 = """
INTERPRET QUERY () FOR GRAPH {} {{
ListAccum <EDGE> @@edges;
t1 = {}
t2 = {}
    ACCUM @@edges += {};
PRINT @@edges;
PRINT t2;
}}
"""

In [102]:
SELECT_COUNT_FORMAT_DEGREE2 = """
INTERPRET QUERY () FOR GRAPH {} {{
SumAccum <EDGE> @@edges;
t1 = {}
t2 = {}
    ACCUM @@edges += 1;
PRINT @@edges;
PRINT t2;
}}
"""

In [103]:
SELECT_FORMAT_DEGREE3 = """
INTERPRET QUERY () FOR GRAPH {} {{
ListAccum <EDGE> @@edges;
t1 = {}
t2 = {}
t3 = {}
    ACCUM @@edges += {};
PRINT @@edges;
PRINT t3;
}}
"""

In [104]:
SELECT_COUNT_FORMAT_DEGREE3 = """
INTERPRET QUERY () FOR GRAPH {} {{
SumAccum <EDGE> @@edges;
t1 = {}
t2 = {}
t3 = {}
    ACCUM @@edges += 1;
PRINT @@edges;
PRINT t3;
}}
"""

In [86]:
SELECT_FORMAT_BASIC = """
 SELECT {} FROM {} 
"""

WHERE_FORMAT_BASIC = """
 WHERE {} 
"""

In [111]:
def generate_gsql_from_intermediate_language(il):
  il_tokens = il.split('|')

  conditions_stack = []
  units_stack = []

  for il_token in il_tokens:
    il_token = str.strip(il_token)
    if il_token.startswith('CONDITION'):
      conditions_stack.append(il_token)
    else:
      units_stack.append(il_token)

  item_index = 0

  staged_units = {} # has the subsets that are processed already (ex., first hop)
  current_units = [] # has the units that are being processed currently, will get reset.
  current_format = []
  units_aliases = []
  final_edge_alias = ''

  subset_index = 0

  for index in range(len(units_stack)):
    unit_token = units_stack[index]
    condition_token = conditions_stack[index]

    if unit_token.startswith('VERTEX'):
      vertex_token = VertexToken(unit_token, condition_token, item_index)
      output_str, condition_str, item_alias = vertex_token.get_string()
      current_format.append('V')
      current_units.append((output_str, condition_str)) #tuple
      units_aliases.append(item_alias)
    elif unit_token.startswith('EDGE'):
      edge_token = EdgeToken(unit_token, condition_token, item_index)
      output_str, condition_str, item_alias = edge_token.get_string()
      current_format.append('E')
      current_units.append((output_str, condition_str)) #tuple
      units_aliases.append(item_alias)
      final_edge_alias = item_alias

    if "".join(current_format) == 'VEV':
      #process here
      select_str = "-".join([unit[0] for unit in current_units])
      where_str = " AND ".join([unit[1] for unit in current_units if str.strip(unit[1]) != ''])

      select_str_full = str.format(SELECT_FORMAT_BASIC, units_aliases[0], select_str)
      where_str_full = str.format(WHERE_FORMAT_BASIC, where_str)

      select_str_full = select_str_full + (' ' + where_str_full if str.strip(where_str) != '' else '')

      subset_index +=1

      staged_units["t"+str(subset_index)] = select_str_full
      current_format = ['S']
      current_units = []

    elif "".join(current_format) == 'SEV':
      #process here
      select_str = "-".join([unit[0] for unit in current_units])
      where_str = " AND ".join([unit[1] for unit in current_units if str.strip(unit[1]) != ''])

      #select_str = "t"+str(subset_index) + "-" + select_str 
      subset_alias = str.format("s_t{}", str(subset_index))
      select_str = str.format("t{}:{}-{}",str(subset_index), subset_alias, select_str) #including the previous subset along with the newly processed nodes

      select_str_full = str.format(SELECT_FORMAT_BASIC, subset_alias, select_str)
      where_str_full = str.format(WHERE_FORMAT_BASIC, where_str)

      select_str_full = select_str_full + (' ' + where_str_full if str.strip(where_str) != '' else '')

      subset_index +=1

      staged_units["t"+str(subset_index)] = select_str_full
      current_format = ['S']
      current_units = []

    item_index += 1
  
  return staged_units, final_edge_alias





In [112]:
generate_gsql_from_intermediate_language('VERTEX Person | CONDITION name = {PERSON} | EDGE any | CONDITION year >= {YEAR1} AND year <= {YEAR2} | VERTEX any | CONDITION any')

({'t1': '\n SELECT v0 FROM Person:v0-(:e1)-:v2 \n \n WHERE (v0.name == {PERSON}) AND (e1.year >= {YEAR1} AND e1.year <= {YEAR2}) \n'},
 'e1')

In [113]:
generate_gsql_from_intermediate_language('VERTEX any | CONDITION any | EDGE {VERB1} | CONDITION any | VERTEX Organization | CONDITION any | EDGE {VERB2} |CONDITION any | VERTEX Organization | CONDITION name = {ORGANIZATION}')

({'t1': '\n SELECT v0 FROM :v0-({VERB1}:e1)-Organization:v2 \n',
  't2': '\n SELECT s_t1 FROM t1:s_t1-({VERB2}:e3)-Organization:v4 \n \n WHERE (v4.name == {ORGANIZATION}) \n'},
 'e3')

In [79]:
condition_str_operator_dict = {}
condition_str_operator_dict['OR'] = ' OR ' 
condition_str_operator_dict['AND'] = ' AND ' 
condition_str_operator_dict['='] = ' == ' 
condition_str_operator_dict['<='] = ' <= ' 
condition_str_operator_dict['>='] = ' >= ' 
condition_str_operator_dict['<>'] = ' <> ' 
condition_str_operator_dict['<'] = ' < ' 
condition_str_operator_dict['>'] = ' > '

condition_str_variable_dict = {}
condition_str_variable_dict['name'] = ' name '
condition_str_variable_dict['happened'] = ' happened '
condition_str_variable_dict['year'] = ' year '

In [114]:
gsql_dict, final_edge_alias = generate_gsql_from_intermediate_language('VERTEX any | CONDITION any | EDGE {VERB1} | CONDITION any | VERTEX Organization | CONDITION any | EDGE {VERB2} |CONDITION any | VERTEX Organization | CONDITION name = {ORGANIZATION}')

In [108]:
gsql_format_degree_mapping = {}
gsql_format_degree_mapping[1] = {'select' : SELECT_FORMAT_DEGREE1, 'count': SELECT_COUNT_FORMAT_DEGREE1}
gsql_format_degree_mapping[2] = {'select' : SELECT_FORMAT_DEGREE2, 'count': SELECT_COUNT_FORMAT_DEGREE2}
gsql_format_degree_mapping[3] = {'select' : SELECT_FORMAT_DEGREE3, 'count': SELECT_COUNT_FORMAT_DEGREE3}

In [110]:
gsql_format_degree_mapping[len(gsql_dict.keys())]['select']

'\nINTERPRET QUERY () FOR GRAPH {} {{\nListAccum <EDGE> @@edges;\nt1 = {}\nt2 = {}\n    ACCUM @@edges += {};\nPRINT @@edges;\nPRINT t2;\n}}\n'

In [117]:
final_gsql_format = gsql_format_degree_mapping[len(gsql_dict.keys())]['select']

gsql_strings = ['athenagraph1304']
for index in range(len(gsql_dict.keys())):
  gsql_strings.append(gsql_dict['t'+str(index +1)])
gsql_strings.append(final_edge_alias)

final_gsql_without_params = str.format(final_gsql_format, *gsql_strings)
print(final_gsql_without_params)


INTERPRET QUERY () FOR GRAPH athenagraph1304 {
ListAccum <EDGE> @@edges;
t1 = 
 SELECT v0 FROM :v0-({VERB1}:e1)-Organization:v2 

t2 = 
 SELECT s_t1 FROM t1:s_t1-({VERB2}:e3)-Organization:v4 
 
 WHERE (v4.name == {ORGANIZATION}) 

    ACCUM @@edges += e3;
PRINT @@edges;
PRINT t2;
}



In [118]:
params_dict

{'{DATE 1}': '2022-11-11',
 '{DATE 2}': '2005-11-10',
 '{LOCATION 1}': 'New York',
 '{LOCATION 2}': 'Tokyo',
 '{ORGANIZATION 1}': 'Microsoft',
 '{ORGANIZATION 2}': 'Amazon',
 '{ORGANIZATION 3}': 'Facebook',
 '{PERSON 1}': 'William Gates',
 '{PERSON 2}': 'Warren Buffett',
 '{VERB}': 'buy',
 '{YEAR 1}': '2009',
 '{YEAR 2}': '2022',
 '{YEAR 3}': '2033'}

In [84]:
class VertexToken:
  def __init__(self, unit_token, condition_token, item_index):
    self.unit_token = unit_token
    self.condition_token = condition_token
    self.item_index = item_index
    self.output_str = ''
    self.condition_str = ''
    self.output_str_format = "{}:{}{}"
    self.condition_str_format = ""
    self.prefix = 'v'
    self.item_alias_format = "{}{}"
    self.item_alias = ""

  def get_string(self) -> str:
    #process condition strings
    tokens = self.unit_token.split(' ')

    for token in tokens:
      trimmed = str.strip(token)
      if  trimmed == 'VERTEX':
        continue
      elif trimmed == 'any':
        self.output_str = str.format(self.output_str_format, "", self.prefix, self.item_index)
        self.item_alias = str.format(self.item_alias_format, self.prefix, self.item_index)
        break
      else:
        self.output_str = str.format(self.output_str_format, trimmed, self.prefix, self.item_index)
        self.item_alias = str.format(self.item_alias_format, self.prefix, self.item_index)

    #process condition strings
    condition_token = ConditionToken(self.condition_token, self.item_index, self.prefix)
    self.condition_str = condition_token.get_string()

    return self.output_str, self.condition_str, self.item_alias


In [82]:
class ConditionToken:
  def __init__(self, condition_token, item_index, item_prefix):
    self.condition_token = condition_token
    self.item_index = item_index
    self.condition_str = ''
    self.condition_variable_str_format = '{}{}.{}'
    self.item_prefix = item_prefix

  def get_string(self) -> str:
    #process condition strings
    cond_tokens = self.condition_token.split(' ')

    prev_token = ''

    for c_token in cond_tokens:
      trimmed = str.strip(c_token)
      if  trimmed == 'CONDITION':
        continue
      elif trimmed == 'any':
        self.condition_str = ""
        break
      elif trimmed in condition_str_operator_dict:
        self.condition_str += condition_str_operator_dict[trimmed]  
      elif trimmed in condition_str_variable_dict:
        self.condition_str += str.format(self.condition_variable_str_format, self.item_prefix, self.item_index, trimmed)  
      else:
        self.condition_str += trimmed  

      prev_token = trimmed

    final_str = str.format("({})", self.condition_str) if str.strip(self.condition_str) != '' else ''

    return final_str

In [85]:
class EdgeToken:
  def __init__(self, unit_token, condition_token, item_index):
    self.unit_token = unit_token
    self.condition_token = condition_token
    self.item_index = item_index
    self.output_str = ''
    self.condition_str = ''
    self.output_str_format = "({}:{}{})"
    self.condition_str_format = ""
    self.prefix = 'e'
    self.item_alias_format = "{}{}"
    self.item_alias = ""

  def get_string(self) -> str:
    #process condition strings
    tokens = self.unit_token.split(' ')

    for token in tokens:
      trimmed = str.strip(token)
      if  trimmed == 'EDGE':
        continue
      elif trimmed == 'any':
        self.output_str = str.format(self.output_str_format, "", self.prefix, self.item_index)
        self.item_alias = str.format(self.item_alias_format, self.prefix, self.item_index)
        break
      else:
        self.output_str = str.format(self.output_str_format, trimmed, self.prefix, self.item_index)
        self.item_alias = str.format(self.item_alias_format, self.prefix, self.item_index)

    #process condition strings
    condition_token = ConditionToken(self.condition_token, self.item_index, self.prefix)
    self.condition_str = condition_token.get_string()

    return self.output_str, self.condition_str, self.item_alias