Skip to content
Permalink
Browse files

Untyped grammar (#1986)

* rework grammar a bit so we can have untyped variables

* add a way to make the grammar variable free

* refactor contextual modifications out

* pylint
  • Loading branch information
DeNeutoy committed Nov 1, 2018
1 parent a4b885c commit e4f9131c0d75fdeab6ababe31e955a7d1d256de9
@@ -13,10 +13,10 @@

GRAMMAR_DICTIONARY = {}
GRAMMAR_DICTIONARY["statement"] = ['(query ws ";")', '(query ws)']
GRAMMAR_DICTIONARY["query"] = ['(ws select_core ws groupby_clause ws orderby_clause ws "LIMIT" ws number)',
GRAMMAR_DICTIONARY["query"] = ['(ws select_core ws groupby_clause ws orderby_clause ws limit)',
'(ws select_core ws groupby_clause ws orderby_clause)',
'(ws select_core ws groupby_clause ws "LIMIT" ws number)',
'(ws select_core ws orderby_clause ws ws "LIMIT" ws number)',
'(ws select_core ws groupby_clause ws limit)',
'(ws select_core ws orderby_clause ws limit)',
'(ws select_core ws groupby_clause)',
'(ws select_core ws orderby_clause)',
'(ws select_core)']
@@ -35,6 +35,7 @@
GRAMMAR_DICTIONARY["single_source"] = ['source_table', 'source_subq']
GRAMMAR_DICTIONARY["source_table"] = ['(table_name ws "AS" wsp name)', 'table_name']
GRAMMAR_DICTIONARY["source_subq"] = ['("(" ws query ws ")" ws "AS" ws name)', '("(" ws query ws ")")']
GRAMMAR_DICTIONARY["limit"] = ['("LIMIT" ws "1")', '("LIMIT" ws number)']

GRAMMAR_DICTIONARY["where_clause"] = ['(ws "WHERE" wsp expr ws where_conj)', '(ws "WHERE" wsp expr)']
GRAMMAR_DICTIONARY["where_conj"] = ['(ws "AND" wsp expr ws where_conj)', '(ws "AND" wsp expr)']
@@ -177,3 +178,60 @@ def update_grammar_to_be_variable_free(grammar_dictionary: Dict[str, List[str]])
# because now we don't have aliased tables, we don't need
# to recognise new variables.
del grammar_dictionary["name"]

def update_grammar_with_untyped_entities(grammar_dictionary: Dict[str, List[str]]) -> None:
    """
    Loosen the grammar, in place, so that it no longer distinguishes numbers
    from strings.

    Inferring whether a variable is a number or a string can be difficult, so
    instead every such reference is treated as a plain ``value``. Concretely:

    - ``string_set_vals`` and ``limit`` are replaced by productions built on ``value``;
    - ``string`` and ``number`` are removed from the ``value`` productions;
    - the production at index 1 of ``expr`` is overwritten with the ``LIKE``
      production over two ``value`` operands;
    - the ``string`` and ``number`` non-terminals are deleted.

    Parameters
    ----------
    grammar_dictionary : ``Dict[str, List[str]]``
        Maps each non-terminal to its list of productions. Must already contain
        ``value`` (listing both ``string`` and ``number``), ``expr`` (with at
        least two productions), ``string`` and ``number``.

    Raises
    ------
    KeyError
        If ``value``, ``expr``, ``string`` or ``number`` is not a key.
    ValueError
        If ``string`` or ``number`` is not among the ``value`` productions.
    IndexError
        If ``expr`` has fewer than two productions.
    """
    typed_nonterminals = ("string", "number")

    grammar_dictionary["string_set_vals"] = ['(value ws "," ws string_set_vals)', 'value']

    value_productions = grammar_dictionary["value"]
    for nonterminal in typed_nonterminals:
        value_productions.remove(nonterminal)

    grammar_dictionary["limit"] = ['("LIMIT" ws "1")', '("LIMIT" ws value)']
    # NOTE(review): assumes slot 1 of "expr" is the LIKE production - confirm
    # against GRAMMAR_DICTIONARY if the order of "expr" ever changes.
    grammar_dictionary["expr"][1] = '(value wsp "LIKE" wsp value)'

    # Nothing refers to the typed non-terminals any more, so drop them.
    for nonterminal in typed_nonterminals:
        del grammar_dictionary[nonterminal]


def update_grammar_values_with_variables(grammar_dictionary: Dict[str, List[str]],
                                         prelinked_entities: Dict[str, Dict[str, str]]) -> None:
    """
    Add every prelinked variable to the grammar as an untyped ``value``.

    Each key of ``prelinked_entities`` becomes a single-quoted terminal that is
    placed in front of the existing ``value`` productions. Variables are
    prepended one after another, so the last variable iterated ends up first.

    Parameters
    ----------
    grammar_dictionary : ``Dict[str, List[str]]``
        Grammar to modify in place; its ``value`` entry is rebound to a new list.
    prelinked_entities : ``Dict[str, Dict[str, str]]``
        Keyed by variable name. Only the keys are used here.
    """
    if not prelinked_entities:
        # Nothing to add: leave the grammar (and its "value" list) untouched.
        return

    variable_terminals = [f'"\'{variable}\'"' for variable in prelinked_entities]
    # Repeated prepending puts later variables ahead of earlier ones.
    variable_terminals.reverse()
    grammar_dictionary["value"] = variable_terminals + grammar_dictionary["value"]


def update_grammar_numbers_and_strings_with_variables(grammar_dictionary: Dict[str, List[str]], # pylint: disable=invalid-name
                                                      prelinked_entities: Dict[str, Dict[str, str]],
                                                      columns: Dict[str, TableColumn]) -> None:
    """
    Add every prelinked variable to the grammar under an inferred type.

    Each variable becomes a single-quoted terminal prepended to the productions
    of exactly one of ``number``, ``string`` or ``value``:

    - If the upper-cased ``info["type"]`` is a key of ``columns``, the matched
      column decides: ``number`` when ``column_has_numeric_type`` accepts it,
      ``string`` when ``column_has_string_type`` accepts it, ``value`` otherwise.
    - If no column matches, ``info["text"]`` decides: ``number`` when
      ``float`` can convert it, ``string`` when it is purely alphabetic once
      spaces are removed, ``value`` otherwise.

    Parameters
    ----------
    grammar_dictionary : ``Dict[str, List[str]]``
        Grammar to modify in place. The non-terminal chosen for a variable must
        already be a key.
    prelinked_entities : ``Dict[str, Dict[str, str]]``
        Keyed by variable name. Each entry must have a ``"type"`` key, and a
        ``"text"`` key is read when no column matches.
    columns : ``Dict[str, TableColumn]``
        Columns to match variable types against, looked up by the upper-cased
        ``"type"`` (presumably keyed by upper-case column name; verify against caller).
    """
    for variable, info in prelinked_entities.items():
        matched_column = columns.get(info["type"].upper(), None)

        if matched_column is not None:
            # Infer the variable's type from the database column it matches,
            # falling back to an untyped value if the column is neither
            # numeric nor string-like.
            if column_has_numeric_type(matched_column):
                target = "number"
            elif column_has_string_type(matched_column):
                target = "string"
            else:
                target = "value"
        else:
            # No matching column, so infer from the actual text instead.
            text = info["text"]
            try:
                # This is what happens if you try and do type inference
                # in a grammar which parses _strings_ in _Python_.
                # We're just seeing if the python interpreter can convert
                # to a float - if it can, we assume it's a number.
                float(text)
            except ValueError:
                looks_numeric = False
            else:
                looks_numeric = True

            if looks_numeric:
                target = "number"
            elif text.replace(" ", "").isalpha():
                target = "string"
            else:
                target = "value"

        grammar_dictionary[target] = [f'"\'{variable}\'"'] + grammar_dictionary[target]
@@ -9,13 +9,14 @@
from allennlp.semparse.contexts.sql_context_utils import SqlVisitor
from allennlp.semparse.contexts.sql_context_utils import format_grammar_string, initialize_valid_actions
from allennlp.data.dataset_readers.dataset_utils.text2sql_utils import read_dataset_schema
from allennlp.data.dataset_readers.dataset_utils.text2sql_utils import column_has_numeric_type
from allennlp.data.dataset_readers.dataset_utils.text2sql_utils import column_has_string_type
from allennlp.semparse.contexts.text2sql_table_context import GRAMMAR_DICTIONARY
from allennlp.semparse.contexts.text2sql_table_context import update_grammar_with_table_values
from allennlp.semparse.contexts.text2sql_table_context import update_grammar_with_tables
from allennlp.semparse.contexts.text2sql_table_context import update_grammar_with_global_values
from allennlp.semparse.contexts.text2sql_table_context import update_grammar_to_be_variable_free
from allennlp.semparse.contexts.text2sql_table_context import update_grammar_with_untyped_entities
from allennlp.semparse.contexts.text2sql_table_context import update_grammar_values_with_variables
from allennlp.semparse.contexts.text2sql_table_context import update_grammar_numbers_and_strings_with_variables

class Text2SqlWorld:
"""
@@ -38,18 +39,23 @@ class Text2SqlWorld:
Denotes whether the data being parsed by the grammar is variable free.
If it is, the grammar is modified to be less expressive by removing
elements which are not necessary if the data is variable free.
use_untyped_entities : ``bool``, optional (default = False)
If ``True``, do not try to infer the types of prelinked variables;
they are added to the grammar as untyped values instead. If ``False``,
each variable's type is inferred where possible.
"""
def __init__(self,
schema_path: str,
cursor: Cursor = None,
use_prelinked_entities: bool = True,
variable_free: bool = True) -> None:
variable_free: bool = True,
use_untyped_entities: bool = False) -> None:
self.cursor = cursor
self.schema = read_dataset_schema(schema_path)
self.columns = {column.name: column for table in self.schema.values() for column in table}
self.dataset_name = os.path.basename(schema_path).split("-")[0]
self.use_prelinked_entities = use_prelinked_entities
self.variable_free = variable_free
self.use_untyped_entities = use_untyped_entities

# NOTE: This base dictionary should not be modified.
self.base_grammar_dictionary = self._initialize_grammar_dictionary(deepcopy(GRAMMAR_DICTIONARY))
@@ -64,37 +70,13 @@ def get_action_sequence_and_all_actions(self,
"entities, but prelinked entities were passed.")
prelinked_entities = prelinked_entities or {}

if self.use_untyped_entities:
update_grammar_values_with_variables(grammar_with_context, prelinked_entities)
else:
update_grammar_numbers_and_strings_with_variables(grammar_with_context,
prelinked_entities,
self.columns)

for variable, info in prelinked_entities.items():
variable_column = info["type"].upper()
matched_column = self.columns.get(variable_column, None)

if matched_column is not None:
# Try to infer the variable's type by matching it to a column in
# the database. If we can't, we just add it as a value.
if column_has_numeric_type(matched_column):
grammar_with_context["number"] = [f'"\'{variable}\'"'] + grammar_with_context["number"]
elif column_has_string_type(matched_column):
grammar_with_context["string"] = [f'"\'{variable}\'"'] + grammar_with_context["string"]
else:
grammar_with_context["value"] = [f'"\'{variable}\'"'] + grammar_with_context["value"]
# Otherwise, try to infer by looking at the actual value:
else:
try:
# This is what happens if you try and do type inference
# in a grammar which parses _strings_ in _Python_.
# We're just seeing if the python interpreter can convert
# to a float - if it can, we assume it's a number.
float(info["text"])
is_numeric = True
except ValueError:
is_numeric = False
if is_numeric:
grammar_with_context["number"] = [f'"\'{variable}\'"'] + grammar_with_context["number"]
elif info["text"].replace(" ", "").isalpha():
grammar_with_context["string"] = [f'"\'{variable}\'"'] + grammar_with_context["string"]
else:
grammar_with_context["value"] = [f'"\'{variable}\'"'] + grammar_with_context["value"]

grammar = Grammar(format_grammar_string(grammar_with_context))

@@ -130,6 +112,9 @@ def _initialize_grammar_dictionary(self, grammar_dictionary: Dict[str, List[str]
if self.variable_free:
update_grammar_to_be_variable_free(grammar_dictionary)

if self.use_untyped_entities:
update_grammar_with_untyped_entities(grammar_dictionary)

return grammar_dictionary

def is_global_rule(self, production_rule: str) -> bool:
@@ -17,6 +17,19 @@ def setUp(self):
self.database_path = str(self.FIXTURES_ROOT / "data" / "text2sql" / "restaurants.db")


def test_untyped_grammar_has_no_string_or_number_references(self):
    # A world built with untyped entities must not keep the typed
    # ``number`` / ``string`` non-terminals, nor any production using them.
    untyped_world = Text2SqlWorld(self.schema, use_untyped_entities=True)
    typed_nonterminals = {"number", "string"}
    # A bare "string" substring is not checked because string_set is a
    # valid non-terminal; only delimited occurrences are looked for.
    forbidden_fragments = ("number", "string)", "string ", "(string ")

    for nonterminal, productions in untyped_world.base_grammar_dictionary.items():
        assert nonterminal not in typed_nonterminals
        for fragment in forbidden_fragments:
            assert all(fragment not in production for production in productions)

def test_world_modifies_unconstrained_grammar_correctly(self):
world = Text2SqlWorld(self.schema)
grammar_dictionary = world.base_grammar_dictionary

0 comments on commit e4f9131

Please sign in to comment.
You can’t perform that action at this time.