
fix: SHOW FUNCTIONS for Databricks #14252

Merged 1 commit into apache:master on Apr 23, 2021

Conversation

betodealmeida
Member

SUMMARY

The Databricks DB engine spec is based on the Hive DB engine spec, but the result of `SHOW FUNCTIONS` is slightly different: Hive returns a single column called `tab_name`, while in Databricks it's called `function`.

I changed the `get_function_names` method to allow derived classes to specify a different column name, and also made it more resilient by accepting any column name when only one column is returned.
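The approach described above can be sketched roughly as follows. This is a simplified illustration, not the actual Superset code: the real `get_function_names` receives a database object and fetches the DataFrame itself, and the log messages here are made up for the example. The `_show_functions_column` attribute is the one referenced in the review snippet below; the Databricks override shown is an assumption based on the description.

```python
import logging
from typing import List

import pandas as pd

logger = logging.getLogger(__name__)


class HiveEngineSpec:
    # Column name that Hive's SHOW FUNCTIONS returns.
    _show_functions_column = "tab_name"

    @classmethod
    def get_function_names(cls, df: pd.DataFrame) -> List[str]:
        # Happy path: the expected column is present.
        if cls._show_functions_column in df:
            return df[cls._show_functions_column].tolist()

        # Fallback: a single column with an unexpected name is still
        # usable; log an error so the mismatch shows up in the logs,
        # but return the names anyway.
        columns = df.columns.values.tolist()
        if len(columns) == 1:
            logger.error(
                "Unexpected column from SHOW FUNCTIONS: expected %s, got %s",
                cls._show_functions_column,
                columns[0],
            )
            return df[columns[0]].tolist()

        logger.error("Unable to parse SHOW FUNCTIONS result")
        return []


class DatabricksEngineSpec(HiveEngineSpec):
    # Databricks returns the function names under a different column.
    _show_functions_column = "function"
```

Derived engine specs only need to override the class attribute; the parsing and fallback logic stays in one place.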

BEFORE/AFTER SCREENSHOTS OR ANIMATED GIF

N/A

TEST PLAN

I tested the response from both Hive and Databricks, and also verified that it works when the results have a single column with a name different from the expected one (in that case we still log an error, but return the names).

Response from Databricks with this PR:

{
  "function_names": [
    "!", 
    "!=", 
    "%", 
    "&", 
    "*", 
    "+", 
    "-", 
    "/", 
    "<", 
    "<=", 
    "<=>", 
    "<>", 
    "=", 
    "==", 
    ">", 
    ">=", 
    "^", 
    "abs", 
    "acos", 
    "acosh", 
    "add_months", 
    "aggregate", 
    "and", 
    "any", 
    "approx_count_distinct", 
    "approx_percentile", 
    "array", 
    "array_contains", 
    "array_distinct", 
    "array_except", 
    "array_intersect", 
    "array_join", 
    "array_max", 
    "array_min", 
    "array_position", 
    "array_remove", 
    "array_repeat", 
    "array_sort", 
    "array_union", 
    "arrays_overlap", 
    "arrays_zip", 
    "ascii", 
    "asin", 
    "asinh", 
    "assert_true", 
    "atan", 
    "atan2", 
    "atanh", 
    "avg", 
    "base64", 
    "between", 
    "bigint", 
    "bin", 
    "binary", 
    "bit_and", 
    "bit_count", 
    "bit_length", 
    "bit_or", 
    "bit_xor", 
    "bool_and", 
    "bool_or", 
    "boolean", 
    "bround", 
    "cardinality", 
    "case", 
    "cast", 
    "cbrt", 
    "ceil", 
    "ceiling", 
    "char", 
    "char_length", 
    "character_length", 
    "charindex", 
    "chr", 
    "coalesce", 
    "collect_list", 
    "collect_set", 
    "concat", 
    "concat_ws", 
    "conv", 
    "corr", 
    "cos", 
    "cosh", 
    "cot", 
    "count", 
    "count_if", 
    "count_min_sketch", 
    "covar_pop", 
    "covar_samp", 
    "crc32", 
    "cube", 
    "cume_dist", 
    "current_database", 
    "current_date", 
    "current_timestamp", 
    "current_user", 
    "date", 
    "date_add", 
    "date_format", 
    "date_part", 
    "date_sub", 
    "date_trunc", 
    "datediff", 
    "day", 
    "dayofmonth", 
    "dayofweek", 
    "dayofyear", 
    "decimal", 
    "decode", 
    "degrees", 
    "dense_rank", 
    "div", 
    "double", 
    "e", 
    "element_at", 
    "elt", 
    "encode", 
    "every", 
    "exists", 
    "exp", 
    "explode", 
    "explode_outer", 
    "expm1", 
    "extract", 
    "factorial", 
    "filter", 
    "find_in_set", 
    "first", 
    "first_value", 
    "flatten", 
    "float", 
    "floor", 
    "forall", 
    "format_number", 
    "format_string", 
    "from_csv", 
    "from_json", 
    "from_unixtime", 
    "from_utc_timestamp", 
    "get_json_object", 
    "greatest", 
    "grouping", 
    "grouping_id", 
    "hash", 
    "hex", 
    "hour", 
    "hypot", 
    "if", 
    "iff", 
    "ifnull", 
    "in", 
    "initcap", 
    "inline", 
    "inline_outer", 
    "input_file_block_length", 
    "input_file_block_start", 
    "input_file_name", 
    "instr", 
    "int", 
    "is_member", 
    "isnan", 
    "isnotnull", 
    "isnull", 
    "java_method", 
    "json_tuple", 
    "kurtosis", 
    "lag", 
    "last", 
    "last_day", 
    "last_value", 
    "lcase", 
    "lead", 
    "least", 
    "left", 
    "length", 
    "levenshtein", 
    "like", 
    "ln", 
    "locate", 
    "log", 
    "log10", 
    "log1p", 
    "log2", 
    "lower", 
    "lpad", 
    "ltrim", 
    "make_date", 
    "make_interval", 
    "make_timestamp", 
    "map", 
    "map_concat", 
    "map_entries", 
    "map_filter", 
    "map_from_arrays", 
    "map_from_entries", 
    "map_keys", 
    "map_values", 
    "map_zip_with", 
    "max", 
    "max_by", 
    "md5", 
    "mean", 
    "min", 
    "min_by", 
    "minute", 
    "mod", 
    "monotonically_increasing_id", 
    "month", 
    "months_between", 
    "named_struct", 
    "nanvl", 
    "negative", 
    "next_day", 
    "not", 
    "now", 
    "ntile", 
    "nullif", 
    "nvl", 
    "nvl2", 
    "octet_length", 
    "or", 
    "overlay", 
    "parse_url", 
    "percent_rank", 
    "percentile", 
    "percentile_approx", 
    "pi", 
    "pmod", 
    "posexplode", 
    "posexplode_outer", 
    "position", 
    "positive", 
    "pow", 
    "power", 
    "printf", 
    "quarter", 
    "radians", 
    "rand", 
    "randn", 
    "random", 
    "rank", 
    "reduce", 
    "reflect", 
    "regexp_extract", 
    "regexp_replace", 
    "repeat", 
    "replace", 
    "reverse", 
    "right", 
    "rint", 
    "rlike", 
    "rollup", 
    "round", 
    "row_number", 
    "rpad", 
    "rtrim", 
    "schema_of_csv", 
    "schema_of_json", 
    "second", 
    "sentences", 
    "sequence", 
    "sha", 
    "sha1", 
    "sha2", 
    "shiftleft", 
    "shiftright", 
    "shiftrightunsigned", 
    "shuffle", 
    "sign", 
    "signum", 
    "sin", 
    "sinh", 
    "size", 
    "skewness", 
    "slice", 
    "smallint", 
    "some", 
    "sort_array", 
    "soundex", 
    "space", 
    "spark_partition_id", 
    "split", 
    "sql_dw_from_utc_timestamp", 
    "sql_dw_to_utc_timestamp", 
    "sqrt", 
    "stack", 
    "std", 
    "stddev", 
    "stddev_pop", 
    "stddev_samp", 
    "str_to_map", 
    "string", 
    "struct", 
    "substr", 
    "substring", 
    "substring_index", 
    "sum", 
    "tan", 
    "tanh", 
    "timestamp", 
    "tinyint", 
    "to_csv", 
    "to_date", 
    "to_json", 
    "to_timestamp", 
    "to_unix_timestamp", 
    "to_utc_timestamp", 
    "transform", 
    "transform_keys", 
    "transform_values", 
    "translate", 
    "trim", 
    "trunc", 
    "typeof", 
    "ucase", 
    "unbase64", 
    "unhex", 
    "unix_timestamp", 
    "upper", 
    "uuid", 
    "var_pop", 
    "var_samp", 
    "variance", 
    "version", 
    "weekday", 
    "weekofyear", 
    "when", 
    "width_bucket", 
    "window", 
    "xpath", 
    "xpath_boolean", 
    "xpath_double", 
    "xpath_float", 
    "xpath_int", 
    "xpath_long", 
    "xpath_number", 
    "xpath_short", 
    "xpath_string", 
    "xxhash64", 
    "year", 
    "zip_with", 
    "|", 
    "~"
  ]
}

Response from Hive:

{
  "function_names": [
    "!", 
    "!=", 
    "$sum0", 
    "%", 
    "&", 
    "*", 
    "+", 
    "-", 
    "/", 
    "<", 
    "<=", 
    "<=>", 
    "<>", 
    "=", 
    "==", 
    ">", 
    ">=", 
    "^", 
    "abs", 
    "acos", 
    "add_months", 
    "aes_decrypt", 
    "aes_encrypt", 
    "and", 
    "array", 
    "array_contains", 
    "ascii", 
    "asin", 
    "assert_true", 
    "atan", 
    "avg", 
    "base64", 
    "between", 
    "bin", 
    "bloom_filter", 
    "bround", 
    "cardinality_violation", 
    "case", 
    "cbrt", 
    "ceil", 
    "ceiling", 
    "char_length", 
    "character_length", 
    "chr", 
    "coalesce", 
    "collect_list", 
    "collect_set", 
    "compute_stats", 
    "concat", 
    "concat_ws", 
    "context_ngrams", 
    "conv", 
    "corr", 
    "cos", 
    "count", 
    "covar_pop", 
    "covar_samp", 
    "crc32", 
    "create_union", 
    "cume_dist", 
    "current_database", 
    "current_date", 
    "current_timestamp", 
    "current_user", 
    "date_add", 
    "date_format", 
    "date_sub", 
    "datediff", 
    "day", 
    "dayofmonth", 
    "dayofweek", 
    "decode", 
    "degrees", 
    "dense_rank", 
    "div", 
    "e", 
    "elt", 
    "encode", 
    "ewah_bitmap", 
    "ewah_bitmap_and", 
    "ewah_bitmap_empty", 
    "ewah_bitmap_or", 
    "exp", 
    "explode", 
    "extract_union", 
    "factorial", 
    "field", 
    "find_in_set", 
    "first_value", 
    "floor", 
    "floor_day", 
    "floor_hour", 
    "floor_minute", 
    "floor_month", 
    "floor_quarter", 
    "floor_second", 
    "floor_week", 
    "floor_year", 
    "format_number", 
    "from_unixtime", 
    "from_utc_timestamp", 
    "get_json_object", 
    "get_splits", 
    "greatest", 
    "grouping", 
    "hash", 
    "hex", 
    "histogram_numeric", 
    "hour", 
    "if", 
    "in", 
    "in_bloom_filter", 
    "in_file", 
    "index", 
    "initcap", 
    "inline", 
    "instr", 
    "internal_interval", 
    "isnotnull", 
    "isnull", 
    "java_method", 
    "json_tuple", 
    "lag", 
    "last_day", 
    "last_value", 
    "lcase", 
    "lead", 
    "least", 
    "length", 
    "levenshtein", 
    "like", 
    "ln", 
    "locate", 
    "log", 
    "log10", 
    "log2", 
    "logged_in_user", 
    "lower", 
    "lpad", 
    "ltrim", 
    "map", 
    "map_keys", 
    "map_values", 
    "mask", 
    "mask_first_n", 
    "mask_hash", 
    "mask_last_n", 
    "mask_show_first_n", 
    "mask_show_last_n", 
    "matchpath", 
    "max", 
    "md5", 
    "min", 
    "minute", 
    "mod", 
    "month", 
    "months_between", 
    "named_struct", 
    "negative", 
    "next_day", 
    "ngrams", 
    "noop", 
    "noopstreaming", 
    "noopwithmap", 
    "noopwithmapstreaming", 
    "not", 
    "ntile", 
    "nullif", 
    "nvl", 
    "octet_length", 
    "or", 
    "parse_url", 
    "parse_url_tuple", 
    "percent_rank", 
    "percentile", 
    "percentile_approx", 
    "pi", 
    "pmod", 
    "posexplode", 
    "positive", 
    "pow", 
    "power", 
    "printf", 
    "quarter", 
    "radians", 
    "rand", 
    "rank", 
    "reflect", 
    "reflect2", 
    "regexp", 
    "regexp_extract", 
    "regexp_replace", 
    "regr_avgx", 
    "regr_avgy", 
    "regr_count", 
    "regr_intercept", 
    "regr_r2", 
    "regr_slope", 
    "regr_sxx", 
    "regr_sxy", 
    "regr_syy", 
    "repeat", 
    "replace", 
    "replicate_rows", 
    "reverse", 
    "rlike", 
    "round", 
    "row_number", 
    "rpad", 
    "rtrim", 
    "second", 
    "sentences", 
    "sha", 
    "sha1", 
    "sha2", 
    "shiftleft", 
    "shiftright", 
    "shiftrightunsigned", 
    "sign", 
    "sin", 
    "size", 
    "sort_array", 
    "sort_array_by", 
    "soundex", 
    "space", 
    "split", 
    "sq_count_check", 
    "sqrt", 
    "stack", 
    "std", 
    "stddev", 
    "stddev_pop", 
    "stddev_samp", 
    "str_to_map", 
    "struct", 
    "substr", 
    "substring", 
    "substring_index", 
    "sum", 
    "tan", 
    "to_date", 
    "to_unix_timestamp", 
    "to_utc_timestamp", 
    "translate", 
    "trim", 
    "trunc", 
    "ucase", 
    "unbase64", 
    "unhex", 
    "unix_timestamp", 
    "upper", 
    "uuid", 
    "var_pop", 
    "var_samp", 
    "variance", 
    "version", 
    "weekofyear", 
    "when", 
    "windowingtablefunction", 
    "xpath", 
    "xpath_boolean", 
    "xpath_double", 
    "xpath_float", 
    "xpath_int", 
    "xpath_long", 
    "xpath_number", 
    "xpath_short", 
    "xpath_string", 
    "year", 
    "|", 
    "~"
  ]
}

ADDITIONAL INFORMATION

  • Has associated issue:
  • Changes UI
  • Includes DB Migration (follow approval process in SIP-59)
    • Migration is atomic, supports rollback & is backwards-compatible
    • Confirm DB migration upgrade and downgrade tested
    • Runtime estimates and downtime expectations provided
  • Introduces new feature or API
  • Removes existing feature or API

@betodealmeida betodealmeida changed the title WIP fix: SHOW FUNCTIONS for Databricks Apr 20, 2021
codecov bot commented Apr 20, 2021

Codecov Report

Merging #14252 (bb7b6c4) into master (13d4902) will decrease coverage by 0.07%.
The diff coverage is 20.00%.

❗ Current head bb7b6c4 differs from pull request most recent head 4ff3c87. Consider uploading reports for the commit 4ff3c87 to get more accurate results

@@            Coverage Diff             @@
##           master   #14252      +/-   ##
==========================================
- Coverage   76.96%   76.89%   -0.08%     
==========================================
  Files         952      952              
  Lines       48043    48052       +9     
  Branches     5978     5978              
==========================================
- Hits        36977    36949      -28     
- Misses      10864    10901      +37     
  Partials      202      202              
Flag Coverage Δ
hive 80.41% <20.00%> (-0.03%) ⬇️
mysql 80.70% <20.00%> (-0.03%) ⬇️
postgres 80.72% <20.00%> (?)
presto ?
python 81.12% <20.00%> (-0.14%) ⬇️
sqlite 80.34% <20.00%> (-0.03%) ⬇️

Flags with carried forward coverage won't be shown.

Impacted Files Coverage Δ
superset/db_engine_specs/hive.py 88.51% <11.11%> (-2.33%) ⬇️
superset/db_engine_specs/databricks.py 100.00% <100.00%> (ø)
superset/db_engine_specs/presto.py 84.42% <0.00%> (-5.90%) ⬇️
superset/connectors/sqla/models.py 88.61% <0.00%> (-1.46%) ⬇️
superset/models/core.py 88.85% <0.00%> (-0.28%) ⬇️
superset/views/base_api.py 98.28% <0.00%> (+0.42%) ⬆️
superset/db_engine_specs/postgres.py 96.84% <0.00%> (+1.05%) ⬆️
superset/sql_validators/postgres.py 100.00% <0.00%> (+50.00%) ⬆️


Legend: Δ = absolute <relative> (impact), ø = not affected, ? = missing data
Last update 13d4902...4ff3c87.

return df[cls._show_functions_column].tolist()

columns = df.columns.values.tolist()
logger.error(
Member

When this function is called in core.py the exceptions are caught and logged there. What's the philosophy in this case? Should we catch/log early or let errors bubble up and be caught in one place?

Member Author

In this case I wanted to log here where we have more context, so that we can quickly fix the problem when we see it in the logs. Also, even if the result is not as expected we can still try our best and use the results if they have a single column.

@eschutho eschutho (Member) left a comment

LGTM

@betodealmeida betodealmeida merged commit 919fd49 into apache:master Apr 23, 2021
QAlexBall pushed a commit to QAlexBall/superset that referenced this pull request Dec 29, 2021
@mistercrunch mistercrunch added labels 🏷️ bot (used by `supersetbot` to track which PRs were auto-tagged with release labels) and 🚢 1.2.0 on Mar 12, 2024
4 participants