From 4e2a6b2ff1ee90d8c22d962ce96a48c821a083a8 Mon Sep 17 00:00:00 2001 From: iijima_satoshi Date: Tue, 17 Apr 2018 00:14:18 +0900 Subject: [PATCH 1/4] Enable query prefix syntax in bigquery interpreter --- bigquery/README.md | 85 ------------------- .../bigquery/BigQueryInterpreter.java | 19 ++++- .../main/resources/interpreter-setting.json | 13 +-- .../bigquery/BigQueryInterpreterTest.java | 11 ++- docs/interpreter/bigquery.md | 5 ++ 5 files changed, 35 insertions(+), 98 deletions(-) diff --git a/bigquery/README.md b/bigquery/README.md index fc097631a29..0dff5feb7c8 100644 --- a/bigquery/README.md +++ b/bigquery/README.md @@ -1,10 +1,6 @@ # Overview BigQuery interpreter for Apache Zeppelin -# Pre requisities -You can follow the instructions at [Apache Zeppelin on Dataproc](https://github.com/GoogleCloudPlatform/dataproc-initialization-actions/blob/master/apache-zeppelin/README.MD) to bring up Zeppelin on Google dataproc. -You could also install and bring up Zeppelin on Google compute Engine. - # Unit Tests BigQuery Unit tests are excluded as these tests depend on the BigQuery external service. This is because BigQuery does not have a local mock at this point. @@ -14,34 +10,6 @@ If you like to run these tests manually, please follow the following steps: * Copy the project ID that you created and add it to the property "projectId" in `resources/constants.json` * Run the command mvn -Dbigquery.text.exclude='' test -pl bigquery -am - -# Interpreter Configuration - -Configure the following properties during Interpreter creation. - - - - - - - - - - - - - - - - - - - - - - -
NameDefault ValueDescription
zeppelin.bigquery.project_id Google Project Id
zeppelin.bigquery.wait_time5000Query Timeout in Milliseconds
zeppelin.bigquery.max_no_of_rows100000Max result set size
- # Connection The Interpreter opens a connection with the BigQuery Service using the supplied Google project ID and the compute environment variables. @@ -51,59 +19,6 @@ The Interpreter opens a connection with the BigQuery Service using the supplied We have used the curated veneer version of the Java APIs versus [Idiomatic Java client] (https://github.com/GoogleCloudPlatform/gcloud-java/tree/master/gcloud-java-bigquery) to build the interpreter. This is mainly for usability reasons. -# Enabling the BigQuery Interpreter - -In a notebook, to enable the **BigQuery** interpreter, click the **Gear** icon and select **bigquery**. - -# Using the BigQuery Interpreter - -In a paragraph, use `%bigquery.sql` to select the **BigQuery** interpreter and then input SQL statements against your datasets stored in BigQuery. -You can use [BigQuery SQL Reference](https://cloud.google.com/bigquery/query-reference) to build your own SQL. - -For Example, SQL to query for top 10 departure delays across airports using the flights public dataset - -```bash -%bigquery.sql -SELECT departure_airport,count(case when departure_delay>0 then 1 else 0 end) as no_of_delays -FROM [bigquery-samples:airline_ontime_data.flights] -group by departure_airport -order by 2 desc -limit 10 -``` - -Another Example, SQL to query for most commonly used java packages from the github data hosted in BigQuery - -```bash -%bigquery.sql -SELECT - package, - COUNT(*) count -FROM ( - SELECT - REGEXP_EXTRACT(line, r' ([a-z0-9\._]*)\.') package, - id - FROM ( - SELECT - SPLIT(content, '\n') line, - id - FROM - [bigquery-public-data:github_repos.sample_contents] - WHERE - content CONTAINS 'import' - AND sample_path LIKE '%.java' - HAVING - LEFT(line, 6)='import' ) - GROUP BY - package, - id ) -GROUP BY - 1 -ORDER BY - count DESC -LIMIT - 40 -``` - # Sample Screenshot ![Zeppelin BigQuery](https://cloud.githubusercontent.com/assets/10060731/16938817/b9213ea0-4db6-11e6-8c3b-8149a0bdf874.png) diff --git a/bigquery/src/main/java/org/apache/zeppelin/bigquery/BigQueryInterpreter.java b/bigquery/src/main/java/org/apache/zeppelin/bigquery/BigQueryInterpreter.java index b56f63c1ae5..5c4c2cf190b 100644 --- a/bigquery/src/main/java/org/apache/zeppelin/bigquery/BigQueryInterpreter.java +++ b/bigquery/src/main/java/org/apache/zeppelin/bigquery/BigQueryInterpreter.java @@ -89,7 +89,7 @@ public class BigQueryInterpreter extends Interpreter { static final String PROJECT_ID = "zeppelin.bigquery.project_id"; static final String WAIT_TIME = "zeppelin.bigquery.wait_time"; static final String MAX_ROWS = "zeppelin.bigquery.max_no_of_rows"; - static final String LEGACY_SQL = "zeppelin.bigquery.use_legacy_sql"; + static final String SQL_DIALECT = "zeppelin.bigquery.sql_dialect"; private static String jobId = null; private static String projectId = null; @@ -226,8 +226,19 @@ private InterpreterResult executeSql(String sql) { String projId = getProperty(PROJECT_ID); long wTime = Long.parseLong(getProperty(WAIT_TIME)); long maxRows = Long.parseLong(getProperty(MAX_ROWS)); - String legacySql = getProperty(LEGACY_SQL); - boolean useLegacySql = legacySql == null ? true : Boolean.parseBoolean(legacySql); + String sqlDialect = getProperty(SQL_DIALECT, ""); + Boolean useLegacySql; + switch (sqlDialect) { + case "standardSQL": + useLegacySql = false; + break; + case "legacySQL": + useLegacySql = true; + break; + default: + // Enable query prefix like '#standardSQL' if specified + useLegacySql = null; + } Iterator pages; try { pages = run(sql, projId, wTime, maxRows, useLegacySql); @@ -247,7 +258,7 @@ private InterpreterResult executeSql(String sql) { //Function to run the SQL on bigQuery service public static Iterator run(final String queryString, - final String projId, final long wTime, final long maxRows, boolean useLegacySql) + final String projId, final long wTime, final long maxRows, Boolean useLegacySql) throws IOException { try { logger.info("Use legacy sql: {}", useLegacySql); diff --git a/bigquery/src/main/resources/interpreter-setting.json b/bigquery/src/main/resources/interpreter-setting.json index 8e92ee43620..1cc567e1aa8 100644 --- a/bigquery/src/main/resources/interpreter-setting.json +++ b/bigquery/src/main/resources/interpreter-setting.json @@ -22,14 +22,15 @@ "envName": null, "propertyName": "zeppelin.bigquery.max_no_of_rows", "defaultValue": "100000", - "description": "Maximum number of rows to fetch from BigQuery" + "description": "Maximum number of rows to fetch from BigQuery", + "type": "number" }, - "zeppelin.bigquery.use_legacy_sql": { + "zeppelin.bigquery.sql_dialect": { "envName": null, - "propertyName": "zeppelin.bigquery.use_legacy_sql", - "defaultValue": "true", - "description": "set true to use legacy sql", - "type": "checkbox" + "propertyName": "zeppelin.bigquery.sql_dialect", + "defaultValue": "", + "description": "Bigquery SQL dialect (standardSQL or legacySQL). If empty, query prefix like '#standardSQL' can be used.", + "type": "string" } }, "editor": { diff --git a/bigquery/src/test/java/org/apache/zeppelin/bigquery/BigQueryInterpreterTest.java b/bigquery/src/test/java/org/apache/zeppelin/bigquery/BigQueryInterpreterTest.java index 04676abd7e0..9dcd9f8c61d 100644 --- a/bigquery/src/test/java/org/apache/zeppelin/bigquery/BigQueryInterpreterTest.java +++ b/bigquery/src/test/java/org/apache/zeppelin/bigquery/BigQueryInterpreterTest.java @@ -74,6 +74,7 @@ public void setUp() throws Exception { p.setProperty("zeppelin.bigquery.project_id", constants.getProjectId()); p.setProperty("zeppelin.bigquery.wait_time", "5000"); p.setProperty("zeppelin.bigquery.max_no_of_rows", "100"); + p.setProperty("zeppelin.bigquery.sql_dialect", ""); intpGroup = new InterpreterGroup(); @@ -85,7 +86,6 @@ public void setUp() throws Exception { @Test public void sqlSuccess() { InterpreterResult ret = bqInterpreter.interpret(constants.getOne(), context); - assertEquals(InterpreterResult.Code.SUCCESS, ret.code()); assertEquals(ret.message().get(0).getType(), InterpreterResult.Type.TABLE); } @@ -93,14 +93,19 @@ public void sqlSuccess() { @Test public void badSqlSyntaxFails() { InterpreterResult ret = bqInterpreter.interpret(constants.getWrong(), context); - assertEquals(InterpreterResult.Code.ERROR, ret.code()); } + @Test + public void testWithQueryPrefix() { + InterpreterResult ret = bqInterpreter.interpret( + "#standardSQL\n WITH t AS (select 1) SELECT * FROM t", context); + assertEquals(InterpreterResult.Code.SUCCESS, ret.code()); + } + @Test public void testInterpreterOutputData() { InterpreterResult ret = bqInterpreter.interpret("SELECT 1 AS col1, 2 AS col2", context); - String[] lines = ret.message().get(0).getData().split("\\n"); assertEquals(2, lines.length); assertEquals("col1\tcol2", lines[0]); diff --git a/docs/interpreter/bigquery.md b/docs/interpreter/bigquery.md index 1b90f99357a..e3027997b57 100644 --- a/docs/interpreter/bigquery.md +++ b/docs/interpreter/bigquery.md @@ -48,6 +48,11 @@ limitations under the License. 100000 Max result set size + + zeppelin.bigquery.sql_dialect + + Bigquery SQL dialect (standardSQL or legacySQL). If empty, query prefix like '#standardSQL' can be used. + From 809f561cf5c9ad80e5416020c9bc1cefbbcc2651 Mon Sep 17 00:00:00 2001 From: iijima_satoshi Date: Sun, 22 Apr 2018 23:23:17 +0900 Subject: [PATCH 2/4] Address comments --- .../bigquery/BigQueryInterpreter.java | 20 ++++++++----------- docs/interpreter/bigquery.md | 2 +- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/bigquery/src/main/java/org/apache/zeppelin/bigquery/BigQueryInterpreter.java b/bigquery/src/main/java/org/apache/zeppelin/bigquery/BigQueryInterpreter.java index 5c4c2cf190b..e15ebfb2be9 100644 --- a/bigquery/src/main/java/org/apache/zeppelin/bigquery/BigQueryInterpreter.java +++ b/bigquery/src/main/java/org/apache/zeppelin/bigquery/BigQueryInterpreter.java @@ -226,19 +226,15 @@ private InterpreterResult executeSql(String sql) { String projId = getProperty(PROJECT_ID); long wTime = Long.parseLong(getProperty(WAIT_TIME)); long maxRows = Long.parseLong(getProperty(MAX_ROWS)); - String sqlDialect = getProperty(SQL_DIALECT, ""); - Boolean useLegacySql; - switch (sqlDialect) { - case "standardSQL": - useLegacySql = false; - break; - case "legacySQL": - useLegacySql = true; - break; - default: - // Enable query prefix like '#standardSQL' if specified - useLegacySql = null; + String sqlDialect = getProperty(SQL_DIALECT, "").toLowerCase(); + // Query prefix like '#standardSQL' can be used in case of null + Boolean useLegacySql = null; + if (sqlDialect.contains("standard")) { + useLegacySql = false; + } else if (sqlDialect.contains("legacy")) { + useLegacySql = true; } + Iterator pages; try { pages = run(sql, projId, wTime, maxRows, useLegacySql); diff --git a/docs/interpreter/bigquery.md b/docs/interpreter/bigquery.md index e3027997b57..06aee372395 100644 --- a/docs/interpreter/bigquery.md +++ b/docs/interpreter/bigquery.md @@ -51,7 +51,7 @@ limitations under the License. zeppelin.bigquery.sql_dialect - Bigquery SQL dialect (standardSQL or legacySQL). If empty, query prefix like '#standardSQL' can be used. + Bigquery SQL dialect (standardSQL or legacySQL). If empty, [query prefix](https://cloud.google.com/bigquery/docs/reference/standard-sql/enabling-standard-sql#sql-prefix) like '#standardSQL' can be used. From f88cc429f39d6e282d60ce027ba19dc7bd8d9bf4 Mon Sep 17 00:00:00 2001 From: iijima_satoshi Date: Mon, 23 Apr 2018 17:09:33 +0900 Subject: [PATCH 3/4] Address comments --- .../zeppelin/bigquery/BigQueryInterpreter.java | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/bigquery/src/main/java/org/apache/zeppelin/bigquery/BigQueryInterpreter.java b/bigquery/src/main/java/org/apache/zeppelin/bigquery/BigQueryInterpreter.java index e15ebfb2be9..0973fda0df2 100644 --- a/bigquery/src/main/java/org/apache/zeppelin/bigquery/BigQueryInterpreter.java +++ b/bigquery/src/main/java/org/apache/zeppelin/bigquery/BigQueryInterpreter.java @@ -227,14 +227,18 @@ private InterpreterResult executeSql(String sql) { long wTime = Long.parseLong(getProperty(WAIT_TIME)); long maxRows = Long.parseLong(getProperty(MAX_ROWS)); String sqlDialect = getProperty(SQL_DIALECT, "").toLowerCase(); - // Query prefix like '#standardSQL' can be used in case of null - Boolean useLegacySql = null; - if (sqlDialect.contains("standard")) { - useLegacySql = false; - } else if (sqlDialect.contains("legacy")) { - useLegacySql = true; + Boolean useLegacySql; + switch (sqlDialect) { + case "standardsql": + useLegacySql = false; + break; + case "legacysql": + useLegacySql = true; + break; + default: + // Enable query prefix like '#standardSQL' if specified + useLegacySql = null; } - Iterator pages; try { pages = run(sql, projId, wTime, maxRows, useLegacySql); From e5b1acd7f32c610b6918e0bce4c57e891f686499 Mon Sep 17 00:00:00 2001 From: iijima_satoshi Date: Fri, 27 Apr 2018 16:47:40 +0900 Subject: [PATCH 4/4] Address comments --- bigquery/src/main/resources/interpreter-setting.json | 2 +- docs/interpreter/bigquery.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigquery/src/main/resources/interpreter-setting.json b/bigquery/src/main/resources/interpreter-setting.json index 1cc567e1aa8..8023bed1522 100644 --- a/bigquery/src/main/resources/interpreter-setting.json +++ b/bigquery/src/main/resources/interpreter-setting.json @@ -29,7 +29,7 @@ "envName": null, "propertyName": "zeppelin.bigquery.sql_dialect", "defaultValue": "", - "description": "Bigquery SQL dialect (standardSQL or legacySQL). If empty, query prefix like '#standardSQL' can be used.", + "description": "BigQuery SQL dialect (standardSQL or legacySQL). If empty, query prefix like '#standardSQL' can be used.", "type": "string" } }, diff --git a/docs/interpreter/bigquery.md b/docs/interpreter/bigquery.md index 06aee372395..cdac762f6db 100644 --- a/docs/interpreter/bigquery.md +++ b/docs/interpreter/bigquery.md @@ -51,7 +51,7 @@ limitations under the License. zeppelin.bigquery.sql_dialect - Bigquery SQL dialect (standardSQL or legacySQL). If empty, [query prefix](https://cloud.google.com/bigquery/docs/reference/standard-sql/enabling-standard-sql#sql-prefix) like '#standardSQL' can be used. + BigQuery SQL dialect (standardSQL or legacySQL). If empty, [query prefix](https://cloud.google.com/bigquery/docs/reference/standard-sql/enabling-standard-sql#sql-prefix) like '#standardSQL' can be used.