In [1]:
%load_ext google.cloud.bigquery

# Scripting

## A sequence of SQL statements, seperated by ;

In [2]:
%%bigquery

# Create table typical_trip

CREATE OR REPLACE TABLE
  dataflow-templates-327714.bigquery_examples.typical_trip AS
SELECT
  start_station_name,
  end_station_name,
  APPROX_QUANTILES(duration, 10)[
OFFSET
  (5)] AS typical_duration,
  COUNT(*) AS num_trips
FROM
  dataflow-templates-327714.bigquery_examples.cycle_hire
GROUP BY
  start_station_name,
  end_station_name;

# Create table unusual days

CREATE OR REPLACE TABLE
  dataflow-templates-327714.bigquery_examples.unusual_days AS
    SELECT EXTRACT(DATE FROM start_date) AS trip_date,
    APPROX_QUANTILES(duration / typical_duration, 10)[OFFSET(5)] AS ratio,
    COUNT(*) AS num_trips_on_day
    FROM dataflow-templates-327714.bigquery_examples.cycle_hire as hire,
    bigquery_examples.typical_trip AS trip
WHERE
    hire.start_station_name = trip.start_station_name
    AND hire.end_station_name = trip.end_station_name
    AND num_trips > 10


Executing query with job ID: 65ba719e-27d3-4bc3-89ce-b9f6a3072991
Query executing: 10.71s


ERROR:
 400 Query error: SELECT list expression references column start_date which is neither grouped nor aggregated at [22:30]

Location: EU
Job ID: 65ba719e-27d3-4bc3-89ce-b9f6a3072991



## A simple script starts with variables

#### Use case: Find the return stations with longest duration rentals from waterloo

- Here we declare variables to:
   - Represent the pattern
   - Minimum number of trips threshold 
   - Array of strings to store intermediate data related to stations that fit the pattern
- **Variables can be any type supported by BigQuery**

In [14]:
%%bigquery

-- variables
DECLARE
  PATTERN STRING DEFAULT '%Waterloo%';
DECLARE
  stations ARRAY<STRING>;
DECLARE
  MIN_TRIPS_THRESH INT64 DEFAULT 100;
SET
  stations = (
  SELECT
    ARRAY_AGG(name)
  FROM
    dataflow-templates-327714.bigquery_examples.cycle_stations
  WHERE
    name LIKE PATTERN );
    
SELECT
  start_station_name,
  end_station_name,
  AVG(duration) AS avg_duration,
  COUNT(duration) AS num_trips
FROM
  dataflow-templates-327714.bigquery_examples.cycle_hire,
  UNNEST(stations) AS station
WHERE
  start_station_name = station
GROUP BY
  start_station_name,
  end_station_name
HAVING
  num_trips > MIN_TRIPS_THRESH
ORDER BY
  avg_duration DESC
LIMIT
  5;

Query complete after 0.00s: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 351.02query/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.18rows/s]


Unnamed: 0,start_station_name,end_station_name,avg_duration,num_trips
0,"Lambeth North Station, Waterloo","Binfield Road, Stockwell",7204.0,150
1,"Baylis Road, Waterloo","Binfield Road, Stockwell",6520.833333,288
2,"Lower Marsh, Waterloo","Caldwell Street, Stockwell",6420.685714,175
3,"Waterloo Station 2, Waterloo","Waterloo Station 1, Waterloo",5456.953125,256
4,"Waterloo Place, St. James's","Tower Gardens , Tower",4421.320755,212


## Looping

Scripting also supports control flows through IF conditions and a variety of looping primitives. Below example is a simple while loop as a function of the MIN_TRIPS_THRESH variable.

In [16]:
%%bigquery

-- variables
DECLARE
  PATTERN STRING DEFAULT '%Waterloo%';
DECLARE
  stations ARRAY<STRING>;
DECLARE
  MIN_TRIPS_THRESH INT64 DEFAULT 100;
SET
  stations = (
  SELECT
    ARRAY_AGG(name)
  FROM
    dataflow-templates-327714.bigquery_examples.cycle_stations
  WHERE
    name LIKE PATTERN );

WHILE MIN_TRIPS_THRESH < 1000 DO
SELECT
  start_station_name,
  end_station_name,
  AVG(duration) AS avg_duration,
  COUNT(duration) AS num_trips
FROM
  dataflow-templates-327714.bigquery_examples.cycle_hire,
  UNNEST(stations) AS station
WHERE
  start_station_name = station
GROUP BY
  start_station_name,
  end_station_name
HAVING
  num_trips > MIN_TRIPS_THRESH
ORDER BY
  avg_duration DESC
LIMIT
  5;

SET MIN_TRIPS_THRESH = MIN_TRIPS_THRESH * 2;
END WHILE;

Query complete after 0.00s: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 470.11query/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.28rows/s]


Unnamed: 0,start_station_name,end_station_name,avg_duration,num_trips
0,"Waterloo Station 1, Waterloo","Waterloo Station 1, Waterloo",3792.509091,825
1,"Waterloo Place, St. James's","Waterloo Place, St. James's",3071.813691,2834
2,"Waterloo Place, St. James's","Wellington Arch, Hyde Park",1452.331386,1883
3,"Waterloo Station 1, Waterloo","Tower Gardens , Tower",1298.05638,1348
4,"Waterloo Station 1, Waterloo","Brushfield Street, Liverpool Street",1233.096271,1153


### More primitive loops

In [19]:
%%bigquery

-- variable
DECLARE
  MIN_TRIPS_THRESH INT64 DEFAULT 100;
  
-- primitive (classic) loop
LOOP
IF MIN_TRIPS_THRESH >= 1000 THEN
    BREAK;
END IF;

SELECT MIN_TRIPS_THRESH;
SET MIN_TRIPS_THRESH = MIN_TRIPS_THRESH * 2;
END LOOP;

Query complete after 0.00s: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 455.41query/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.53s/rows]


Unnamed: 0,MIN_TRIPS_THRESH
0,800


## Exceptions

### Catching exceptions

In [21]:
%%bigquery

BEGIN
DECLARE
  stations ARRAY<INT64>;
SET
  stations = (
  SELECT
    ARRAY_AGG(CAST(name AS INT64)) names
  FROM
    dataflow-templates-327714.bigquery_examples.cycle_stations
  WHERE
    name LIKE '%Kings%'); EXCEPTION
    WHEN ERROR THEN SELECT @@error.message AS msg, @@error.stack_trace AS trace;
END
  ;

Query complete after 0.00s: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 477.11query/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.17s/rows]


Unnamed: 0,msg,trace
0,"Query error: Bad int64 value: Kingsway, Covent...","[{'line': 4, 'column': 1, 'filename': None, 'l..."


### Raising exceptions

In [None]:
%%bigquery

BEGIN
DECLARE
  stations ARRAY<STRING>;
SET
  stations = (
  SELECT
    ARRAY_AGG(name) names
  FROM
    dataflow-templates-327714.bigquery_examples.cycle_stations
  WHERE
    name LIKE '%Kings%'); -- Replace with another value to trigger exception
    
IF ARRAY_LENGTH(stations) = 0 THEN
   RAISE USING MESSAGE = "No stations matched";
END IF;
  
   EXCEPTION
    WHEN ERROR THEN SELECT @@error.message AS msg, @@error.stack_trace AS trace;
END
  ;

## Dynamic SQL

It is possible to create a string dynamically within a script and execute it using EXECUTE IMMEDIATE

In [None]:
TBD