diff --git a/table/queries/merged_reduced_scans.sql b/table/queries/merged_reduced_scans.sql index 96779bad..cd4cebd8 100644 --- a/table/queries/merged_reduced_scans.sql +++ b/table/queries/merged_reduced_scans.sql @@ -12,6 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Only process the most recent year of hyperquack data (date >= earliest_date) to save on query costs. +# TODO: remove this once we are able to run appending queries instead. +# Not needed for satellite since the pipeline already outputs restricted data. +DECLARE earliest_date DATE; +SET earliest_date = DATE_SUB(CURRENT_DATE, INTERVAL 1 YEAR); + CREATE TEMP FUNCTION AddOutcomeEmoji(outcome STRING) AS ( CASE WHEN STARTS_WITH(outcome, "setup/") THEN CONCAT("❔", outcome) @@ -26,6 +32,7 @@ CREATE TEMP FUNCTION AddOutcomeEmoji(outcome STRING) AS ( END ); + # BASE_DATASET and DERIVED_DATASET are reserved dataset placeholder names # which will be replaced when running the query @@ -47,15 +54,19 @@ AS ( WITH AllScans AS ( SELECT * EXCEPT (source), "DISCARD" AS source FROM `PROJECT_NAME.BASE_DATASET.discard_scan` + WHERE date >= earliest_date UNION ALL SELECT * EXCEPT (source), "ECHO" AS source FROM `PROJECT_NAME.BASE_DATASET.echo_scan` + WHERE date >= earliest_date UNION ALL SELECT * EXCEPT (source), "HTTP" AS source FROM `PROJECT_NAME.BASE_DATASET.http_scan` + WHERE date >= earliest_date UNION ALL SELECT * EXCEPT (source), "HTTPS" AS source FROM `PROJECT_NAME.BASE_DATASET.https_scan` + WHERE date >= earliest_date ), Grouped AS ( SELECT date,