In [None]:
import os 
import pandas as pd

In [None]:
# Some of the columns we will look at can be quite wide, but it's good to get an idea of what they contain.
# Use the fully-qualified option name: pandas resolves shorthand names by pattern
# matching, which can break if a similarly named option is added in a later release.
# Print the current limit first so the previous value is visible in the output.
print(pd.get_option('display.max_colwidth'))
pd.set_option('display.max_colwidth', 500)

# Page types

Edit the `_TABLE_SUFFIX` date range in the query below to cover the dates you care about.

We want to know what page types different pages are because we care about navigation elements clicked **from a content page** and the total number of journeys that contain **only content pages and related links**.

Initially we wanted to use [User journey supertype](https://docs.publishing.service.gov.uk/document-types/user_journey_document_supertype.html) from [custom dimensions](https://gov-uk.atlassian.net/wiki/spaces/GOVUK/pages/23855552/Analytics+on+GOV.UK#AnalyticsonGOV.UK-customDimensionsCustomdimensions), where each page is either classified as 'finding' or 'thing'. Unfortunately this dimension is poorly populated in BigQuery due to someone disabling it, and occasionally the same page path and page title can be reported as both a 'finding' and a 'thing' at different times (e.g. the GOV.UK homepage).

The next option is to use 'Format' (document type) from [custom dimensions](https://gov-uk.atlassian.net/wiki/spaces/GOVUK/pages/23855552/Analytics+on+GOV.UK#AnalyticsonGOV.UK-customDimensionsCustomdimensions), which is much better populated but also much more granular. You can find a sample of a day's data with pageviews from GA [here](https://docs.google.com/spreadsheets/d/1-jen8DbRgvmvF9aYapmwsFAe0ncHeq5y-ks75iyCvVU/edit#gid=177174931).

*NB - a data dump from the content API is probably the most reliable source for a mapping of page paths to document types and then to finding/thing. However, there's a bit of outlay in understanding how that data can be accessed and how it is structured, so a BigQuery query is the most appropriate first stab at this.*

**TODO** one query to get pagePath, format (or map this to finding/thing), content ID, where content_id != '00000000-0000-0000-0000-000000000000'

In [None]:
# BigQuery project that the query is run (and billed) against.
ProjectID = 'govuk-bigquery-analytics'

# Directory holding the BigQuery key file. It is read from the BQ_KEY_DIR
# environment variable so that no credential location is hard-coded here.
KEY_DIR = os.getenv("BQ_KEY_DIR")
if not KEY_DIR:
    # Without this guard os.listdir(None) would silently list the current
    # directory and os.path.join(None, ...) would fail with an opaque TypeError.
    raise RuntimeError(
        "Environment variable BQ_KEY_DIR is not set; "
        "point it at the directory containing the BigQuery key file.")

# os.listdir() returns entries in arbitrary order and includes hidden files and
# sub-directories, so keep only regular, non-hidden files and sort them to make
# the choice of key file deterministic.
_key_file_names = sorted(
    name for name in os.listdir(KEY_DIR)
    if not name.startswith('.') and os.path.isfile(os.path.join(KEY_DIR, name))
)
if not _key_file_names:
    raise FileNotFoundError(
        "No key file found in BQ_KEY_DIR directory: {}".format(KEY_DIR))

# Full path to the key file handed to BigQuery for authentication.
key_file_path = os.path.join(KEY_DIR, _key_file_names[0])

In [None]:
# BigQuery query (run with dialect='standard') returning one row per pagePath
# with a 0/1 flag `is_finding`.
#
# Inner sub-query: unnests the hits of every session in the ga_sessions_* tables
# whose _TABLE_SUFFIX lies between '20190214' and '20190218' and, for each hit,
# reads pagePath plus two custom dimensions: index 4 as `content_id` and
# index 2 as `document_type`.
#
# Outer query: discards hits whose content_id is NULL, the all-zero id or the
# literal '[object Object]', then groups by pagePath. `is_finding` is 1 when at
# least one remaining hit for that path has a document_type in the list below
# (MAX over the per-hit 1/0), otherwise 0.
#
# NOTE(review): the document_type list presumably mirrors the 'finding' group of
# the user journey supertype described in the markdown above — confirm.
# To look at other dates, change the two _TABLE_SUFFIX bounds; the output
# filename used when saving the extract embeds the same dates.
QUERY = """
    SELECT 
    pagePath,
    MAX(IF(document_type IN ('document_collection',
      'finder',
      'homepage',
      'license_finder',
      'mainstream_browse_page',
      'organisation',
      'search',
      'service_manual_homepage',
      'service_manual_topic',
      'services_and_information',
      'taxon',
      'topic',
      'topical_event'),1,0)) AS is_finding
    FROM
      (SELECT 
          (
        SELECT
          value
        FROM
          hits.customDimensions
        WHERE
          index=4) AS content_id,
          hits.page.pagePath, 
        (
        SELECT
          value
        FROM
          hits.customDimensions
        WHERE
          index=2) AS document_type
      FROM
        `govuk-bigquery-analytics.87773428.ga_sessions_*` AS sessions
      CROSS JOIN
        UNNEST(sessions.hits) AS hits
         WHERE _TABLE_SUFFIX BETWEEN '20190214'
     AND '20190218'
      )
    WHERE
      content_id != '00000000-0000-0000-0000-000000000000'
      AND content_id != '[object Object]'
      AND content_id IS NOT NULL
    GROUP BY 1
"""

In [None]:
# Run QUERY against BigQuery with standard SQL, authenticating with the key
# file located via BQ_KEY_DIR, and load the result into a DataFrame.
# NOTE(review): newer pandas-gbq releases deprecate `private_key` in favour of
# `credentials` — confirm against the installed version before upgrading.
df_finding_thing = pd.read_gbq(
    QUERY,
    project_id=ProjectID,
    dialect='standard',
    private_key=key_file_path,
    reauth=False,
)

In [None]:
# Save the extract so the BigQuery query does not need to be re-run.
# The file is gzip-compressed and tab-separated (despite the .csv extension),
# and the row index is not written.
df_finding_thing.to_csv(
    path_or_buf='../../data/raw_bq_extract/document_types_20190214_20190218.csv.gz',
    sep="\t",
    index=False,
    compression='gzip',
)

In [None]:
# Optional: reload a previously saved extract instead of re-running the query.
# NOTE(review): the path below does not match the dated filename written by the
# `to_csv` cell (document_types_20190214_20190218.csv.gz) — confirm which file
# is intended before uncommenting.
# df_finding_thing = pd.read_csv(
#     '../../data/raw_bq_extract/document_types.csv.gz',
#              sep="\t", compression='gzip')