In [9]:
import gzip
import logging
import os
import sys

import pandas as pd

### Load environment vars and directories

In [6]:
# Locate the BigQuery service-account key under $DOCUMENTS/govuk-network-data/key.
# Index os.environ directly so a missing variable fails with a clear KeyError
# rather than an opaque TypeError from os.path.join(None, ...).
KEY_DIR = os.path.join(os.environ["DOCUMENTS"], "govuk-network-data", "key")

# os.listdir returns entries in arbitrary order, so sort the names and skip
# hidden files (e.g. .DS_Store) to make the chosen key file deterministic.
_key_files = sorted(name for name in os.listdir(KEY_DIR) if not name.startswith("."))
if not _key_files:
    raise FileNotFoundError("No key file found in {}".format(KEY_DIR))
KEY_PATH = os.path.join(KEY_DIR, _key_files[0])

# BigQuery project that the query job is run against.
PROJECT_ID = "govuk-bigquery-analytics"

#### Logging for `pandas_gbq`

In [10]:
# Route pandas_gbq's DEBUG-level progress messages to the notebook's stdout.
logger = logging.getLogger('pandas_gbq')
logger.setLevel(logging.DEBUG)

# Loggers are process-wide singletons: re-running this cell would otherwise
# attach another handler each time and print every message multiple times.
_has_stdout_handler = any(
    isinstance(handler, logging.StreamHandler)
    and getattr(handler, "stream", None) is sys.stdout
    for handler in logger.handlers
)
if not _has_stdout_handler:
    logger.addHandler(logging.StreamHandler(stream=sys.stdout))

### Extract page-hit-only user journeys for 11–18 February 2019
8.8 GB

In [7]:
# BigQuery standard-SQL query that builds page-hit-only user journeys.
# Reading from the innermost SELECT outwards:
#   1. Unnest `hits` from the ga_sessions_* tables whose suffix lies between
#      '20190211' and '20190218' (BETWEEN is inclusive), keeping the visitor,
#      visit and hit identifiers plus the page path and hit type.
#   2. For every hit row, attach window aggregates over the
#      (fullVisitorId, visitId, visitStartTime) partition with ORDER BY hitNumber:
#      PageSequence is the ">>"-joined pagePath of hits with type 'PAGE'
#      (other hit types contribute NULL), and PageSeq_Length is the count of
#      'PAGE' hits. sessionId concatenates fullVisitorId, visitId and visitNumber.
#   3. Keep rows with PageSeq_Length > 1 and GROUP BY all three columns, which
#      collapses the per-hit duplicates to one row per session/sequence.
#   4. Count how many of those rows share each (PageSequence, PageSeq_Length)
#      pair; that count is returned as Occurrences.
query = """SELECT
  COUNT(*) AS Occurrences,
  PageSeq_Length,
  PageSequence
FROM (
  SELECT
    *
  FROM (
    SELECT
      CONCAT(fullVisitorId,"-",CAST(visitId AS STRING),"-",CAST(visitNumber AS STRING)) AS sessionId,
      STRING_AGG(IF(htype = 'PAGE',
          pagePath,
          NULL),">>") OVER (PARTITION BY fullVisitorId, visitId, visitStartTime ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSequence,
      SUM(IF(htype='PAGE',
          1,
          0)) OVER (PARTITION BY fullVisitorId, visitId, visitStartTime ORDER BY hitNumber ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS PageSeq_Length
    FROM (
      SELECT
        fullVisitorId,
        visitId,
        visitNumber,
        visitStartTime,
        hits.page.pagePath AS pagePath,
        hits.hitNumber AS hitNumber,
        hits.type AS htype
      FROM
        `govuk-bigquery-analytics.87773428.ga_sessions_*` AS sessions
      CROSS JOIN
        UNNEST(sessions.hits) AS hits 
       WHERE _TABLE_SUFFIX BETWEEN '20190211' AND '20190218' ) )
  WHERE
    PageSeq_Length >1
  GROUP BY
    sessionId,
    PageSequence,
    PageSeq_Length)
GROUP BY
  PageSequence,
  PageSeq_Length"""

### Extract data from BigQuery

In [11]:
# Run the journey query in BigQuery and load the result into a DataFrame.
# NOTE(review): newer pandas-gbq releases appear to deprecate `verbose` and
# `private_key` — confirm against the installed version before upgrading.
gbq_options = dict(
    project_id=PROJECT_ID,
    reauth=False,
    verbose=True,
    private_key=KEY_PATH,
    dialect="standard",  # the query uses standard-SQL syntax, not legacy SQL
)
df_in = pd.read_gbq(query, **gbq_options)

Requesting query... 
Query running...
Job ID: 9066f954-1dd6-40c6-af0a-84e394001b36
  Elapsed 8.19 s. Waiting...
  Elapsed 9.59 s. Waiting...
  Elapsed 10.92 s. Waiting...
  Elapsed 12.26 s. Waiting...
  Elapsed 13.59 s. Waiting...
  Elapsed 14.92 s. Waiting...
  Elapsed 16.25 s. Waiting...
  Elapsed 17.56 s. Waiting...
  Elapsed 18.92 s. Waiting...
  Elapsed 20.24 s. Waiting...
  Elapsed 21.53 s. Waiting...
  Elapsed 22.9 s. Waiting...
  Elapsed 24.24 s. Waiting...
  Elapsed 25.57 s. Waiting...
  Elapsed 26.85 s. Waiting...
  Elapsed 28.22 s. Waiting...
  Elapsed 29.56 s. Waiting...
  Elapsed 30.89 s. Waiting...
  Elapsed 32.03 s. Waiting...
  Elapsed 33.35 s. Waiting...
  Elapsed 34.68 s. Waiting...
  Elapsed 35.98 s. Waiting...
  Elapsed 37.25 s. Waiting...
  Elapsed 38.55 s. Waiting...
  Elapsed 39.9 s. Waiting...
  Elapsed 41.19 s. Waiting...
  Elapsed 42.48 s. Waiting...
  Elapsed 43.8 s. Waiting...
  Elapsed 45.09 s. Waiting...
  Elapsed 46.46 s. Waiting...
  Elapsed 47.73 s. Wai

In [12]:
# (rows, columns) of the extracted frame; each row is one distinct page sequence.
df_in.shape

(5717481, 3)

In [13]:
# Preview the first rows: Occurrences, PageSeq_Length and the ">>"-joined PageSequence.
df_in.head()

Unnamed: 0,Occurrences,PageSeq_Length,PageSequence
0,1,35,/topic/help-british-nationals-overseas>>/topic...
1,1,22,/child-benefit/eligibility>>/child-benefit-tax...
2,1,26,/future-pension-centre>>/check-state-pension>>...
3,1,25,/uk-visa-sponsorship-employers>>/uk-visa-spons...
4,1,44,/bankruptcy/restrictions>>/bankruptcy>>/search...


### Explore occurrences stats

In [16]:
# Summary statistics of how many times each distinct page sequence was observed.
df_in.Occurrences.describe()

count    5.717481e+06
mean     2.554230e+00
std      2.641301e+02
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      4.953760e+05
Name: Occurrences, dtype: float64

In [17]:
# Journeys seen exactly once ("one-off" journeys); build the filter a single time.
is_one_off = df_in["Occurrences"] == 1
one_off_rows = df_in[is_one_off]

# Total occurrences across all journeys, the number of distinct one-off
# journeys, and the occurrences those one-off journeys account for.
all_occ = df_in["Occurrences"].sum()
num_one_off_journeys = one_off_rows.shape[0]
one_off_occ = one_off_rows["Occurrences"].sum()

In [18]:
# Show total occurrences, count of one-off journeys, and occurrences of one-off journeys.
all_occ, num_one_off_journeys, one_off_occ

(14603761, 5129478, 5129478)

In [23]:
# The five most frequently observed page sequences.
df_in.sort_values("Occurrences", ascending=False).head()

Unnamed: 0,Occurrences,PageSeq_Length,PageSequence
609519,495376,2,/government/organisations/companies-house>>/ge...
1168172,229609,2,/universal-credit>>/sign-in-universal-credit
4622735,164249,2,/sign-in-universal-credit>>/sign-in-universal-...
2316097,142012,2,/government/organisations/hm-revenue-customs>>...
54671,94502,2,/check-mot-history>>/check-mot-history


### Add `Page_List` column

In [26]:
# Break every ">>"-delimited journey string into its ordered list of page paths
# and store the lists alongside the original sequence (adds a column to df_in).
page_sequences = df_in['PageSequence'].tolist()
pagelist = [sequence.split(">>") for sequence in page_sequences]
df_in['Page_List'] = pagelist

In [28]:
# Check the new column: each entry is a list of page paths.
df_in['Page_List'].head()

0    [/topic/help-british-nationals-overseas, /topi...
1    [/child-benefit/eligibility, /child-benefit-ta...
2    [/future-pension-centre, /check-state-pension,...
3    [/uk-visa-sponsorship-employers, /uk-visa-spon...
4    [/bankruptcy/restrictions, /bankruptcy, /searc...
Name: Page_List, dtype: object

In [33]:
# Tally views per page: every appearance of a page inside a journey contributes
# that journey's Occurrences to the page's running total.
page_views = {}
for pages, occurrences in zip(df_in['Page_List'], df_in['Occurrences']):
    for page in pages:
        page_views[page] = page_views.get(page, 0) + occurrences

In [39]:
# Number of distinct pages and the total page views summed over all of them.
len(page_views), sum(page_views.values())

(2092293, 69467448)

### Save outputs

In [34]:
# Output locations under $DATA_DIR/raw/bq_journey_extract.
# Index os.environ directly so a missing DATA_DIR fails with a clear KeyError
# rather than an opaque TypeError from os.path.join(None, ...).
bq_dir = os.path.join(os.environ["DATA_DIR"], "raw", "bq_journey_extract")

# Create the directory up front so the (slow to reproduce) extract is not lost
# to a missing-folder error when it is written out.
os.makedirs(bq_dir, exist_ok=True)

# All journeys.
bq_file = os.path.join(bq_dir, "pageseq_user_journey_feb_11_18.csv.gz")
# Only journeys with Occurrences > 1 (the "_doo" file written further down).
bq_file_doo = os.path.join(bq_dir, "pageseq_user_journey_feb_11_18_doo.csv.gz")
# Per-page view totals.
page_views_file = os.path.join(bq_dir, "pageviews_feb_11_18.csv.gz")

In [None]:
# Write every journey to a gzipped, tab-separated file without the index.
# NOTE(review): if Page_List has already been added to df_in it is written too,
# duplicating PageSequence in list form — confirm this is intended.
df_in.to_csv(bq_file, compression="gzip", sep='\t', index=False)

In [None]:
# Write only journeys observed more than once (drops the one-off journeys),
# in the same gzipped, tab-separated format.
df_in[df_in.Occurrences>1].to_csv(bq_file_doo, compression="gzip", sep='\t', index=False)

In [35]:
# Write the per-page view totals as a gzipped, tab-separated file with a header.
# `gzip` is imported in the notebook's import cell rather than mid-notebook.
# Text mode with UTF-8 and newline="\n" produces the same bytes as encoding
# each line by hand, with no platform-specific newline translation.
with gzip.open(page_views_file, "wt", encoding="utf-8", newline="\n") as writer:
    writer.write("page_url\tviews\n")
    writer.writelines(
        "{}\t{}\n".format(page, views) for page, views in page_views.items()
    )