In [1]:
import os 
import pandas as pd
import re
import json

import ast

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)

In [2]:
REQUIRED_COLUMNS = ["Occurrences", "ABVariant", "Page_Event_List",
                    "Page_List",  "Event_cat_act_agg"]

In [3]:
DATA_DIR = os.getenv("DATA_DIR")

In [4]:
loved_pages_df = pd.read_csv(
    os.path.join(DATA_DIR, 'metadata/loved_pages.csv.gz'),
    usecols=['pagePath'])

In [5]:
# dedupe the pagePaths here just in case
loved_page_paths = list(set(loved_pages_df['pagePath'].tolist()))

In [6]:
len(loved_page_paths)

5977

In [7]:
loved_page_paths_set = set(loved_page_paths)

In [8]:
loved_smart_answers_df = pd.read_csv(
    os.path.join(DATA_DIR, 'metadata/loved_smart_answers.csv.gz'),
    usecols=['pagePath'])

In [9]:
loved_smart_answers_df['pagePath'] = loved_smart_answers_df['pagePath'] + '/'

In [10]:
loved_smart_answers = list(set(
    loved_smart_answers_df['pagePath'].tolist()))

the pages come from links in this page https://www.gov.uk/government/organisations/hm-revenue-customs/contact

In [13]:
with open(os.path.join(DATA_DIR, 'metadata/hmrc_contact_pages.json'), "r") as read_file:
    contact_pages = json.load(read_file)

In [14]:
hmrc_contact_pages = [link['base_path'] for link in contact_pages['links']['children']]

In [15]:
hmrc_contact_pages_set = set(hmrc_contact_pages)

In [16]:
def is_loved_page(page):
    return any([re.match('/foreign-travel-advice/',page),
                page in hmrc_contact_pages_set,
                page in loved_page_paths_set,
                page == '/help',
               any([pagepath in page for pagepath in loved_smart_answers])])


In [17]:
def is_loved_journey(page_list):
#     we dedupe each pagelist here
    return any(
        [is_loved_page(page) for page in list(set(page_list))])

In [19]:
df_2019_02_14 = pd.read_csv(
    os.path.join(DATA_DIR, 'processed_journey/taxon_ab_2019-02-18.csv.gz'), sep='\t', 
    usecols=REQUIRED_COLUMNS)

In [20]:
df_2019_02_14['Page_List'] = df_2019_02_14['Page_List'].progress_apply(ast.literal_eval)

100%|██████████| 1626133/1626133 [00:34<00:00, 47676.16it/s]


In [21]:
df_2019_02_14['is_loved_journey'] = df_2019_02_14['Page_List'].progress_apply(is_loved_journey)

100%|██████████| 1626133/1626133 [00:32<00:00, 49625.60it/s]


In [22]:
df_2019_02_14.groupby('is_loved_journey').sum()

Unnamed: 0_level_0,Occurrences
is_loved_journey,Unnamed: 1_level_1
False,1076092
True,3056925


In [70]:
df_2019_02_14[df_2019_02_14['is_loved_journey']==False].to_csv(
    '../data/processed_journey/unloved_2019-02-14.csv.gz', 
    sep="\t", compression="gzip", index=False)

In [72]:
df_2019_02_14[df_2019_02_14['is_loved_journey']==True].to_csv(
    '../data/processed_journey/loved_2019-02-14.csv.gz', 
    sep="\t", compression="gzip", index=False)

In [23]:
def split_daily_data(file_prefix):
    df = pd.read_csv(
        f'../data/processed_journey/taxon_ab_{file_prefix}.csv.gz', sep='\t', 
        usecols=REQUIRED_COLUMNS)
    df['Page_List'] = df['Page_List'].progress_apply(
        ast.literal_eval)
    df['is_loved_journey'] = df['Page_List'].progress_apply(is_loved_journey)
    print("Number of occurences of journeys of this type")
    print(df.groupby('is_loved_journey').sum())
    df[df['is_loved_journey']==False].to_csv(
        f'../data/processed_journey/unloved_{file_prefix}.csv.gz', 
        sep="\t", compression="gzip", index=False)
    df[df['is_loved_journey']==True].to_csv(
        f'../data/processed_journey/loved_{file_prefix}.csv.gz', 
        sep="\t", compression="gzip", index=False)

In [74]:
split_daily_data('2019-02-15')


  0%|          | 0/1294250 [00:00<?, ?it/s][A
  1%|          | 6521/1294250 [00:00<00:19, 65171.99it/s][A
  1%|          | 11877/1294250 [00:00<00:20, 61178.82it/s][A
  1%|▏         | 17023/1294250 [00:00<00:22, 57895.28it/s][A
  2%|▏         | 22244/1294250 [00:00<00:22, 56063.26it/s][A
  2%|▏         | 27421/1294250 [00:00<00:23, 54701.48it/s][A
  3%|▎         | 32552/1294250 [00:00<00:23, 53636.37it/s][A
  3%|▎         | 37596/1294250 [00:00<00:23, 52634.51it/s][A
  3%|▎         | 42875/1294250 [00:00<00:23, 52679.35it/s][A
  4%|▎         | 48437/1294250 [00:00<00:23, 53527.13it/s][A
  4%|▍         | 53810/1294250 [00:01<00:23, 53584.48it/s][A
  5%|▍         | 58998/1294250 [00:01<00:23, 52378.77it/s][A
  5%|▍         | 64123/1294250 [00:01<00:23, 51949.82it/s][A
  5%|▌         | 69240/1294250 [00:01<00:23, 51671.87it/s][A
  6%|▌         | 74353/1294250 [00:01<00:23, 51324.62it/s][A
  6%|▌         | 79448/1294250 [00:01<00:23, 51084.41it/s][A
  7%|▋         | 84531/

                  Occurrences
is_loved_journey             
False                  866260
True                  2452930


In [75]:
split_daily_data('2019-02-16')


  0%|          | 0/810323 [00:00<?, ?it/s][A
  1%|          | 4842/810323 [00:00<00:16, 48419.01it/s][A
  1%|▏         | 10137/810323 [00:00<00:16, 49694.28it/s][A
  2%|▏         | 15437/810323 [00:00<00:15, 50640.57it/s][A
  3%|▎         | 20784/810323 [00:00<00:15, 51457.15it/s][A
  3%|▎         | 25811/810323 [00:00<00:15, 51093.76it/s][A
  4%|▍         | 31268/810323 [00:00<00:14, 52088.67it/s][A
  5%|▍         | 36679/810323 [00:00<00:14, 52677.91it/s][A
  5%|▌         | 41930/810323 [00:00<00:14, 52626.16it/s][A
  6%|▌         | 47419/810323 [00:00<00:14, 53284.21it/s][A
  7%|▋         | 52967/810323 [00:01<00:14, 53922.58it/s][A
  7%|▋         | 58495/810323 [00:01<00:13, 54321.51it/s][A
  8%|▊         | 63837/810323 [00:01<00:13, 53789.59it/s][A
  9%|▊         | 69357/810323 [00:01<00:13, 54204.95it/s][A
  9%|▉         | 74794/810323 [00:01<00:13, 54254.18it/s][A
 10%|▉         | 80190/810323 [00:01<00:13, 53875.34it/s][A
 11%|█         | 85558/810323 [00:01<00

                  Occurrences
is_loved_journey             
False                  442807
True                  1601995


In [76]:
split_daily_data('2019-02-17')


  0%|          | 0/849131 [00:00<?, ?it/s][A
  1%|          | 4516/849131 [00:00<00:18, 45143.79it/s][A
  1%|          | 9643/849131 [00:00<00:17, 46821.55it/s][A
  2%|▏         | 14984/849131 [00:00<00:17, 48620.62it/s][A
  2%|▏         | 20177/849131 [00:00<00:16, 49567.38it/s][A
  3%|▎         | 24936/849131 [00:00<00:16, 48953.64it/s][A
  4%|▎         | 30226/849131 [00:00<00:16, 50073.65it/s][A
  4%|▍         | 35549/849131 [00:00<00:15, 50980.24it/s][A
  5%|▍         | 40707/849131 [00:00<00:15, 51158.38it/s][A
  5%|▌         | 46033/849131 [00:00<00:15, 51770.22it/s][A
  6%|▌         | 51458/849131 [00:01<00:15, 52489.23it/s][A
  7%|▋         | 56790/849131 [00:01<00:15, 52731.31it/s][A
  7%|▋         | 62105/849131 [00:01<00:14, 52855.63it/s][A
  8%|▊         | 67394/849131 [00:01<00:14, 52864.93it/s][A
  9%|▊         | 72660/849131 [00:01<00:14, 52801.22it/s][A
  9%|▉         | 77989/849131 [00:01<00:14, 52946.47it/s][A
 10%|▉         | 83273/849131 [00:01<00:

                  Occurrences
is_loved_journey             
False                  445598
True                  1626468


In [77]:
split_daily_data('2019-02-18')


  0%|          | 0/1626133 [00:00<?, ?it/s][A
  0%|          | 5827/1626133 [00:00<00:27, 58269.36it/s][A
  1%|          | 11596/1626133 [00:00<00:27, 58092.01it/s][A
  1%|          | 16430/1626133 [00:00<00:29, 54774.81it/s][A
  1%|▏         | 21548/1626133 [00:00<00:29, 53642.77it/s][A
  2%|▏         | 26837/1626133 [00:00<00:29, 53414.02it/s][A
  2%|▏         | 31980/1626133 [00:00<00:30, 52794.64it/s][A
  2%|▏         | 36872/1626133 [00:00<00:30, 51562.63it/s][A
  3%|▎         | 42123/1626133 [00:00<00:30, 51842.77it/s][A
  3%|▎         | 47264/1626133 [00:00<00:30, 51708.88it/s][A
  3%|▎         | 52436/1626133 [00:01<00:30, 51708.68it/s][A
  4%|▎         | 57449/1626133 [00:01<00:30, 50828.33it/s][A
  4%|▍         | 62445/1626133 [00:01<00:30, 50564.51it/s][A
  4%|▍         | 67430/1626133 [00:01<00:30, 50347.59it/s][A
  4%|▍         | 72413/1626133 [00:01<00:31, 50031.99it/s][A
  5%|▍         | 77430/1626133 [00:01<00:30, 50073.29it/s][A
  5%|▌         | 82413/

                  Occurrences
is_loved_journey             
False                 1076092
True                  3056925


do some analysis here so it will run after the stuff above

In [78]:
import numpy as np


In [29]:
unloved_list = ['../data/processed_journey/unloved_2019-02-14.csv.gz',
               '../data/processed_journey/unloved_2019-02-15.csv.gz',
               '../data/processed_journey/unloved_2019-02-16.csv.gz',
               '../data/processed_journey/unloved_2019-02-17.csv.gz',
               '../data/processed_journey/unloved_2019-02-18.csv.gz']

In [30]:
# read in processed sampled journey with just the cols we need for related links
unloved_df = pd.concat(
    [pd.read_csv(
        filepath, sep ="\t", compression="gzip"
    ) for filepath in unloved_list])


In [105]:
unloved_df['Event_cat_act_agg']= unloved_df['Event_cat_act_agg'].progress_apply(
    ast.literal_eval)




  0%|          | 0/1506675 [00:00<?, ?it/s][A[A

  0%|          | 2271/1506675 [00:00<01:06, 22700.66it/s][A[A

  0%|          | 4739/1506675 [00:00<01:04, 23260.25it/s][A[A

  0%|          | 6883/1506675 [00:00<01:06, 22680.28it/s][A[A

  1%|          | 9059/1506675 [00:00<01:06, 22390.88it/s][A[A

  1%|          | 11271/1506675 [00:00<01:07, 22305.48it/s][A[A

  1%|          | 13124/1506675 [00:00<01:11, 20750.89it/s][A[A

  1%|          | 14957/1506675 [00:00<01:15, 19703.85it/s][A[A

  1%|          | 16835/1506675 [00:00<01:16, 19416.08it/s][A[A

  1%|          | 18667/1506675 [00:00<01:18, 18959.14it/s][A[A

  1%|▏         | 20688/1506675 [00:01<01:16, 19316.94it/s][A[A

  2%|▏         | 22633/1506675 [00:01<01:16, 19352.11it/s][A[A

  2%|▏         | 24533/1506675 [00:01<01:17, 19165.05it/s][A[A

  2%|▏         | 26457/1506675 [00:01<01:17, 19186.10it/s][A[A

  2%|▏         | 28467/1506675 [00:01<01:15, 19451.02it/s][A[A

  2%|▏         | 30439/1506

In [53]:
df_all_list = ['processed_journey/taxon_ab_2019-02-14.csv.gz',
               'processed_journey/taxon_ab_2019-02-15.csv.gz',
               'processed_journey/taxon_ab_2019-02-16.csv.gz',
               'processed_journey/taxon_ab_2019-02-17.csv.gz',
               'processed_journey/taxon_ab_2019-02-18.csv.gz']

In [54]:
df_all = pd.concat([
    pd.read_csv(os.path.join(DATA_DIR, file), sep ="\t", compression="gzip", usecols=REQUIRED_COLUMNS)
    for file in df_all_list])

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [34]:
def get_number_of_events_rl(event):
    """Counts events with category 'relatedLinkClicked' and action'Related content'."""
    if event[0][0] == 'relatedLinkClicked' and 'Related content' in event[0][1]:
        return event[1]
    return 0


def sum_related_click_events(event_list):
    return sum([get_number_of_events_rl(event) for event in event_list])


def is_related(x):
    """Compute whether a journey includes at least one related link click."""
    return x > 0

In [55]:
df_18['Event_cat_act_agg'] = df_18['Event_cat_act_agg'].apply(ast.literal_eval)

In [56]:
# get the number of related links clicks per Sequence
df_18['Related Links Clicks per seq'] = df_18['Event_cat_act_agg'].map(sum_related_click_events)

In [59]:
# map across the Sequence variable, which includes pages and Events
# we want to pass all the list elements to a function one-by-one and then collect the output.
df_18["Has_Related"] = df_18["Related Links Clicks per seq"].map(is_related)

# df_all['Related Links Clicks row total'] = df_all['Related Links Clicks per seq'] * df_all['Occurrences']


In [45]:
df_18['Page_List'] = df_18['Page_List'].apply(ast.literal_eval)

In [48]:
df_18['primary'] = df_18['Page_List'].apply(lambda x: '/government/statistics/primary-school-performance-tables-2018' in x)

In [60]:
df_18[(df_18['primary']) & (df_18['Has_Related'])]

Unnamed: 0,Occurrences,ABVariant,Page_Event_List,Page_List,Event_cat_act_agg,Related Links Clicks per seq,Has_Related,Related Links Clicks row total,primary
281035,1,B,[('/government/statistics/key-stage-2-and-mult...,[/government/statistics/key-stage-2-and-multi-...,"[((PAGE_NULL, PAGE_NULL), 18), ((relatedLinkCl...",1,True,0,True
402158,1,B,[('/government/statistics/secondary-school-per...,[/government/statistics/secondary-school-perfo...,"[((PAGE_NULL, PAGE_NULL), 6), ((relatedLinkCli...",2,True,0,True
1049205,1,B,[('/government/statistics/primary-school-perfo...,[/government/statistics/primary-school-perform...,"[((PAGE_NULL, PAGE_NULL), 2), ((relatedLinkCli...",1,True,0,True
1292185,1,B,[('/government/statistics/primary-school-perfo...,[/government/statistics/primary-school-perform...,"[((PAGE_NULL, PAGE_NULL), 4), ((relatedLinkCli...",2,True,0,True
1477932,1,A,[('/government/statistics/primary-school-perfo...,[/government/statistics/primary-school-perform...,"[((PAGE_NULL, PAGE_NULL), 2), ((relatedLinkCli...",1,True,0,True
1557504,1,B,"[('/', 'PAGE<:<NULL<:<NULL', 'other'), ('/sear...","[/, /search?q=key+stage+2, /government/collect...","[((PAGE_NULL, PAGE_NULL), 10), ((searchResults...",1,True,0,True


In [61]:
df_18['Page_List'][1477932]

['/government/statistics/primary-school-performance-tables-2018',
 '/government/statistics/primary-school-performance-tables-2017']

In [62]:
df_18['Page_Event_List'][1477932]

"[('/government/statistics/primary-school-performance-tables-2018', 'PAGE<:<NULL<:<NULL', 'f0dcb0b9-2f2c-46b0-adea-42a4fe608170'), ('/government/statistics/primary-school-performance-tables-2018', 'EVENT<:<relatedLinkClicked<:<1.1 Related content', 'f0dcb0b9-2f2c-46b0-adea-42a4fe608170'), ('/government/statistics/primary-school-performance-tables-2017', 'PAGE<:<NULL<:<NULL', 'f0dcb0b9-2f2c-46b0-adea-42a4fe608170')]"

In [32]:
# get the number of related links clicks per Sequence
unloved_df['Related Links Clicks per seq'] = unloved_df['Event_cat_act_agg'].map(sum_related_click_events)

KeyboardInterrupt: 

In [None]:
# map across the Sequence variable, which includes pages and Events
# we want to pass all the list elements to a function one-by-one and then collect the output.
unloved_df["Has_Related"] = unloved_df["Related Links Clicks per seq"].map(is_related)

unloved_df['Related Links Clicks row total'] = unloved_df['Related Links Clicks per seq'] * unloved_df['Occurrences']


In [None]:
unloved_df.reset_index(drop=True, inplace=True)

In [None]:
unloved_df[unloved_df['']]

In [110]:
# look at total number o fjourneys and percent with rl click
n = unloved_df.Occurrences.sum()
p = unloved_df[unloved_df.Has_Related == 1].Occurrences.sum() / n
print(n)
print(p)

3771859
0.006247582425536055


In [111]:
df = unloved_df

In [112]:
df.groupby('ABVariant').sum()

Unnamed: 0_level_0,Occurrences,is_loved_journey,Related Links Clicks per seq,Has_Related,Related Links Clicks row total
ABVariant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1905727,0.0,802,608.0,818
B,1866132,0.0,30546,22225.0,31266


In [113]:

# A
# number of trials for page A
n_a = df[df.ABVariant == "A"].Occurrences.sum()
# number of successes (occurrences), for page A and at least one related link clicked journeys
x_a = df[(df['ABVariant'] == 'A') & (df['Has_Related'] == 1)].Occurrences.sum()
# prop of journeys where one related link was clicked, on A
p_a = x_a / n_a

# B
# number of trials for page B
n_b = df[df.ABVariant == "B"].Occurrences.sum()
# number of successes for page B, at least one related link clicked
x_b = df[(df['ABVariant'] == 'B') & (df['Has_Related'] == 1)].Occurrences.sum()
# prop of journeys where one related link was clicked, on B
p_b = x_b / n_b

assert (n == n_a + n_b), "Error in filtering by ABVariant!"

# validate assumptions
# The formula of z-statistic is valid only when sample size (n) is large enough.
# nAp, nAq, nBp and nBq should be ≥ 5.
# where p is probability of success (we can use current baseline)
# q = 1 - p

# tried a helper function here but it didn't work hence not DRY
assert (n_a*p) >= 5, "Assumptions for z prop test invalid!"
assert (n_a*(1-p)) >= 5, "Assumptions for z prop test invalid!"

assert (n_b*p) >= 5, "Assumptions for z prop test invalid!"
assert (n_b*(1-p)) >= 5, "Assumptions for z prop test invalid!"

In [114]:
#a variant
print(n_a)
print(x_a)
print(p_a)

1905727
624
0.000327434097328736


In [115]:
#b variant
print(n_b)
print(x_b)
print(p_b)

1866132
22941
0.01229334259312846


some journeys in the A variant still have related link clicked events, when we should not have any journeys that have related links in this "unloved" segment, so let's look at some examples where a related link click has been reported

In [139]:
df[(df['ABVariant'] == 'A') & (df['Has_Related'] == 1)]['Page_List']




8644       ['/government/publications/cic53-application-t...
12496                                                     []
13371                                                     []
14176      ['/government/publications/genuine-hmrc-contac...
16829                                                     []
19260                                                     []
22175      ['/government/publications/application-to-sett...
26298                                                     []
26904      ['/mot-tester-training-assessments', '/mot-tes...
26966                                                     []
27632                                                     []
28820                                                     []
29933                                                     []
31063      ['/browse/visas-immigration/tourist-short-stay...
32856                                                     []
32925              ['/browse/benefits/jobseekers-allowance']
33527                   

look at the Page_List and Page_Event_list fields for these unexpected rows

In [140]:
print(df['Page_List'][1468336])

print(df['Page_Event_List'][1468336])
# looks like there was some flipping between A and B here as there are no 
# related lnks in the A variant for /government/statistics/primary-school-performance-tables-2018

['/government/statistics/primary-school-performance-tables-2018', '/government/statistics/primary-school-performance-tables-2017']
[('/government/statistics/primary-school-performance-tables-2018', 'PAGE<:<NULL<:<NULL', 'f0dcb0b9-2f2c-46b0-adea-42a4fe608170'), ('/government/statistics/primary-school-performance-tables-2018', 'EVENT<:<relatedLinkClicked<:<1.1 Related content', 'f0dcb0b9-2f2c-46b0-adea-42a4fe608170'), ('/government/statistics/primary-school-performance-tables-2017', 'PAGE<:<NULL<:<NULL', 'f0dcb0b9-2f2c-46b0-adea-42a4fe608170')]


In [141]:
is_loved_page('/government/statistics/primary-school-performance-tables-2018')

False

In [142]:
print(df['Page_List'][26904])

print(df['Page_Event_List'][26904])
# we need
# /mot-testing-service?source=mot-testing.i-env.net need to strip out the 
# source param to work out this page has related links

['/mot-tester-training-assessments', '/mot-tester-training-assessments/training', '/topic/mot/manuals', '/mot-tester-training-assessments/training', '/government/publications/alternative-fuel-vehicles-guidance-for-mot-testers/hybrid-electric-and-hydrogen-fuel-cell-systems-guidance-for-mot-testers', '/mot-tester-training-assessments/training', '/government/publications/mot-test-quality-information-guidance-for-mot-testers-and-managers', '/government/publications/mot-test-quality-information-guidance-for-mot-testers-and-managers/using-mot-test-quality-information-guidance-for-mot-testers', '/mot-testing-service?source=mot-testing.i-env.net', '/mot-testing-service?source=mot-testing.i-env.net', '/mot-tester-training-assessments', '/mot-tester-training-assessments/training', '/government/publications/mot-test-quality-information-guidance-for-mot-testers-and-managers', '/government/publications/mot-test-quality-information-guidance-for-mot-testers-and-managers/using-mot-test-quality-informa

In [143]:
is_loved_page('/mot-testing-service?source=mot-testing.i-env.net')

False

In [146]:
print(df['Page_List'][1459919])

print(df['Page_Event_List'][1459919])
# https://www.gov.uk/premises-licence/north-somerset all pages starting with 
# /premises-licence have the same content ID, and have related links at the 
# moment, so we should count any pagePath beginning with /premises-licence/ as loved

['/premises-licence/north-somerset', '/personal-licence-to-sell-alcohol', '/personal-licence-to-sell-alcohol/north-somerset', '/find-local-council', '/find-local-council/north-somerset']
[('/premises-licence/north-somerset', 'PAGE<:<NULL<:<NULL', 'da0bc015-f8e5-492c-8d81-6fbf9b18947c,1327984f-95e0-4ca7-94c7-c63e69c30924'), ('/premises-licence/north-somerset', 'EVENT<:<relatedLinkClicked<:<1.4 Related content', 'da0bc015-f8e5-492c-8d81-6fbf9b18947c,1327984f-95e0-4ca7-94c7-c63e69c30924'), ('/personal-licence-to-sell-alcohol', 'PAGE<:<NULL<:<NULL', 'da0bc015-f8e5-492c-8d81-6fbf9b18947c'), ('/personal-licence-to-sell-alcohol', 'EVENT<:<postcodeSearch:licence<:<postcodeSearchStarted', 'da0bc015-f8e5-492c-8d81-6fbf9b18947c'), ('/personal-licence-to-sell-alcohol/north-somerset', 'PAGE<:<NULL<:<NULL', 'da0bc015-f8e5-492c-8d81-6fbf9b18947c'), ('/find-local-council', 'PAGE<:<NULL<:<NULL', '15191831-40b4-4f6e-ade3-e82d7c775afd'), ('/find-local-council', 'EVENT<:<postcodeSearch:find_local_council<

In [147]:
is_loved_page('/premises-licence/north-somerset')

False

In [148]:
is_loved_page('/premises-licence')

True

In [149]:
print(df['Page_List'][1482052])

print(df['Page_Event_List'][1482052])
# related link click event on the page /trusts-taxes, this appears in
# Page_Event_List but not Page_List, so maybe we should be checking both to
# check for loved/unloved -if no sign of related links in Page_List, check 
# Page_Event_List to mop up these edge cases

['/browse/tax/inheritance-tax']
[('/browse/tax', 'EVENT<:<secondLevelBrowseLinkClicked<:<5', 'other'), ('/browse/tax/inheritance-tax', 'PAGE<:<NULL<:<NULL', 'other'), ('/browse/tax', 'EVENT<:<thirdLevelBrowseLinkClicked<:<1.8', 'other'), ('/trusts-taxes', 'EVENT<:<relatedLinkClicked<:<1.5 Related content', '3bc4ec93-fd86-4c66-98d0-7623cbbaa6be,104ee859-8278-406b-80cb-5727373e0198,1f3d1ae8-aba7-4e02-abbc-6b54e6ff66aa,41d3523d-34c4-445f-80c8-d429663cb184'), ('/wills-probate-inheritance', 'EVENT<:<relatedLinkClicked<:<1.2 Related content', '3bc4ec93-fd86-4c66-98d0-7623cbbaa6be,0fffa994-b76d-4539-8bf9-2a6c6751580d,4b118ae1-d783-47a8-bdec-905a1bf2ca9e'), ('/make-will', 'EVENT<:<relatedLinkClicked<:<1.2 Related content', '0fffa994-b76d-4539-8bf9-2a6c6751580d'), ('/inheritance-tax', 'EVENT<:<relatedLinkClicked<:<1.2 Related content', '0fffa994-b76d-4539-8bf9-2a6c6751580d,1f3d1ae8-aba7-4e02-abbc-6b54e6ff66aa'), ('/trusts-taxes', 'EVENT<:<contentsClicked<:<content_item 8', '3bc4ec93-fd86-4c66-9

In [151]:
df['Page_Event_List'] = df['Page_Event_List'].progress_apply(
    ast.literal_eval)

In [124]:
[page[0] for page in df['Page_Event_List'][8644]]

['/government/publications/form-cic37-application-to-convert-a-company-to-a-cic',
 '/government/publications/form-cic37-application-to-convert-a-company-to-a-cic',
 '/government/publications/form-cic37-application-to-convert-a-company-to-a-cic',
 '/government/publications/form-cic37-application-to-convert-a-company-to-a-cic',
 '/government/publications/form-cic37-application-to-convert-a-company-to-a-cic',
 '/government/publications/form-cic37-application-to-convert-a-company-to-a-cic',
 '/government/publications/form-cic37-application-to-convert-a-company-to-a-cic',
 '/government/publications/form-cic37-application-to-convert-a-company-to-a-cic',
 '/government/publications/form-cic37-application-to-convert-a-company-to-a-cic',
 '/government/publications/cic53-application-to-transfer-assets',
 '/government/publications/cic53-application-to-transfer-assets',
 '/government/publications/community-interest-companies-business-activities',
 '/government/publications/community-interest-compan

In [203]:
df['Page_List'] = df['Page_List'].apply(ast.literal_eval)

In [234]:
# def is_loved_page(page):
#     return any([re.match('/foreign-travel-advice/',page),
#                 page in hmrc_contact_pages_set,
#                 page in loved_page_paths_set,
#                 page == '/help',
#                any([pagepath in page for pagepath in loved_smart_answers])])

def is_loved_page_2(page):
    return any([
        re.match('/premises-licence/',page),
                is_loved_page(page.split('?')[0])])

def is_loved_page_2b(page):
    return any([re.match('/foreign-travel-advice/',page),
                page in hmrc_contact_pages_set,
                page in loved_page_paths_set,
                page == '/help',
                re.match('/premises-licence/',page),
                is_loved_page(page.split('?')[0]),
               any([pagepath in page for pagepath in loved_smart_answers])])

def is_loved_page_event_list_2(page_event_list):
    return any([is_loved_page_2b(triple[0]) for triple in page_event_list])

In [213]:
def is_loved_journey_step_2_df(df, target_column):
#     def row_fn(row, target_column):
#         print(row['Page_List'])
    def is_loved_journey_step_2(row, row_accumulator, target_column):
        new_row = row.to_dict()            
        if any([
            any([is_loved_page_2(page) for page in row['Page_List']]), 
                is_loved_page_event_list_2(row['Page_Event_List'])]):
            new_row[target_column] = True
        else:
            new_row[target_column] = False
        row_accumulator.append(new_row)
    
    new_rows = []    
    df.apply(is_loved_journey_step_2,axis=1,args=(new_rows, target_column))
#     df.apply(row_fn,axis=1, args=(target_column))
    new_df = pd.DataFrame(new_rows)
    return new_df

In [214]:
is_loved_journey_step_2_result_df = is_loved_journey_step_2_df(df, 'is_loved_2')

In [95]:
df[(df['ABVariant'] == 'A') & (df['Has_Related'] == 1)]['Page_List']


True

In [219]:
is_loved_journey_step_2_result_df[
    (is_loved_journey_step_2_result_df['ABVariant'] == 'A') & 
    (is_loved_journey_step_2_result_df['Has_Related'] == 1) &
    (is_loved_journey_step_2_result_df['is_loved_2'] == False)]['Page_List']

8644       [/government/publications/cic53-application-to...
34745      [/government/publications/form-n460-reasons-fo...
44418      [/hunting/birds, /shotgun-and-firearm-certific...
96899                                                     []
97661      [/government/publications/form-n5-claim-form-f...
131008     [/government/publications/national-insurance-s...
142673     [/guidance/the-g-cloud-framework-on-the-digita...
161840     [/government/publications/stamp-duty-land-tax-...
179332     [/guidance/european-temporary-leave-to-remain-...
185789     [/government/publications/heavy-good-vehicle-d...
223507     [/government/publications/construction-industr...
229671     [/government/organisations/department-for-busi...
231126     [/guidance/zika-virus-travel-advice, /guidance...
238660     [/government/organisations/hm-passport-office,...
247801     [/government/publications/upper-tribunal-tax-a...
250217     [/government/publications/hmrc-exchange-rates-...
253103     [/government/

In [224]:
is_loved_journey_step_2_result_df['Page_Event_List'][8644]
# /government/publications/form-cic37-application-to-convert-a-company-to-a-cic
# flipping between A and B?

In [227]:
print(is_loved_page('/hunting/birds'))
print(is_loved_page('/hunting/Birds'))
# case sensitivity

is_loved_journey_step_2_result_df['Page_Event_List'][44418]
# /hunting/birds, but the slug is Birds

False
True


[('/hunting/birds',
  'PAGE<:<NULL<:<NULL',
  '8953ca7a-ed45-49a9-93af-87d55e69f910,695dc6f2-27cc-4d8b-adc4-8d07c2bff748'),
 ('/hunting/birds',
  'EVENT<:<relatedLinkClicked<:<1.1 Related content',
  '8953ca7a-ed45-49a9-93af-87d55e69f910,695dc6f2-27cc-4d8b-adc4-8d07c2bff748'),
 ('/shotgun-and-firearm-certificates',
  'PAGE<:<NULL<:<NULL',
  '495afdb6-47be-4df1-8b38-91c8adb1eefc'),
 ('/shotgun-and-firearm-certificates',
  'EVENT<:<External Link Clicked<:<http://www.police.uk/?view=force_sites',
  '495afdb6-47be-4df1-8b38-91c8adb1eefc'),
 ('/shotgun-and-firearm-certificates',
  'PAGE<:<NULL<:<NULL',
  '495afdb6-47be-4df1-8b38-91c8adb1eefc'),
 ('/hunting/birds',
  'PAGE<:<NULL<:<NULL',
  '8953ca7a-ed45-49a9-93af-87d55e69f910,695dc6f2-27cc-4d8b-adc4-8d07c2bff748')]

In [228]:
is_loved_journey_step_2_result_df['Page_Event_List'][34745]
# /government/publications/form-n460-reasons-for-allowing-or-refusing-permission-to-appeal-including-referral-to-the-court-of-appeal-civil-division-and-information-concern
# flipping between A and B?

[('/government/publications/form-n460-reasons-for-allowing-or-refusing-permission-to-appeal-including-referral-to-the-court-of-appeal-civil-division-and-information-concern',
  'PAGE<:<NULL<:<NULL',
  'ba951b09-5146-43be-87af-44075eac3ae9'),
 ('/government/publications/form-n460-reasons-for-allowing-or-refusing-permission-to-appeal-including-referral-to-the-court-of-appeal-civil-division-and-information-concern',
  'EVENT<:<relatedLinkClicked<:<1.1 Related content',
  'ba951b09-5146-43be-87af-44075eac3ae9'),
 ('/government/publications/form-n460hc-reasons-for-allowing-or-refusing-permission-to-appeal-and-information-concerning-routes-of-appeal',
  'EVENT<:<user_satisfaction_survey<:<banner_shown',
  'ba951b09-5146-43be-87af-44075eac3ae9'),
 ('/government/publications/form-n460hc-reasons-for-allowing-or-refusing-permission-to-appeal-and-information-concerning-routes-of-appeal',
  'PAGE<:<NULL<:<NULL',
  'ba951b09-5146-43be-87af-44075eac3ae9'),
 ('/government/publications/form-n460hc-rea

In [229]:
is_loved_journey_step_2_result_df['Page_Event_List'][415910]
# /government/publications/application-to-settle-in-the-uk-form-setm
# flipping?

[('/uk-family-visa',
  'PAGE<:<NULL<:<NULL',
  'd612c61e-22f4-4922-8bb2-b04b9202126e'),
 ('/search?q=spouse+partner+leave+to+remain', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/search?q=spouse+partner+leave+to+remain',
  'EVENT<:<searchResults<:<resultsShown',
  'other'),
 ('/search?q=spouse+partner+leave+to+remain', 'EVENT<:<UX<:<click', 'other'),
 ('/government/publications/application-to-settle-in-the-uk-form-setm',
  'PAGE<:<NULL<:<NULL',
  '7f3e73e4-f20d-43be-9c80-021f2ac1897f'),
 ('/government/publications/application-to-settle-in-the-uk-form-setm',
  'EVENT<:<relatedLinkClicked<:<1.2 Related content',
  '7f3e73e4-f20d-43be-9c80-021f2ac1897f'),
 ('/government/publications/apply-to-extend-stay-in-the-uk-as-a-partner-or-dependent-child-form-flrm',
  'EVENT<:<user_satisfaction_survey<:<banner_shown',
  'd612c61e-22f4-4922-8bb2-b04b9202126e,ccb40a29-34ca-4a4c-b4ea-a78279047774'),
 ('/government/publications/apply-to-extend-stay-in-the-uk-as-a-partner-or-dependent-child-form-flrm',
  'PAGE<:

In [230]:
is_loved_journey_step_2_result_df['Page_Event_List'][385457]
# /view-driving-licence/verify, base_path = /view-driving-licence, details.variants.slug = verify
# not yet looking at slugs in variants field

[('/view-driving-licence/verify',
  'PAGE<:<NULL<:<NULL',
  '43994d58-a38e-4f9b-9027-feba44453173'),
 ('/view-driving-licence/verify',
  'EVENT<:<External Link Clicked<:<https://www.viewdrivingrecord.service.gov.uk/verify/start',
  '43994d58-a38e-4f9b-9027-feba44453173'),
 ('/view-driving-licence/verify',
  'PAGE<:<NULL<:<NULL',
  '43994d58-a38e-4f9b-9027-feba44453173'),
 ('/view-driving-licence/verify',
  'EVENT<:<relatedLinkClicked<:<1.4 Related content',
  '43994d58-a38e-4f9b-9027-feba44453173'),
 ('/add-driving-licence-check-code',
  'EVENT<:<user_satisfaction_survey<:<banner_shown',
  'other'),
 ('/add-driving-licence-check-code', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/add-driving-licence-check-code',
  'EVENT<:<External Link Clicked<:<https://www.add-driving-licence-check-code.service.gov.uk/digital/hold-licence',
  'other')]

In [231]:
is_loved_journey_step_2_result_df['Page_Event_List'][1224055]
# /help/terms-conditions to include
# /help/about-govuk
# /help/accessibility
# /help/privacy-policy
# /help/cookies
# /help/update-email-notifications
# /help/browsers
# /help/beta

[('/contact', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/search?q=head+office', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/search?q=head+office', 'EVENT<:<searchResults<:<resultsShown', 'other'),
 ('/contact', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/help/terms-conditions', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/help/terms-conditions',
  'EVENT<:<relatedLinkClicked<:<1.2 Related content',
  'other'),
 ('/help/about-govuk', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/help/about-govuk', 'EVENT<:<homeLinkClicked<:<homeHeader', 'other'),
 ('/', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/search?q=head+office', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/search?q=head+office', 'EVENT<:<searchResults<:<resultsShown', 'other'),
 ('/search?q=head+office', 'EVENT<:<UX<:<click', 'other'),
 ('/government/publications/head-office-account',
  'PAGE<:<NULL<:<NULL',
  'e491505c-77ae-45b2-84be-8c94b94f6a2b,8a98b827-82ad-49b4-819e-82c208c551c4,ccb77bcc-56b4-419a-b5ce-f7c2234e0546')]

In [232]:
is_loved_journey_step_2_result_df['Page_Event_List'][1089932]
# pagepath begins with /find-local-council/ (can have any council after it)

[('/find-local-council',
  'PAGE<:<NULL<:<NULL',
  '15191831-40b4-4f6e-ade3-e82d7c775afd'),
 ('/find-local-council',
  'EVENT<:<postcodeSearch:find_local_council<:<postcodeSearchStarted',
  '15191831-40b4-4f6e-ade3-e82d7c775afd'),
 ('/find-local-council/sedgemoor',
  'PAGE<:<NULL<:<NULL',
  '15191831-40b4-4f6e-ade3-e82d7c775afd'),
 ('/find-local-council/sedgemoor',
  'EVENT<:<postcodeSearch:find_local_council<:<postcodeResultShown',
  '15191831-40b4-4f6e-ade3-e82d7c775afd'),
 ('/find-local-council',
  'PAGE<:<NULL<:<NULL',
  '15191831-40b4-4f6e-ade3-e82d7c775afd'),
 ('/find-local-council',
  'EVENT<:<relatedLinkClicked<:<1.1 Related content',
  '15191831-40b4-4f6e-ade3-e82d7c775afd')]

In [233]:
is_loved_journey_step_2_result_df['Page_Event_List'][1397087]


[('/contact/govuk/anonymous-feedback/thankyou', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/help/cookies', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/help/cookies',
  'EVENT<:<relatedLinkClicked<:<1.1 Related content',
  'other'),
 ('/help/about-govuk', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/help/cookies', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/contact/govuk/anonymous-feedback/thankyou', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/contact/govuk/anonymous-feedback/thankyou', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/search?q=', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/contact/govuk/anonymous-feedback/thankyou', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/browse/benefits', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/contact/govuk/anonymous-feedback/thankyou', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/', 'PAGE<:<NULL<:<NULL', 'other'),
 ('/personal-tax-account',
  'PAGE<:<NULL<:<NULL',
  'b20215a9-25fb-4fa6-80a3-42e23f5352c2,104ee859-

In [240]:
    '/hunting/birds' in [page.lower() for page in loved_page_paths_set]

True

In [244]:
# def is_loved_page_3(page):
#     return any([re.match('/foreign-travel-advice/',page),
#                 page in hmrc_contact_pages_set,
#                 page in loved_page_paths_set,
#                 page == '/help',
#                 re.match('/premises-licence/',page),
#                 is_loved_page(page.split('?')[0]),
#                any([pagepath in page for pagepath in loved_smart_answers])])

def is_loved_page_3(page):
    return any([page in ['/help/terms-conditions', '/help/about-govuk',
                         '/help/accessibility', '/help/privacy-policy',
                         '/help/cookies', '/help/update-email-notifications',
                         '/help/browsers', '/help/beta'],
                re.match('/find-local-council/',page),
                page in [page.lower() for page in loved_page_paths_set],
               ])


def is_loved_page_event_list_3(page_event_list):
    return any([is_loved_page_3(triple[0]) for triple in page_event_list])

In [245]:
def is_loved_journey_step_3_df(df, target_column):
#     def row_fn(row, target_column):
#         print(row['Page_List'])
    def is_loved_journey_step_3(row, row_accumulator, target_column):
        new_row = row.to_dict()            
        if any([
            any([is_loved_page_3(page) for page in row['Page_List']]), 
                is_loved_page_event_list_3(row['Page_Event_List'])]):
            new_row[target_column] = True
        else:
            new_row[target_column] = False
        row_accumulator.append(new_row)
    
    new_rows = []    
    df.apply(is_loved_journey_step_3,axis=1,args=(new_rows, target_column))
#     df.apply(row_fn,axis=1, args=(target_column))
    new_df = pd.DataFrame(new_rows)
    return new_df

In [246]:
is_loved_journey_step_3_result_df = is_loved_journey_step_3_df(is_loved_journey_step_2_result_df[
    (is_loved_journey_step_2_result_df['ABVariant'] == 'A') & 
    (is_loved_journey_step_2_result_df['Has_Related'] == 1) &
    (is_loved_journey_step_2_result_df['is_loved_2'] == False)], 'is_loved_2')

In [250]:
is_loved_journey_step_3_result_df[is_loved_journey_step_3_result_df['is_loved_2'] == False].shape

(58, 10)

In [251]:
is_loved_journey_step_3_result_df.shape

(81, 10)

In [252]:
is_loved_journey_step_3_result_df[is_loved_journey_step_3_result_df['is_loved_2'] == False]['Page_List']

0     [/government/publications/cic53-application-to...
1     [/government/publications/form-n460-reasons-fo...
3                                                    []
4     [/government/publications/form-n5-claim-form-f...
5     [/government/publications/national-insurance-s...
6     [/guidance/the-g-cloud-framework-on-the-digita...
7     [/government/publications/stamp-duty-land-tax-...
8     [/guidance/european-temporary-leave-to-remain-...
9     [/government/publications/heavy-good-vehicle-d...
10    [/government/publications/construction-industr...
11    [/government/organisations/department-for-busi...
12    [/guidance/zika-virus-travel-advice, /guidance...
13    [/government/organisations/hm-passport-office,...
14    [/government/publications/upper-tribunal-tax-a...
15    [/government/publications/hmrc-exchange-rates-...
16    [/government/publications/overseas-passport-ap...
18    [/guidance/sign-up-to-tax-free-childcare-if-yo...
19    [/government/publications/british-national

In [253]:
is_loved_journey_step_3_result_df['Page_Event_List'][61]

[('/pay-council-tax/waverley',
  'EVENT<:<relatedLinkClicked<:<1.1 Related content',
  '107b4fd7-fef4-4a86-9796-2967c95f3f3c')]

In [None]:
# '/pay-council-tax/' at the beginning of path

In [274]:
list_rl_pages = list()
for row in is_loved_journey_step_3_result_df[is_loved_journey_step_3_result_df['is_loved_2'] == False]['Page_Event_List']:
    for item in row:
        if 'relatedLinkClicked' in item[1] and 'Related content' in item[1]:
            list_rl_pages.append(item[0])
 

In [275]:
from collections import Counter


In [276]:
Counter(list_rl_pages)

Counter({'/government/publications/form-cic37-application-to-convert-a-company-to-a-cic': 1,
         '/government/publications/form-n460-reasons-for-allowing-or-refusing-permission-to-appeal-including-referral-to-the-court-of-appeal-civil-division-and-information-concern': 2,
         '/done/driving-transaction-finished': 2,
         '/government/publications/form-n5-claim-form-for-possession-of-property': 1,
         '/government/publications/national-insurance-statement-of-national-insurance-contributions-ca3916': 1,
         '/guidance/g-cloud-buyers-guide': 1,
         '/government/publications/stamp-duty-land-tax-relief-for-first-time-buyers-guidance-note': 1,
         '/guidance/european-temporary-leave-to-remain-in-the-uk': 1,
         '/government/publications/heavy-good-vehicle-drivers-daily-walkaround-check': 1,
         '/government/publications/construction-industry-scheme-payment-and-deduction-certificate': 1,
         '/business-finance-support/coventry-and-warwickshire-

# All filters

In [None]:
loved_pages_df = pd.read_csv(
    '/Users/suganyasivaskantharajah/code/govuk_ab_analysis/data/loved_pages.csv.gz',
    usecols=['pagePath'])
loved_page_paths = loved_pages_df['pagePath'].tolist()
loved_page_paths_set = set([loved_page.lower() for loved_page in loved_page_paths])


with open('../data/hmrc_contact_pages.json', "r") as read_file:
    contact_pages = json.load(read_file)

hmrc_contact_pages = [link['base_path'].lower for link in contact_pages['links']['children']]
hmrc_contact_pages_set = set(hmrc_contact_pages)
    

loved_smart_answers_df = pd.read_csv(
    '/Users/suganyasivaskantharajah/code/govuk_ab_analysis/data/loved_smart_answers.csv.gz',
    usecols=['pagePath'])

loved_smart_answers_df['pagePath'] = loved_smart_answers_df['pagePath'] + '/'
loved_smart_answers = [loved_page.lower() for loved_page in set(
    loved_smart_answers_df['pagePath'].tolist())]

In [None]:
def is_loved_page_overall(page):
    return any([re.match('/foreign-travel-advice/',page),
                page in hmrc_contact_pages_set,
                page in loved_page_paths_set,
                page == '/help',
               any([pagepath in page for pagepath in loved_smart_answers]),
               re.match('/premises-licence/',page),
                is_loved_page(page.split('?')[0]),
               page in ['/help/terms-conditions', '/help/about-govuk',
                         '/help/accessibility', '/help/privacy-policy',
                         '/help/cookies', '/help/update-email-notifications',
                         '/help/browsers', '/help/beta'],
                re.match('/find-local-council/',page)])

# filter on Page_Event_List too in case it doesn't match Page_List - e.g. 
# when page hits happen before midnight but events happen after?
def is_loved_page_event_list_overall(page_event_list):
    return any([is_loved_page_overall(triple[0]) for triple in page_event_list])

In [None]:
def is_loved_journey_overall(df, target_column):
#     def row_fn(row, target_column):
#         print(row['Page_List'])
    def is_loved_journey_overall_by_row(row, row_accumulator, target_column):
        new_row = row.to_dict()            
        if any([
            any([is_loved_page_overall(page) for page in row['Page_List']]), 
                is_loved_page_event_list_overall(row['Page_Event_List'])]):
            new_row[target_column] = True
        else:
            new_row[target_column] = False
        row_accumulator.append(new_row)
    
    new_rows = []    
    df.apply(is_loved_journey_step_2,axis=1,args=(new_rows, target_column))
#     df.apply(row_fn,axis=1, args=(target_column))
    new_df = pd.DataFrame(new_rows)
    return new_df