In [6]:
import os 
import pandas as pd
import numpy as np
import ast
import re

# progress bar
from tqdm import tqdm, tqdm_notebook
# Register `progress_apply` / `progress_map` on pandas objects.
# No positional argument is passed: current tqdm releases accept only keyword
# options here, and the class argument used previously did not select the
# notebook widget (the bar rendered further down is the plain-text one).
tqdm.pandas()

In [4]:
# Paths of the processed-journey extracts to load, one per date stamp.
# NOTE(review): the first two stamps use underscores and the remaining three
# use hyphens — confirm this matches the file names on disk.
unloved_list = [
    '../data/processed_journey/unloved_' + date_stamp + '.csv.gz'
    for date_stamp in (
        '2019_02_14',
        '2019_02_15',
        '2019-02-16',
        '2019-02-17',
        '2019-02-18',
    )
]

In [5]:
# read in processed sampled journey with just the cols we need for related links
# Each file is a gzip-compressed, tab-separated table. ignore_index=True gives
# the combined frame a single unique 0..n-1 index instead of repeating every
# file's own row labels, so label-based lookups are unambiguous straight away.
unloved_df = pd.concat(
    [pd.read_csv(
        filepath, sep="\t", compression="gzip"
    ) for filepath in unloved_list],
    ignore_index=True)


In [8]:
def _parse_event_aggregate(value):
    """Turn a stringified Python literal into the object it describes.

    Values that are not strings are returned unchanged, so re-running this
    cell after the column has already been parsed is a no-op instead of an
    error (ast.literal_eval rejects input that is already a list).
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value


# Event_cat_act_agg is read from CSV as text; convert each cell back into a
# Python object, with a progress bar because the column is large.
unloved_df['Event_cat_act_agg'] = unloved_df['Event_cat_act_agg'].progress_apply(
    _parse_event_aggregate)


100%|██████████| 1657704/1657704 [01:15<00:00, 22060.24it/s]


In [9]:
def get_number_of_events_rl(event):
    """Return the count of an aggregated event if it is a related link click.

    `event` is indexed as ((category, action), count). The count is returned
    when the category equals 'relatedLinkClicked' and the action contains
    'Related content'; any other event contributes 0.
    """
    descriptor = event[0]
    if descriptor[0] != 'relatedLinkClicked':
        return 0
    if 'Related content' not in descriptor[1]:
        return 0
    return event[1]


def sum_related_click_events(event_list):
    """Add up the related link click counts over all aggregated events in a journey."""
    total = 0
    for event in event_list:
        total += get_number_of_events_rl(event)
    return total


def is_related(x):
    """Return True when the related link click count `x` is greater than zero."""
    has_click = x > 0
    return has_click

In [11]:
# Count related link clicks in each row's aggregated event list, then store
# the result as a new column.
clicks_per_seq = unloved_df['Event_cat_act_agg'].map(sum_related_click_events)
unloved_df['Related Links Clicks per seq'] = clicks_per_seq

In [12]:
# Flag rows whose journey contains at least one related link click. The input
# is the per-sequence click count, so a vectorised comparison does the job
# without a Python-level call per row.
unloved_df["Has_Related"] = unloved_df["Related Links Clicks per seq"] > 0

# Scale the per-sequence click count by the row's Occurrences value to get the
# total clicks that row represents.
unloved_df['Related Links Clicks row total'] = unloved_df['Related Links Clicks per seq'] * unloved_df['Occurrences']


In [21]:
# Give the frame a fresh 0..n-1 index and discard the old labels. Reassigning
# (rather than inplace=True) keeps the cell's effect explicit.
unloved_df = unloved_df.reset_index(drop=True)

In [22]:
# n: summed Occurrences over every row.
# p: share of those occurrences that sit on rows flagged Has_Related.
n = unloved_df['Occurrences'].sum()
p = unloved_df.loc[unloved_df['Has_Related'] == 1, 'Occurrences'].sum() / n
print(n, p, sep="\n")

4142352
0.006147715114504996


In [26]:
# Number of rows (not occurrences) assigned to each A/B variant.
unloved_df['ABVariant'].value_counts()

A    832293
B    825411
Name: ABVariant, dtype: int64

In [28]:
# Short alias for the journey frame; this binds a second name to the same
# object rather than making a copy.
df = unloved_df

In [29]:
def _variant_counts(journeys, variant):
    """Return (trials, successes, proportion) for one A/B variant.

    trials     -- summed Occurrences over rows whose ABVariant equals `variant`
    successes  -- summed Occurrences over those rows that also have Has_Related set
    proportion -- successes / trials
    """
    in_variant = journeys['ABVariant'] == variant
    with_related = in_variant & (journeys['Has_Related'] == 1)
    trials = journeys.loc[in_variant, 'Occurrences'].sum()
    successes = journeys.loc[with_related, 'Occurrences'].sum()
    return trials, successes, successes / trials


# A: number of trials, number of successes (occurrences of journeys with at
# least one related link clicked) and the resulting proportion, for page A
n_a, x_a, p_a = _variant_counts(df, "A")

# B: the same three figures for page B
n_b, x_b, p_b = _variant_counts(df, "B")

# every occurrence must belong to exactly one of the two variants
assert (n == n_a + n_b), "Error in filtering by ABVariant!"

# validate assumptions
# The formula of z-statistic is valid only when sample size (n) is large enough.
# nAp, nAq, nBp and nBq should be ≥ 5.
# where p is probability of success (we can use current baseline)
# q = 1 - p
for n_variant in (n_a, n_b):
    assert (n_variant * p) >= 5, "Assumptions for z prop test invalid!"
    assert (n_variant * (1 - p)) >= 5, "Assumptions for z prop test invalid!"

In [31]:
# Variant A: trials, successes, proportion (one value per line).
print(n_a, x_a, p_a, sep="\n")

2094233
1865
0.0008905408328490669


In [32]:
# Variant B: trials, successes, proportion (one value per line).
print(n_b, x_b, p_b, sep="\n")

2048119
23601
0.011523256217045983


Although we're trying to remove all pages that have related links, some are sneaking through because they are not in our data dump:
- foreign travel advice pages, e.g. [this one](https://www.gov.uk/foreign-travel-advice/zambia)
- contact pages with quick links, e.g. [this one](https://www.gov.uk/government/organisations/hm-revenue-customs/contact/online-services-helpdesk)
- smart answers, e.g. [this one](https://www.gov.uk/settle-in-the-uk/y/you-re-in-or-have-been-in-hm-forces)
- [the help page](https://www.gov.uk/help)
- inconsistent Page_List and Page_Event_List, e.g.
  - Page_List "['/browse/visas-immigration/tourist-short-stay-visas']"
  - Page_Event_List "[('/government/publications/apply-for-a-uk-visa-in-china/3207063', 'EVENT<:<contentsClicked<:<content_item 1', 'other'), ('/government/publications/apply-for-a-uk-visa-in-china/3207063', 'EVENT<:<breadcrumbClicked<:<2', 'other'), ('/apply-to-come-to-the-uk', 'EVENT<:<relatedLinkClicked<:<1.3 Related content', 'other'), ('/apply-to-come-to-the-uk', 'EVENT<:<contentsClicked<:<content_item 2', 'other'), ('/apply-to-come-to-the-uk/prepare-your-application', 'EVENT<:<breadcrumbClicked<:<2', 'other'), ('/browse/visas-immigration', 'EVENT<:<secondLevelBrowseLinkClicked<:<3', 'other'), ('/browse/visas-immigration/tourist-short-stay-visas', 'PAGE<:<NULL<:<NULL', 'other'), ('/browse/visas-immigration', 'EVENT<:<thirdLevelBrowseLinkClicked<:<1.2', 'other'), ('/visit-uk-holiday-family-friends', 'EVENT<:<pageElementInteraction<:<stepNavShown', 'd612c61e-22f4-4922-8bb2-b04b9202126e'), ('/visit-uk-holiday-family-friends', 'EVENT<:<pageElementInteraction<:<stepNavShown', 'd612c61e-22f4-4922-8bb2-b04b9202126e'), ('/visit-uk-holiday-family-friends', 'EVENT<:<pageElementInteraction<:<stepNavShown', 'd612c61e-22f4-4922-8bb2-b04b9202126e'), ('/visit-uk-holiday-family-friends', 'EVENT<:<stepNavLinkClicked<:<2.1', 'd612c61e-22f4-4922-8bb2-b04b9202126e'), ('/apply-standard-visitor-visa?step-by-step-nav=e8254fb0-f2d9-45b1-bdc4-d17cbba11bef', 'EVENT<:<pageElementInteraction<:<stepNavShown', '29480b00-dc4d-49a0-b48c-25dda8569325'), ('/apply-standard-visitor-visa?step-by-step-nav=e8254fb0-f2d9-45b1-bdc4-d17cbba11bef', 'EVENT<:<pageElementInteraction<:<stepNavShown', '29480b00-dc4d-49a0-b48c-25dda8569325'), ('/apply-standard-visitor-visa?step-by-step-nav=e8254fb0-f2d9-45b1-bdc4-d17cbba11bef', 'EVENT<:<stepNavLinkClicked<:<3.1', '29480b00-dc4d-49a0-b48c-25dda8569325')]"

In [33]:
# Page_List values of variant-A rows that are flagged as having a related link click.
a_with_related = (df['ABVariant'] == 'A') & (df['Has_Related'] == 1)
df.loc[a_with_related, 'Page_List']

5104       ['/settle-in-the-uk/y/you-re-in-or-have-been-i...
5135       ['/government/organisations/hm-revenue-customs...
5528       ['/foreign-travel-advice/thailand', '/foreign-...
6409       ['/government/organisations/hm-revenue-customs...
6935       ['/browse/benefits/universal-credit', '/browse...
8695       ['/government/organisations/hm-revenue-customs...
9160       ['/foreign-travel-advice/zambia', '/foreign-tr...
9510       ['/foreign-travel-advice', '/foreign-travel-ad...
9828       ['/government/publications/cic53-application-t...
10925      ['/government/organisations/hm-revenue-customs...
11294      ['/government/organisations/hm-revenue-customs...
11541      ['/foreign-travel-advice', '/foreign-travel-ad...
11689      ['/foreign-travel-advice', '/foreign-travel-ad...
12642                      ['/help', '/contact', '/contact']
12725      ['/foreign-travel-advice/turkey', '/world/turk...
14068                                                     []
14104      ['/foreign-tr

In [44]:
# Page_Event_List of row 1648130, looked up by label in a single step.
df.loc[1648130, 'Page_Event_List']

"[('/government/publications/apply-for-a-uk-visa-in-china/3207063', 'EVENT<:<contentsClicked<:<content_item 1', 'other'), ('/government/publications/apply-for-a-uk-visa-in-china/3207063', 'EVENT<:<breadcrumbClicked<:<2', 'other'), ('/apply-to-come-to-the-uk', 'EVENT<:<relatedLinkClicked<:<1.3 Related content', 'other'), ('/apply-to-come-to-the-uk', 'EVENT<:<contentsClicked<:<content_item 2', 'other'), ('/apply-to-come-to-the-uk/prepare-your-application', 'EVENT<:<breadcrumbClicked<:<2', 'other'), ('/browse/visas-immigration', 'EVENT<:<secondLevelBrowseLinkClicked<:<3', 'other'), ('/browse/visas-immigration/tourist-short-stay-visas', 'PAGE<:<NULL<:<NULL', 'other'), ('/browse/visas-immigration', 'EVENT<:<thirdLevelBrowseLinkClicked<:<1.2', 'other'), ('/visit-uk-holiday-family-friends', 'EVENT<:<pageElementInteraction<:<stepNavShown', 'd612c61e-22f4-4922-8bb2-b04b9202126e'), ('/visit-uk-holiday-family-friends', 'EVENT<:<pageElementInteraction<:<stepNavShown', 'd612c61e-22f4-4922-8bb2-b04b

In [43]:
# Page_List of the same row (1648130), for comparison with its Page_Event_List.
df.loc[1648130, 'Page_List']

"['/browse/visas-immigration/tourist-short-stay-visas']"

In [49]:
# Every column of row 1648130.
df.loc[1648130]

Occurrences                                                                       1
ABVariant                                                                         A
Page_Event_List                   [('/government/publications/apply-for-a-uk-vis...
Page_List                         ['/browse/visas-immigration/tourist-short-stay...
Event_cat_act_agg                 [((contentsClicked, content_item 1), 1), ((bre...
is_loved_journey                                                              False
Related Links Clicks per seq                                                      1
Has_Related                                                                    True
Related Links Clicks row total                                                    1
Name: 1648130, dtype: object

In [50]:
# Page_List of row 1638584, looked up by label in a single step.
df.loc[1638584, 'Page_List']

"['/guidance/non-resident-trusts', '/government/organisations/hm-revenue-customs/contact/trusts', '/guidance/trusts-and-capital-gains-tax']"

In [51]:
# Page_Event_List of row 1638584, to see on which page the related link click was recorded.
df.loc[1638584, 'Page_Event_List']

"[('/guidance/non-resident-trusts', 'PAGE<:<NULL<:<NULL', 'a83ce24e-0ef4-476e-bac6-d18061e0a22e,41d3523d-34c4-445f-80c8-d429663cb184'), ('/government/organisations/hm-revenue-customs/contact/trusts', 'PAGE<:<NULL<:<NULL', 'other'), ('/government/organisations/hm-revenue-customs/contact/trusts', 'EVENT<:<relatedLinkClicked<:<1.2 Related content', 'other'), ('/guidance/trusts-and-capital-gains-tax', 'EVENT<:<user_satisfaction_survey<:<banner_shown', '3bc4ec93-fd86-4c66-98d0-7623cbbaa6be,41d3523d-34c4-445f-80c8-d429663cb184'), ('/guidance/trusts-and-capital-gains-tax', 'PAGE<:<NULL<:<NULL', '3bc4ec93-fd86-4c66-98d0-7623cbbaa6be,41d3523d-34c4-445f-80c8-d429663cb184')]"