In [38]:
import os 
import pandas as pd
import numpy as np
import ast
import re
import json

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)
from datetime import datetime
import glob


In [39]:
# Root directory for every input and output file used below; read from the
# DATA_DIR environment variable (None when the variable is not set).
DATA_DIR = os.environ.get("DATA_DIR")

# Page-level

### Read in data (content items)

In [40]:
# Load the cleaned content-item link metadata from DATA_DIR.
clean_content_df = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_DIR, 'metadata/clean_content_links.csv'),
)

### Keep only pages with related links

In [41]:
# Keep only the rows that have related links, i.e. at least one of
# related_mainstream_content, ordered_related_items, part_of_step_navs or
# quick_links is populated. The filter runs on a copy of the loaded frame.
clean_content_rl_df = (
    clean_content_df
    .copy()
    .query(
        'related_mainstream_content.notnull() or ordered_related_items.notnull()or part_of_step_navs.notnull() or quick_links.notnull()'
    )
)

In [42]:
# Spot-check one item: show the row(s) whose base path is /hunting.
clean_content_rl_df.loc[clean_content_rl_df['base_path'] == '/hunting']

Unnamed: 0,base_path,content_id,document_type,primary_publishing_organisation,publishing_app,title,ordered_related_items,quick_links,related_mainstream_content,related_guides,document_collections,part_of_step_navs,related_to_step_navs,slugs
176880,/hunting,8642926f-6bec-40c7-a158-e5e9c5361254,guide,Government Digital Service,publisher,hunting and shooting wildlife,"['9d06120f-4f08-407c-b98e-b849a48cf9a0', 'e516...",,,,,,,"['overview', 'Birds', 'mammals']"


### Tidy related link pages data

In [43]:
# Parse the stringified lists in `slugs` into real Python lists so that the
# individual slugs of each content ID can be accessed. Missing values are
# first replaced by "['']", which parses to a list holding one empty slug.
# Values that are already lists are left untouched, so re-running this cell
# does not make ast.literal_eval raise on non-string input.
clean_content_rl_df['slugs'] = clean_content_rl_df['slugs'].fillna("['']").apply(
    lambda slugs: ast.literal_eval(slugs) if isinstance(slugs, str) else slugs)

This converts each string to a list; rows with no slugs get a list containing a single empty string (`['']`).

In [44]:
# Renumber the rows from 0, discarding the index left over from the filter.
clean_content_rl_df = clean_content_rl_df.reset_index(drop=True)

In [45]:
# Sometimes there isn't an empty slug in the list of slugs, but the page path
# exists, so this is a little hack to include the plain basePath.
def add_dummy_slug(slugs):
    '''
    slugs = iterable of slug strings for one content item
    returns: a list with the empty slug '' first, followed by the given
        slugs in their original order with duplicates removed.
    '''
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is identical on every run; the order obtained from iterating a
    # set of strings can change between interpreter runs.
    return list(dict.fromkeys(['', *slugs]))

The dummy slug is the empty slug: the URL without any slug (the bare base path) is also a page, so every item needs an empty slug in its list.

In [46]:
# Make sure every content item also lists the empty slug (its bare base path).
clean_content_rl_df['slugs'] = clean_content_rl_df['slugs'].map(add_dummy_slug)

Reshape from wide to long: one row per (content item, slug) pair.

In [47]:
# adapted from https://gist.github.com/jlln/338b4b0b55bd6984f883
def splitDataFrameList(df,target_column):
    '''
    df = dataframe to split,
    target_column = the column containing the values to split, in an array
    returns: a dataframe with each entry for the target column separated,
        with each element moved into a new row. The values in the other
        columns are duplicated across the newly divided rows. Rows whose
        target value is empty produce no output rows; the result always has
        the same columns as df, even when it has no rows.
    raises: KeyError if target_column is not a column of df.
    '''
    column_names = list(df.columns)
    target_position = df.columns.get_loc(target_column)
    new_rows = []
    # Iterate explicitly rather than collecting rows as a side effect of
    # df.apply: pandas releases before 1.1 could call the applied function
    # twice on the first row, which would duplicate that row's entries.
    for row_values in df.itertuples(index=False, name=None):
        for element in row_values[target_position]:
            new_row = dict(zip(column_names, row_values))
            new_row[target_column] = element
            new_rows.append(new_row)
    # Passing the columns keeps the schema when there is nothing to emit.
    return pd.DataFrame(new_rows, columns=column_names)

In [48]:
# Explode the slug lists: one row per slug instead of one row per item.
clean_content_exploded_df = splitDataFrameList(
    df=clean_content_rl_df,
    target_column='slugs',
)

In [49]:
def add_slash_between_basepath_slug(slug):
    '''
    slug = a single slug string
    returns: '' for the empty slug, otherwise the slug prefixed with '/',
        ready to be appended to a base path.
    '''
    return '' if slug == '' else '/' + slug

In [50]:
# Turn each slug into the suffix that gets appended to the base path.
clean_content_exploded_df['slug'] = clean_content_exploded_df['slugs'].apply(
    add_slash_between_basepath_slug)

In [51]:
# Full page path = base path followed by the (possibly empty) slug suffix.
clean_content_exploded_df['pagePath'] = (
    clean_content_exploded_df['base_path']
    + clean_content_exploded_df['slug']
)


In [52]:
# Save the exploded page list as a gzip-compressed CSV without the index.
clean_content_exploded_df.to_csv(
    path_or_buf=os.path.join(DATA_DIR, 'metadata/loved_pages.csv.gz'),
    index=False,
    compression='gzip',
)

Smart answers need to be handled separately: each answer gets a new slug, so the beginning of the URL is matched to classify these pages as loved.

In [53]:
# Save only the rows whose document_type is simple_smart_answer to their own
# gzip-compressed CSV (no index).
clean_content_exploded_df.loc[
    clean_content_exploded_df['document_type'] == 'simple_smart_answer'
].to_csv(
    path_or_buf=os.path.join(DATA_DIR, 'metadata/loved_smart_answers.csv.gz'),
    index=False,
    compression='gzip',
)

# Journey-level

In [54]:
# Columns read from each daily journey file.
REQUIRED_COLUMNS = [
    "Occurrences",
    "ABVariant",
    "Page_Event_List",
    "Page_List",
    "Event_cat_act_agg",
]

In [55]:
# Only the pagePath column of the saved loved pages is needed from here on.
loved_pages_df = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_DIR, 'metadata/loved_pages.csv.gz'),
    usecols=['pagePath'],
)

In [56]:
# A set gives fast exact-match membership tests on page paths.
loved_page_paths_set = set(loved_pages_df['pagePath'])

In [57]:
# Only the pagePath column of the saved smart answers is needed.
loved_smart_answers_df = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_DIR, 'metadata/loved_smart_answers.csv.gz'),
    usecols=['pagePath'],
)

In [58]:
# Append a trailing slash to every smart answer path.
# NOTE(review): presumably so that only URLs continuing below the path
# match - confirm.
loved_smart_answers_df = loved_smart_answers_df.assign(
    pagePath=loved_smart_answers_df['pagePath'] + '/'
)

In [59]:
# De-duplicated list of smart answer paths (element order is arbitrary).
loved_smart_answers = list(set(loved_smart_answers_df['pagePath']))

The pages come from the links on this page: https://www.gov.uk/government/organisations/hm-revenue-customs/contact

In [60]:
# Load the saved JSON describing the HMRC contact page and its links.
with open(os.path.join(DATA_DIR, 'metadata/hmrc_contact_pages.json'), mode="r") as contact_pages_file:
    contact_pages = json.loads(contact_pages_file.read())

In [61]:
# Collect the base path of every child link of the contact page.
hmrc_contact_pages = []
for child_link in contact_pages['links']['children']:
    hmrc_contact_pages.append(child_link['base_path'])

In [62]:
# A set gives fast exact-match membership tests on contact page paths.
hmrc_contact_pages_set = {contact_page for contact_page in hmrc_contact_pages}

In [65]:
# Path prefixes that always count as loved. Compiled once rather than on
# every call; used with .match, which anchors at the start of the path.
_LOVED_PREFIX_PATTERN = re.compile(
    r'/foreign-travel-advice/|/find-local-council/|/premises-licence/')

# Page paths that count as loved on an exact match.
_LOVED_EXACT_PAGES = frozenset([
    '/help',
    '/help/terms-conditions',
    '/help/about-govuk',
    '/help/accessibility',
    '/help/privacy-policy',
    '/help/cookies',
    '/help/update-email-notifications',
    '/help/browsers',
    '/help/beta',
    '/visit-europe-brexit',
    '/apply-company-tachograph-card',
    '/cymraeg',
    '/guidance/apprenticeship-funding-rules',
])

# A page counts as loved when any of these occurs anywhere in its path.
_LOVED_PATH_FRAGMENTS = (
    '/food-premises-approval',
    '/marriage-abroad',
    '/guidance/transport-goods-out-of-the-uk-by-road-if-the-uk-leaves-the-eu-without-a-deal-checklist-for-hauliers',
    '/check-british-citizenship',
    '/renew-driving-licence',
)


def is_loved_page(page):
    '''
    page = a page path string, optionally followed by a '?query' part
    returns: True when the path (query string removed) is classified as
        loved, otherwise False. A path is loved when it
        - starts with one of the prefixes in _LOVED_PREFIX_PATTERN,
        - is in hmrc_contact_pages_set or loved_page_paths_set,
        - is one of _LOVED_EXACT_PAGES,
        - contains one of _LOVED_PATH_FRAGMENTS, or
        - starts with one of the paths in loved_smart_answers.
    '''
    page = page.split('?')[0]
    # Chained `or` short-circuits, so the cheap set lookups run first and the
    # scan over all smart answer paths only happens when nothing else hit.
    return bool(
        _LOVED_PREFIX_PATTERN.match(page)
        or page in hmrc_contact_pages_set
        or page in loved_page_paths_set
        or page in _LOVED_EXACT_PAGES
        or any(fragment in page for fragment in _LOVED_PATH_FRAGMENTS)
        # Smart answers are matched on the beginning of the URL only, so a
        # smart answer path appearing in the middle of another URL does not
        # count as a match.
        or any(page.startswith(prefix) for prefix in loved_smart_answers)
    )

In [66]:
def split_daily_data(file_prefix):
    '''
    Split one day of journey data into loved and unloved journeys.

    file_prefix = date of the daily file as a 'YYYY-MM-DD' string; it is used
        in the input/output file names and to print the weekday.

    Reads processed_journey/taxon_ab_<file_prefix>.csv.gz (tab-separated,
    REQUIRED_COLUMNS only) from DATA_DIR, marks a journey as loved when at
    least one page in its Page_Event_List satisfies is_loved_page, prints the
    Occurrences-weighted share of unloved journeys, and writes the two
    subsets to processed_journey/unloved_<file_prefix>.csv.gz and
    processed_journey/loved_<file_prefix>.csv.gz.

    returns: None
    '''
    print(f"reading {file_prefix} data")
    df = pd.read_csv(
        os.path.join(DATA_DIR, f'processed_journey/taxon_ab_{file_prefix}.csv.gz'),
        sep='\t',
        usecols=REQUIRED_COLUMNS)
    print("page_event_list to literal list")
    df['Page_Event_List'] = df['Page_Event_List'].apply(ast.literal_eval)
    print("Derive var: there_is_atleastone_loved_page")
    # Filter on Page_Event_List rather than Page_List: the two sometimes
    # don't match, and Page_List is derived from Page_Event_List.
    # The first element of every entry is the page path.
    df['pages'] = df['Page_Event_List'].apply(
        lambda events: [event[0] for event in events])
    df['there_is_atleastone_loved_page'] = df['pages'].map(
        lambda pages: any(is_loved_page(page) for page in pages))
    print("Derive var: is_loved_journey")
    df = df.assign(
        is_loved_journey=df['there_is_atleastone_loved_page'].astype(bool))

    print("Number of occurences of journeys of this type")
    print(df[['Occurrences', 'is_loved_journey']].groupby('is_loved_journey').sum())
    # Compute the share from boolean masks instead of positional rows of the
    # grouped table, so days with only loved or only unloved journeys are
    # handled correctly.
    unloved_mask = ~df['is_loved_journey']
    total_occurrences = df['Occurrences'].sum()
    unloved_occurrences = df.loc[unloved_mask, 'Occurrences'].sum()
    if total_occurrences:
        percent = float(unloved_occurrences) / float(total_occurrences)
    else:
        # No journeys at all: the share is undefined.
        percent = float('nan')
    print("{:2.2%} of journeys are unloved on {}, a {}".format(percent, file_prefix, datetime.strptime(file_prefix, '%Y-%m-%d').strftime("%A")))
    print("writing files")

    df[unloved_mask].to_csv(
        os.path.join(DATA_DIR, f'processed_journey/unloved_{file_prefix}.csv.gz'),
        sep="\t",
        compression="gzip",
        index=False)
    df[~unloved_mask].to_csv(
        os.path.join(DATA_DIR, f'processed_journey/loved_{file_prefix}.csv.gz'),
        sep="\t",
        compression="gzip",
        index=False)

In [67]:
# Every daily journey file (loved and unloved still combined), sorted by name.
combined_loved_and_unloved = glob.glob(f'{DATA_DIR}/processed_journey/taxon_ab_*.csv.gz')
combined_loved_and_unloved.sort()

In [68]:
# Display the daily files that were found, i.e. the files the loop below
# will process.
combined_loved_and_unloved

['/Users/suganyasivaskantharajah/code/govuk_ab_analysis/data/processed_journey/taxon_ab_2019-03-23.csv.gz',
 '/Users/suganyasivaskantharajah/code/govuk_ab_analysis/data/processed_journey/taxon_ab_2019-03-24.csv.gz',
 '/Users/suganyasivaskantharajah/code/govuk_ab_analysis/data/processed_journey/taxon_ab_2019-03-25.csv.gz',
 '/Users/suganyasivaskantharajah/code/govuk_ab_analysis/data/processed_journey/taxon_ab_2019-03-26.csv.gz',
 '/Users/suganyasivaskantharajah/code/govuk_ab_analysis/data/processed_journey/taxon_ab_2019-03-27.csv.gz']

In [69]:
# Pull the YYYY-MM-DD date out of each file name and split that day's data.
for daily_file in combined_loved_and_unloved:
    split_daily_data(re.search(r'\d{4}-\d{2}-\d{2}', daily_file).group())

reading 2019-03-23 data
page_event_list to literal list
Derive var: there_is_atleastone_loved_page
Derive var: is_loved_journey
Number of occurences of journeys of this type
                  Occurrences
is_loved_journey             
False                  498577
True                  1722339
22.45% of journeys are unloved on 2019-03-23, a Saturday
writing files
reading 2019-03-24 data
page_event_list to literal list
Derive var: there_is_atleastone_loved_page
Derive var: is_loved_journey
Number of occurences of journeys of this type
                  Occurrences
is_loved_journey             
False                  478630
True                  1709592
21.87% of journeys are unloved on 2019-03-24, a Sunday
writing files
reading 2019-03-25 data
page_event_list to literal list
Derive var: there_is_atleastone_loved_page
Derive var: is_loved_journey
Number of occurences of journeys of this type
                  Occurrences
is_loved_journey             
False                 1128418
True    