In [82]:
import os 
import pandas as pd
import numpy as np
import ast
import re
import json

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)

In [83]:
DATA_DIR = os.getenv("DATA_DIR")

# Page-level

### Read in data (content items)

In [84]:
clean_content_df = pd.read_csv(os.path.join(DATA_DIR, 'metadata/clean_content_links.csv')
                              )

### Keep only pages with related links

In [85]:
# only select rows that have related links (they have 
# related_mainstream_content, ordered_related_items, or quick_links)
clean_content_rl_df = clean_content_df.copy().query(
    'related_mainstream_content.notnull() or ordered_related_items.notnull()or part_of_step_navs.notnull() or quick_links.notnull()'
    )

### Tidy related link pages data

In [86]:
# fill NaNs with empty arrays, and then literal_eval all the arrays so we can
# access the items within them (the different slugs associated with each
# content ID)
clean_content_rl_df['slugs'] = clean_content_rl_df['slugs'].fillna("['']").apply(
    ast.literal_eval)

This converts the string to a list and puts an empty list where there are none

In [87]:
clean_content_rl_df.reset_index(inplace=True, drop=True)

In [88]:
# sometimes there isn't an empty slug in the list of slugs, but the page path
# exists, so this is a little hack to includ the plain basePath
def add_dummy_slug(slugs):
    list1 = ['']
    list1.extend(slugs)
    return list(set(list1))

dummmy_slug is the url without any slug which  is also a page so needs an empty slug

In [89]:
clean_content_rl_df['slugs'] = clean_content_rl_df['slugs'].apply(add_dummy_slug)

wide to long

In [90]:
# adapted from https://gist.github.com/jlln/338b4b0b55bd6984f883
def splitDataFrameList(df,target_column):
    '''
    df = dataframe to split,
    target_column = the column containing the values to split, in an array
    returns: a dataframe with each entry for the target column separated,
        with each element moved into a new row. The values in the other
        columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row,row_accumulator,target_column):
        for s in row[target_column]:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column))
    new_df = pd.DataFrame(new_rows)
    return new_df

In [91]:
clean_content_exploded_df = splitDataFrameList(
    clean_content_rl_df, 'slugs')

In [92]:
def add_slash_between_basepath_slug(slug):
    if slug == '':
        return ''
    else:
        return '/' + slug

In [93]:
clean_content_exploded_df['slug'] = clean_content_exploded_df['slugs'].map(
    add_slash_between_basepath_slug)

In [94]:
clean_content_exploded_df['pagePath'] = clean_content_exploded_df['base_path']  + clean_content_exploded_df['slug']


In [95]:
clean_content_exploded_df.to_csv(os.path.join(DATA_DIR, 'metadata/loved_pages.csv.gz'),
                        compression='gzip', index=False)

SMART answers needed separately as each answer gets a new slug so the beginning of the url is matched to classify these as loved

In [15]:
clean_content_exploded_df[
    clean_content_exploded_df['document_type'] == 'simple_smart_answer'].to_csv(os.path.join(DATA_DIR, 'metadata/loved_smart_answers.csv.gz'),
                        compression='gzip', index=False)

# Journey-level

In [16]:
REQUIRED_COLUMNS = ["Occurrences", "ABVariant", "Page_Event_List",
                    "Page_List",  "Event_cat_act_agg"]

In [17]:
loved_pages_df = pd.read_csv(
    os.path.join(DATA_DIR, 'metadata/loved_pages.csv.gz'),
    usecols=['pagePath'])

In [18]:
# dedupe the pagePaths here just in case
loved_page_paths = list(set(loved_pages_df['pagePath'].tolist()))

In [20]:
loved_page_paths_set = set(loved_page_paths)

In [21]:
loved_smart_answers_df = pd.read_csv(
    os.path.join(DATA_DIR, 'metadata/loved_smart_answers.csv.gz'),
    usecols=['pagePath'])

In [22]:
loved_smart_answers_df['pagePath'] = loved_smart_answers_df['pagePath'] + '/'

In [23]:
loved_smart_answers = list(set(
    loved_smart_answers_df['pagePath'].tolist()))

the pages come from links in this page https://www.gov.uk/government/organisations/hm-revenue-customs/contact

In [24]:
with open(os.path.join(DATA_DIR, 'metadata/hmrc_contact_pages.json'), "r") as read_file:
    contact_pages = json.load(read_file)

In [25]:
hmrc_contact_pages = [link['base_path'] for link in contact_pages['links']['children']]

In [26]:
hmrc_contact_pages_set = set(hmrc_contact_pages)

In [61]:
def is_loved_page(page):
    return any([
        re.match('/foreign-travel-advice/',page),
        page in hmrc_contact_pages_set,
        page in loved_page_paths_set,
        page == '/help',
        re.match('/premises-licence/',page),
        #is_loved_page(page.split('?')[0]),
        page in ['/help/terms-conditions', 
         '/help/about-govuk',
         '/help/accessibility', 
         '/help/privacy-policy',
         '/help/cookies', 
         '/help/update-email-notifications',
         '/help/browsers', 
         '/help/beta'],
        re.match('/find-local-council/',page),
        any([pagepath in page for pagepath in loved_smart_answers]), 
        ])

# filter on Page_Event_List too in case it doesn't match Page_List - e.g. 
# when page hits happen before midnight but events happen after?
def is_loved_page_event_list(page_event_list):
    return any([is_loved_page(triple[0]) for triple in page_event_list])

In [106]:
def split_daily_data(file_prefix):
    print("reading data")
    df = pd.read_csv(
        os.path.join(DATA_DIR, f'processed_journey/taxon_ab_{file_prefix}.csv.gz'), 
        sep='\t', 
        usecols=REQUIRED_COLUMNS)
    print("page_list to literal list")
    df['Page_List'] = df['Page_List'].apply(ast.literal_eval)
    print("Derive var: there_is_atleastone_loved_page")
    df['there_is_atleastone_loved_page'] = df.Page_List.map(lambda x: any([is_loved_page(page.split('?')[0]) for page in x]))
    print("Derive var: there_is_atleastone_loved_event")
    df['there_is_atleastone_loved_event'] = is_loved_page_event_list(df['Page_Event_List'])
    print("Derive var: is_loved_journey")
    df = df.assign(is_loved_journey = np.where((df.there_is_atleastone_loved_page==1) |
                                                 (df.there_is_atleastone_loved_event==1), 
                                                 True, False))
    
    print("Number of occurences of journeys of this type")
    print(df.groupby('is_loved_journey').sum())
    
    print("writing files")

    df[df['is_loved_journey']==False].to_csv(os.path.join(DATA_DIR, f'processed_journey/unloved_{file_prefix}.csv.gz'), 
                                             sep="\t", 
                                             compression="gzip", index=False)
    df[df['is_loved_journey']==True].to_csv(
        os.path.join(DATA_DIR, f'processed_journey/loved_{file_prefix}.csv.gz'), 
        sep="\t", 
        compression="gzip", 
        index=False)

In [103]:
split_daily_data('2019-02-15')


  0%|          | 0/1294250 [00:00<?, ?it/s][A
  0%|          | 5793/1294250 [00:00<00:22, 57923.70it/s][A
  1%|          | 11139/1294250 [00:00<00:22, 56505.73it/s][A
  1%|▏         | 16195/1294250 [00:00<00:23, 54578.22it/s][A
  2%|▏         | 21392/1294250 [00:00<00:23, 53766.70it/s][A
  2%|▏         | 26502/1294250 [00:00<00:23, 52931.66it/s][A
  2%|▏         | 31518/1294250 [00:00<00:24, 52068.34it/s][A
  3%|▎         | 36645/1294250 [00:00<00:24, 51825.62it/s][A
  3%|▎         | 41720/1294250 [00:00<00:24, 51497.07it/s][A
  4%|▎         | 46942/1294250 [00:00<00:24, 51710.61it/s][A
  4%|▍         | 52115/1294250 [00:01<00:24, 51714.26it/s][A
  4%|▍         | 57127/1294250 [00:01<00:24, 50822.16it/s][A
  5%|▍         | 62101/1294250 [00:01<00:24, 50177.97it/s][A
  5%|▌         | 67045/1294250 [00:01<00:24, 49820.50it/s][A
  6%|▌         | 71976/1294250 [00:01<00:24, 49214.45it/s][A
  6%|▌         | 76867/1294250 [00:01<00:24, 49120.65it/s][A
  6%|▋         | 81756/

Number of occurences of journeys of this type
                  Occurrences  there_is_atleastone_loved_page  \
is_loved_journey                                                
False                  832970                             0.0   
True                  2486220                        962314.0   

                  there_is_atleastone_loved_event  
is_loved_journey                                   
False                                         0.0  
True                                          0.0  


In [105]:
split_daily_data('2019-02-16')

reading data
page_list to literal list



  0%|          | 0/810323 [00:00<?, ?it/s][A
  0%|          | 2479/810323 [00:00<00:32, 24779.39it/s][A


Derive var: there_is_atleastone_loved_page


  1%|          | 5193/810323 [00:00<00:31, 25442.25it/s][A
  1%|          | 7810/810323 [00:00<00:31, 25655.81it/s][A
  1%|▏         | 10504/810323 [00:00<00:30, 26027.82it/s][A
  2%|▏         | 13198/810323 [00:00<00:30, 26293.82it/s][A
  2%|▏         | 15792/810323 [00:00<00:30, 26186.28it/s][A
  2%|▏         | 18344/810323 [00:00<00:30, 25978.09it/s][A
  3%|▎         | 21108/810323 [00:00<00:29, 26454.97it/s][A
  3%|▎         | 23716/810323 [00:00<00:29, 26336.58it/s][A
  3%|▎         | 26464/810323 [00:01<00:29, 26657.16it/s][A
  4%|▎         | 29278/810323 [00:01<00:28, 27083.31it/s][A
  4%|▍         | 32023/810323 [00:01<00:28, 27191.08it/s][A
  4%|▍         | 34711/810323 [00:01<00:28, 27070.06it/s][A
  5%|▍         | 37475/810323 [00:01<00:28, 27231.81it/s][A
  5%|▍         | 40183/810323 [00:01<00:28, 26992.10it/s][A
  5%|▌         | 42873/810323 [00:01<00:28, 26937.97it/s][A
  6%|▌         | 45584/810323 [00:01<00:28, 26964.30it/s][A
  6%|▌         | 48359/810

Derive var: there_is_atleastone_loved_event
Derive var: is_loved_journey
Number of occurences of journeys of this type
                  Occurrences  there_is_atleastone_loved_page  \
is_loved_journey                                                
False                  425419                             0.0   
True                  1619383                        655696.0   

                  there_is_atleastone_loved_event  
is_loved_journey                                   
False                                         0.0  
True                                          0.0  
writing files


In [107]:
split_daily_data('2019-02-17')

reading data
page_list to literal list
Derive var: there_is_atleastone_loved_page
Derive var: there_is_atleastone_loved_event
Derive var: is_loved_journey
Number of occurences of journeys of this type
                  Occurrences  there_is_atleastone_loved_page  \
is_loved_journey                                                
False                  429250                             0.0   
True                  1642816                        684105.0   

                  there_is_atleastone_loved_event  
is_loved_journey                                   
False                                         0.0  
True                                          0.0  
writing files


In [None]:
split_daily_data('2019-02-18')

reading data
