In [1]:
import os 
import pandas as pd

import ast

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)

In [2]:
clean_content_df = pd.read_csv(
    '/Users/suganyasivaskantharajah/code/govuk-taxonomy-supervised-learning/data/clean_content_links.csv')

In [3]:
clean_content_df.shape

(236590, 14)

In [4]:
clean_content_df.head()

Unnamed: 0,base_path,content_id,document_type,primary_publishing_organisation,publishing_app,title,ordered_related_items,quick_links,related_mainstream_content,related_guides,document_collections,part_of_step_navs,related_to_step_navs,slugs
0,/government/publications/list-of-psychologists...,04a0cc0d-0b9f-45ad-bf57-7c54cbab9df9,guidance,Foreign & Commonwealth Office,whitehall,chile - list of psychologists and psychiatrist...,,,,,,,,
1,/government/statistics/uk-labour-market-statis...,a61985b0-d6eb-4cf1-8140-642b9557ce00,national_statistics,Office for National Statistics,whitehall,uk labour market statistics: may 2017,,,,,,,,
2,/government/publications/monitor-remuneration-...,d569ef4b-d632-49a0-9795-6a7ea934b799,transparency,Monitor,whitehall,monitor: remuneration committee papers october...,,,,,['5f4c7b0f-7631-11e4-a3cb-005056011aef'],,,
3,/government/statistical-data-sets/env26-expend...,5e0fee54-7631-11e4-a3cb-005056011aef,statistical_data_set,"Department for Environment, Food & Rural Affairs",whitehall,env26 - expenditure on biodiversity,,,,,['5eb6d242-7631-11e4-a3cb-005056011aef'],,,
4,/government/publications/hmg-spending-moratori...,581cabaf-2ed1-4411-8d1b-35b1bcb559b5,transparency,Department for International Development,whitehall,hmg spending moratoria: dfid ict january to ma...,,,,,['5eb71ddb-7631-11e4-a3cb-005056011aef'],,,


In [5]:
clean_content_df.count()

base_path                          236590
content_id                         236590
document_type                      236590
primary_publishing_organisation    223219
publishing_app                     236590
title                              236590
ordered_related_items                1935
quick_links                             0
related_mainstream_content            312
related_guides                       1086
document_collections                52778
part_of_step_navs                     170
related_to_step_navs                   69
slugs                                 775
dtype: int64

In [6]:
clean_content_df[
    clean_content_df['related_mainstream_content'].notnull() |
    clean_content_df['ordered_related_items'].notnull() |
    clean_content_df['part_of_step_navs']].count()

base_path                          2290
content_id                         2290
document_type                      2290
primary_publishing_organisation    2259
publishing_app                     2290
title                              2290
ordered_related_items              1935
quick_links                           0
related_mainstream_content          312
related_guides                      141
document_collections                134
part_of_step_navs                   170
related_to_step_navs                 57
slugs                               749
dtype: int64

In [10]:
# only select rows that have related links (they have 
# related_mainstream_content, ordered_related_items, or quick_links)
clean_content_df.query(
    'related_mainstream_content.notnull() or ordered_related_items.notnull()or part_of_step_navs.notnull() or quick_links.notnull()'
    , inplace=True)

In [11]:
clean_content_df.shape

(2290, 14)

In [12]:
# fill NaNs with empty arrays, and then literal_eval all the arrays so we can
# access the items within them (the different slugs associated with each
# content ID)
clean_content_df['slugs'] = clean_content_df['slugs'].fillna("['']").apply(
    ast.literal_eval)

In [13]:
clean_content_df.reset_index(inplace=True, drop=True)

In [14]:
clean_content_df['slugs'].head()

0                                                   []
1    [overview, rules-for-class-3-invalid-carriages...
2    [overview, what-you-can-get, eligibility, how-...
3                                                   []
4    [overview, how-it-works, calculating-dea, what...
Name: slugs, dtype: object

In [15]:
clean_content_df['slugs'][0]

['']

In [16]:
clean_content_df['slugs'][1]

['overview',
 'rules-for-class-3-invalid-carriages',
 'driving-on-the-road',
 'driving-on-footpaths-and-parking',
 'eyesight-requirements',
 'use-by-nondisabled-people',
 'vehicle-tax-registration-and-insurance']

In [17]:
# sometimes there isn't an empty slug in the list of slugs, but the page path
# exists, so this is a little hack to includ the plain basePath
def add_dummy_slug(slugs):
    list1 = ['']
    list1.extend(slugs)
    return list(set(list1))

In [18]:
add_dummy_slug(clean_content_df['slugs'][0])

['']

In [19]:
clean_content_df['slugs'] = clean_content_df['slugs'].apply(add_dummy_slug)

In [20]:
# adapted from https://gist.github.com/jlln/338b4b0b55bd6984f883
def splitDataFrameList(df,target_column):
    '''
    df = dataframe to split,
    target_column = the column containing the values to split, in an array
    returns: a dataframe with each entry for the target column separated,
        with each element moved into a new row. The values in the other
        columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row,row_accumulator,target_column):
        for s in row[target_column]:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column))
    new_df = pd.DataFrame(new_rows)
    return new_df

In [21]:
clean_content_exploded_df = splitDataFrameList(
    clean_content_df, 'slugs')

In [22]:
def overview_to_nothing(slug):
    if slug == '':
        return ''
    elif slug == 'overview':
        return ''
    else:
        return '/' + slug

In [23]:
clean_content_exploded_df['slug'] = clean_content_exploded_df['slugs'].map(
    overview_to_nothing)

In [24]:
clean_content_exploded_df['pagePath'] = clean_content_exploded_df['base_path']  + clean_content_exploded_df['slug']


In [25]:
clean_content_exploded_df.head()

Unnamed: 0,base_path,content_id,document_collections,document_type,ordered_related_items,part_of_step_navs,primary_publishing_organisation,publishing_app,quick_links,related_guides,related_mainstream_content,related_to_step_navs,slugs,title,slug,pagePath
0,/company-voluntary-arrangements,6fafa451-96b4-4d38-a97d-5261c8a62dd9,,answer,"['86e9a768-4162-4d75-9744-f92f56f461ae', 'a0c7...",,Government Digital Service,publisher,,,,,,company voluntary arrangements,,/company-voluntary-arrangements
1,/mobility-scooters-and-powered-wheelchairs-rules,b3202d1a-f26d-49a2-8f2b-0c7923ba410d,,guide,"['fa748fae-3de4-4266-ae85-0797ada3f40c', '2157...",,Government Digital Service,publisher,,,,,,mobility scooters and powered wheelchairs: the...,,/mobility-scooters-and-powered-wheelchairs-rules
2,/mobility-scooters-and-powered-wheelchairs-rules,b3202d1a-f26d-49a2-8f2b-0c7923ba410d,,guide,"['fa748fae-3de4-4266-ae85-0797ada3f40c', '2157...",,Government Digital Service,publisher,,,,,overview,mobility scooters and powered wheelchairs: the...,,/mobility-scooters-and-powered-wheelchairs-rules
3,/mobility-scooters-and-powered-wheelchairs-rules,b3202d1a-f26d-49a2-8f2b-0c7923ba410d,,guide,"['fa748fae-3de4-4266-ae85-0797ada3f40c', '2157...",,Government Digital Service,publisher,,,,,rules-for-class-3-invalid-carriages,mobility scooters and powered wheelchairs: the...,/rules-for-class-3-invalid-carriages,/mobility-scooters-and-powered-wheelchairs-rul...
4,/mobility-scooters-and-powered-wheelchairs-rules,b3202d1a-f26d-49a2-8f2b-0c7923ba410d,,guide,"['fa748fae-3de4-4266-ae85-0797ada3f40c', '2157...",,Government Digital Service,publisher,,,,,eyesight-requirements,mobility scooters and powered wheelchairs: the...,/eyesight-requirements,/mobility-scooters-and-powered-wheelchairs-rul...


In [26]:
clean_content_exploded_df[clean_content_exploded_df['base_path'] == '/deposit-protection-schemes-and-landlords']

Unnamed: 0,base_path,content_id,document_collections,document_type,ordered_related_items,part_of_step_navs,primary_publishing_organisation,publishing_app,quick_links,related_guides,related_mainstream_content,related_to_step_navs,slugs,title,slug,pagePath
3098,/deposit-protection-schemes-and-landlords,806a3397-878a-492f-977a-a5a2afbf249c,,guide,"['fd824dab-4f80-4a20-a7fd-b48c2c2f70f8', '3c65...",,Government Digital Service,publisher,,,,,,deposit protection schemes and landlords,,/deposit-protection-schemes-and-landlords
3099,/deposit-protection-schemes-and-landlords,806a3397-878a-492f-977a-a5a2afbf249c,,guide,"['fd824dab-4f80-4a20-a7fd-b48c2c2f70f8', '3c65...",,Government Digital Service,publisher,,,,,overview,deposit protection schemes and landlords,,/deposit-protection-schemes-and-landlords
3100,/deposit-protection-schemes-and-landlords,806a3397-878a-492f-977a-a5a2afbf249c,,guide,"['fd824dab-4f80-4a20-a7fd-b48c2c2f70f8', '3c65...",,Government Digital Service,publisher,,,,,information-you-must-give-to-your-tenants,deposit protection schemes and landlords,/information-you-must-give-to-your-tenants,/deposit-protection-schemes-and-landlords/info...
3101,/deposit-protection-schemes-and-landlords,806a3397-878a-492f-977a-a5a2afbf249c,,guide,"['fd824dab-4f80-4a20-a7fd-b48c2c2f70f8', '3c65...",,Government Digital Service,publisher,,,,,disputes,deposit protection schemes and landlords,/disputes,/deposit-protection-schemes-and-landlords/disp...
3102,/deposit-protection-schemes-and-landlords,806a3397-878a-492f-977a-a5a2afbf249c,,guide,"['fd824dab-4f80-4a20-a7fd-b48c2c2f70f8', '3c65...",,Government Digital Service,publisher,,,,,if-you-dont-protect-your-tenants-deposit,deposit protection schemes and landlords,/if-you-dont-protect-your-tenants-deposit,/deposit-protection-schemes-and-landlords/if-y...


In [28]:
clean_content_exploded_df.shape

(5977, 16)

In [43]:
clean_content_exploded_df.drop_duplicates().shape

(5977, 16)

In [40]:
len(set(clean_content_exploded_df['pagePath']))
# we should dedupe the list wehen we read it

5496

In [44]:
clean_content_exploded_df.to_csv('/Users/suganyasivaskantharajah/code/govuk_ab_analysis/loved_pages.csv.gz',
                        compression='gzip', index=False)