This notebook provides code to read in processed journey data, including the taxon variable. We then demonstrate how to rank user journeys (`Sequence`) by `Occurrences`. Finally, we create a page-to-taxon dictionary.

In [2]:
import os
import pandas as pd

In [3]:
from ast import literal_eval

In [4]:
# Folder holding processed outputs, located under the DATA_DIR environment variable.
# os.environ[...] is used instead of os.getenv(...) so that an unset DATA_DIR fails
# immediately with a KeyError naming the variable, rather than passing None to
# os.path.join and raising an opaque TypeError.
output = os.path.join(os.environ["DATA_DIR"], "output")

In [5]:
# Full path to the gzipped journey extract inside the output folder; the bare
# name on the last line echoes it so the resolved location can be checked.
result_path = os.path.join(output, "brexit_taxon_29_04.csv.gz")
result_path

'/Users/felisialoukou/Documents/govuk-network-data/data/output/brexit_taxon_29_04.csv.gz'

In [6]:
# Load the journey table: the file is tab-separated and gzip-compressed.
df = pd.read_csv(
    result_path,
    compression="gzip",
    sep="\t",
)

In [12]:
# (rows, columns) of the loaded journey table.
df.shape

(77112, 17)

In [7]:
# Preview the first rows to check how the columns were read in.
df.head()

Unnamed: 0,Occurrences,DeviceCategories,PageSeq_Length,Actions_Length,Dates,Sequence,PageSequence,Page_Event_List,Page_List,Event_List,num_event_cats,Event_cats_agg,Event_cat_act_agg,Taxon_List,Taxon_Page_List,Page_List_NL,Page_Seq_NL
0,1,"[('mobile', 1)]",2,2,"[('20181029', 1)]",/government/news/new-fund-to-support-vulnerabl...,/government/news/new-fund-to-support-vulnerabl...,[('/government/news/new-fund-to-support-vulner...,['/government/news/new-fund-to-support-vulnera...,"[('PAGE_NULL', 'PAGE_NULL'), ('PAGE_NULL', 'PA...",1,"[('PAGE_NULL', 2)]","[(('PAGE_NULL', 'PAGE_NULL'), 2)]","['d6c2de5d-ef90-45d1-82d4-5f2438369eea,7f3e73e...",[('/government/news/new-fund-to-support-vulner...,['/government/news/new-fund-to-support-vulnera...,/government/news/new-fund-to-support-vulnerabl...
1,1,"[('mobile', 1)]",2,2,"[('20181029', 1)]",/guidance/use-software-to-submit-your-vat-retu...,/guidance/use-software-to-submit-your-vat-retu...,[('/guidance/use-software-to-submit-your-vat-r...,['/guidance/use-software-to-submit-your-vat-re...,"[('PAGE_NULL', 'PAGE_NULL'), ('PAGE_NULL', 'PA...",1,"[('PAGE_NULL', 2)]","[(('PAGE_NULL', 'PAGE_NULL'), 2)]","['b20215a9-25fb-4fa6-80a3-42e23f5352c2,426bf4a...",[('/guidance/use-software-to-submit-your-vat-r...,['/guidance/use-software-to-submit-your-vat-re...,/guidance/use-software-to-submit-your-vat-retu...
2,1,"[('mobile', 1)]",2,2,"[('20181029', 1)]",/guidance/get-funding-to-start-a-charity<<PAGE...,/guidance/get-funding-to-start-a-charity>>/gui...,"[('/guidance/get-funding-to-start-a-charity', ...","['/guidance/get-funding-to-start-a-charity', '...","[('PAGE_NULL', 'PAGE_NULL'), ('PAGE_NULL', 'PA...",1,"[('PAGE_NULL', 2)]","[(('PAGE_NULL', 'PAGE_NULL'), 2)]","['668cd623-c7a8-4159-9575-90caac36d4b4,71882e9...","[('/guidance/get-funding-to-start-a-charity', ...","['/guidance/get-funding-to-start-a-charity', '...",/guidance/get-funding-to-start-a-charity>>/gui...
3,3,"[('mobile', 3)]",2,2,"[('20181029', 1), ('20181030', 1), ('20181104'...",/guidance/living-in-sweden<<PAGE<:<NULL<:<NULL...,/guidance/living-in-sweden>>/foreign-travel-ad...,"[('/guidance/living-in-sweden', 'PAGE<:<NULL<:...","['/guidance/living-in-sweden', '/foreign-trave...","[('PAGE_NULL', 'PAGE_NULL'), ('PAGE_NULL', 'PA...",1,"[('PAGE_NULL', 2)]","[(('PAGE_NULL', 'PAGE_NULL'), 2)]","['d6c2de5d-ef90-45d1-82d4-5f2438369eea,3dbeb4a...","[('/guidance/living-in-sweden', ('d6c2de5d-ef9...","['/guidance/living-in-sweden', '/foreign-trave...",/guidance/living-in-sweden>>/foreign-travel-ad...
4,2,"[('mobile', 2)]",2,2,"[('20181029', 1), ('20181101', 1)]",/guidance/eu-settlement-scheme-evidence-of-uk-...,/guidance/eu-settlement-scheme-evidence-of-uk-...,[('/guidance/eu-settlement-scheme-evidence-of-...,['/guidance/eu-settlement-scheme-evidence-of-u...,"[('PAGE_NULL', 'PAGE_NULL'), ('PAGE_NULL', 'PA...",1,"[('PAGE_NULL', 2)]","[(('PAGE_NULL', 'PAGE_NULL'), 2)]","['d6c2de5d-ef90-45d1-82d4-5f2438369eea,06e2928...",[('/guidance/eu-settlement-scheme-evidence-of-...,['/guidance/eu-settlement-scheme-evidence-of-u...,/guidance/eu-settlement-scheme-evidence-of-uk-...


In [8]:
# Order the rows from highest to lowest Occurrences and display their Page_List values.
df.sort_values(by="Occurrences", ascending=False)["Page_List"]

7573     ['/government/publications/amendments-to-tax-l...
5178     ['/settled-status-eu-citizens-families', '/set...
12514    ['/guidance/passport-rules-for-travel-to-europ...
8789     ['/settled-status-eu-citizens-families', '/set...
24604    ['/government/collections/eu-settlement-scheme...
44296    ['/government/publications/eu-settlement-schem...
8807     ['/settled-status-eu-citizens-families/applyin...
6347     ['/settled-status-eu-citizens-families', '/set...
6344     ['/settled-status-eu-citizens-families/applyin...
14057    ['/government/collections/eu-settlement-scheme...
5689     ['/guidance/passport-rules-for-travel-to-europ...
3828     ['/government/publications/taking-your-pet-abr...
8192     ['/government/collections/data-protection-act-...
7572     ['/guidance/passport-rules-for-travel-to-europ...
20049    ['/government/publications/cmas-role-if-theres...
6346     ['/settled-status-eu-citizens-families', '/set...
26048    ['/government/collections/eu-settlement-scheme.

In [9]:
# List the column names available on the journey table.
df.columns

Index(['Occurrences', 'DeviceCategories', 'PageSeq_Length', 'Actions_Length',
       'Dates', 'Sequence', 'PageSequence', 'Page_Event_List', 'Page_List',
       'Event_List', 'num_event_cats', 'Event_cats_agg', 'Event_cat_act_agg',
       'Taxon_List', 'Taxon_Page_List', 'Page_List_NL', 'Page_Seq_NL'],
      dtype='object')

In [10]:
def _parse_literal(value):
    """Return the Python object encoded in ``value`` when it is a string.

    Non-string values (cells that were already parsed, or missing values) are
    returned unchanged, so re-running this cell does not raise.
    """
    return literal_eval(value) if isinstance(value, str) else value


# Columns whose name contains "List" are read from the CSV as text; convert
# them to real Python objects. Each converted column name is printed.
for column in df.columns:
    if "List" in column:
        print(column)
        df[column] = df[column].map(_parse_literal)

Page_Event_List
Page_List
Event_List
Taxon_List
Taxon_Page_List
Page_List_NL


In [11]:
# Print every entry of Taxon_Page_List for the first ten rows.
for taxon_pages in df["Taxon_Page_List"].iloc[:10]:
    for page in taxon_pages:
        print(page)

('/government/news/new-fund-to-support-vulnerable-eu-citizens-apply-for-settled-status', ('d6c2de5d-ef90-45d1-82d4-5f2438369eea', '7f3e73e4-f20d-43be-9c80-021f2ac1897f'))
('/email/authenticate?address=[email]', ('other',))
('/guidance/use-software-to-submit-your-vat-returns', ('b20215a9-25fb-4fa6-80a3-42e23f5352c2', '426bf4a1-865b-4e4a-81ef-b2abeab6a39d'))
('/government/publications/partnership-pack-preparing-for-a-no-deal-eu-exit', ('d6c2de5d-ef90-45d1-82d4-5f2438369eea', '5984199c-d85c-4ffc-b13a-c5f2482d2258'))
('/guidance/get-funding-to-start-a-charity', ('668cd623-c7a8-4159-9575-90caac36d4b4', '71882e94-5491-49d2-b2f8-9f7e84611080'))
('/guidance/eu-settlement-scheme-pilot-applicant-eligibility', ('d6c2de5d-ef90-45d1-82d4-5f2438369eea', '7f3e73e4-f20d-43be-9c80-021f2ac1897f'))
('/guidance/living-in-sweden', ('d6c2de5d-ef90-45d1-82d4-5f2438369eea', '3dbeb4a3-33c0-4bda-bd21-b721b0f8736f', 'ecd9e694-366d-4014-83c0-081a4af04e92'))
('/foreign-travel-advice/sweden/safety-and-security', ('

In [17]:
# Build a page -> taxons lookup from the (page, taxons) pairs in Taxon_Page_List.
# The first value seen for a page is kept; later occurrences are ignored.
page_taxon_dict = {}
for taxon_pages in df["Taxon_Page_List"]:
    for page, taxons in taxon_pages:
        page_taxon_dict.setdefault(page, taxons)

In [18]:
# Number of distinct pages stored in the lookup.
len(page_taxon_dict)

45401