In [182]:
import os 
import pandas as pd
import numpy as np
from ast import literal_eval
from collections import Counter
from operator import itemgetter

### File/dir locations

In [75]:
# Root data directory, taken from the DATA_DIR environment variable.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # Fail fast with an actionable message; otherwise os.path.join(None, ...)
    # below raises an opaque TypeError.
    raise RuntimeError("The DATA_DIR environment variable must be set to the data directory.")
filename = "preprocessed_taxon_pageseq_20190114_20190116.csv.gz"
# Full preprocessed journey file.
df_file = os.path.join(DATA_DIR, "processed_journey", filename)
# Same file name with a "rel_" prefix: the subset written by the
# "relatedLinkClicked" filtering step further down.
df_rel_file = os.path.join(DATA_DIR, "processed_journey", "rel_"+filename)

### Load up unfiltered data: 14-16/01/19

In [3]:
# df = pd.read_csv(df_file, sep="\t", compression = "gzip")
# df.shape

**TODO: need to run this at some point to compare against "related content" link-specific dataset**

Early indications suggest that there are no major differences between the two.

In [None]:
# og_df_devices = Counter()
# for item in df.DeviceCategories.values:
#     for key,value in item:
#         og_df_devices[key]+=value

Original dataset shape (5048130, 17)

### Prepare dataset: keep only journeys containing "relatedLinkClicked" event
This requires further filtering, because `eventAction` needs to be "Related link" to avoid including "Explore the topic" links.

In [5]:
# df_rel = df[df.Sequence.str.contains("relatedLinkClicked")].copy(deep=True)
# df_rel.to_csv(df_rel_file, sep="\t", compression = "gzip")

### Evaluate journeys with related links

In [6]:
df_rel = pd.read_csv(df_rel_file, sep="\t", compression = "gzip")

In [10]:
# Drop the stray index column left over from the CSV round-trip.
# Reassigning (rather than inplace=True) with errors="ignore" keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df_rel = df_rel.drop(columns="Unnamed: 0", errors="ignore")

In [14]:
df_rel['Page_Seq_Occurrences'] = df_rel.groupby('PageSequence')['Occurrences'].transform('sum')

In [7]:
df_rel.shape

(592902, 18)

In [None]:
df_rel.head()

In [15]:
occ_shape = df_rel[df_rel.Page_Seq_Occurrences==1].shape[0]

In [36]:
"{}% of user journeys with all related links in 14-16/01 time period occur only once.".format(float('{:.6g}'.
                                                                        format((occ_shape*100)/df_rel.shape[0])))

'85.21% of user journeys with all related links in 14-16/01 time period occur only once.'

### Keep only journeys that are interacting with the "Related content" links

In [40]:
df_relc = df_rel[df_rel.Sequence.str.contains("Related content")].copy(deep=True)

In [194]:
relc_shape = df_relc.shape[0]
relc_shape

395777

In [195]:
df_relc.Occurrences.sum()

473975

In [45]:
"{}% of user journeys include only \"Related content\" links".format(float('{:.4g}'.
                                                                        format((relc_shape*100)/df_rel.shape[0])))

'66.75% of user journeys include only "Related content" links'

In [46]:
df_relc.Page_Seq_Occurrences.describe()

count    394602.000000
mean         16.854377
std         150.269932
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max        5467.000000
Name: Page_Seq_Occurrences, dtype: float64

In [59]:
columns = ['DeviceCategories', 'Dates', 'Page_Event_List', 'Page_List', 
           'Event_List',  'Event_cats_agg', 'Event_cat_act_agg',
           'Taxon_List', 'Taxon_Page_List', 'Page_List_NL']

In [69]:
# The list-valued columns come back from the CSV as strings; parse each one
# into Python objects unless its first value is already a list.
for column in columns:
    print(column)
    already_parsed = isinstance(df_relc[column].iloc[0], list)
    if already_parsed:
        continue
    df_relc[column] = df_relc[column].map(literal_eval)

DeviceCategories
Dates
Page_Event_List
Page_List
Event_List
Event_cats_agg
Event_cat_act_agg
Taxon_List
Taxon_Page_List
Page_List_NL


### Count how many related links there are per journey

In [126]:
df_relc.Event_cat_act_agg.iloc[2]

[(('PAGE_NULL', 'PAGE_NULL'), 24),
 (('contentsClicked', 'content_item 2'), 1),
 (('relatedLinkClicked', '1.4 Related content'), 1),
 (('contentsClicked', 'next'), 1),
 (('Simple Smart Answer', 'Completed'), 1),
 (('verify-hint', 'shown'), 2),
 (('Radio button chosen', 'sign-in-with-government-gateway'), 1),
 (('Radio button chosen', 'sign-in-with-government-gateway-with-hint'), 1),
 (('Radio button chosen', 'sign-in-with-gov-uk-verify'), 2),
 (('Radio button chosen', 'sign-in-with-gov-uk-verify-with-hint'), 2),
 (('webchat', 'busy'), 3),
 (('breadcrumbClicked', '3'), 1),
 (('filterClicked', 'Search'), 1),
 (('yesNoFeedbackForm', 'ffNoClick'), 1),
 (('homeLinkClicked', 'homeBreadcrumb'), 1),
 (('Radio button chosen', 'register-for-self-assessment'), 1),
 (('Radio button chosen', 'register-for-self-assessment-with-hint'), 1),
 (('webchat', 'available'), 1),
 (('contentsClicked', 'content_item 3'), 1),
 (('External Link Clicked', 'https://online.hmrc.gov.uk/shortforms/form/SA1'),
  2)]

In [130]:
def count_related_links(x):
    """Total the counts of "Related content" events in one journey.

    x is iterated as (key, count) pairs where key[1] is the event action
    string, e.g. (('relatedLinkClicked', '1.4 Related content'), 1).
    Returns the sum of the counts whose action contains "Related content".
    """
    total = 0
    for key, value in x:
        if "Related content" in key[1]:
            total += value
    return total

In [129]:
# Sanity-check the counter on the sample journey displayed above.
count_related_links(df_relc.Event_cat_act_agg.iloc[2])

1

In [132]:
df_relc['num_rel_cont'] = df_relc['Event_cat_act_agg'].map(count_related_links)

#### Distribution of occurrences and number of related links clicked

In [136]:
# Total journey occurrences for each number of "Related content" clicks.
occ_and_rel_count = Counter()
for clicks, occurrences in zip(df_relc['num_rel_cont'], df_relc['Occurrences']):
    occ_and_rel_count[clicks] += occurrences

In [196]:
occ_and_rel_count.most_common()

[(1, 381270),
 (2, 67877),
 (3, 16475),
 (4, 5066),
 (5, 1805),
 (6, 758),
 (7, 322),
 (8, 167),
 (9, 81),
 (10, 52),
 (11, 32),
 (12, 20),
 (13, 13),
 (14, 11),
 (15, 7),
 (17, 5),
 (26, 2),
 (27, 2),
 (23, 2),
 (16, 2),
 (20, 1),
 (45, 1),
 (47, 1),
 (18, 1),
 (19, 1),
 (24, 1)]

In [134]:
df_relc[df_relc['num_rel_cont']>1].shape

(90012, 21)

### Compute device category frequency counts over all journeys

In [71]:
# Sum the per-journey (device, count) pairs into one overall tally per device.
devices = Counter()
all_pairs = (pair for pairs in df_relc.DeviceCategories.values for pair in pairs)
for device, count in all_pairs:
    devices[device] += count

In [73]:
devices

Counter({'desktop': 256791, 'tablet': 55546, 'mobile': 161638})

In [76]:
df_relc.Event_cat_act_agg.iloc[0]

[(('PAGE_NULL', 'PAGE_NULL'), 38),
 (('secondLevelBrowseLinkClicked', '5'), 11),
 (('thirdLevelBrowseLinkClicked', '1.4'), 1),
 (('Smart Answer', 'Completed'), 2),
 (('relatedLinkClicked', '1.5 Related content'), 1),
 (('contentsClicked', 'content_item 3'), 2),
 (('breadcrumbClicked', '3'), 1),
 (('thirdLevelBrowseLinkClicked', '1.1'), 1),
 (('contentsClicked', 'content_item 2'), 3),
 (('relatedLinkClicked', '1.3 Related content'), 1),
 (('user_satisfaction_survey', 'banner_shown'), 2),
 (('relatedLinkClicked', '1.1 Related content'), 1),
 (('searchResults', 'resultsShown'), 2),
 (('UX', 'click'), 2),
 (('External Link Clicked',
   'https://pensioncreditcalculator.dwp.gov.uk/pension-credit-calculator.php?new'),
  1),
 (('contentsClicked', 'content_item 1'), 2),
 (('pageElementInteraction', 'stepNavShown'), 1),
 (('contentsClicked', 'next'), 1)]

In [94]:
def device_ratio(x):
    """Return the desktop-to-mobile count ratio for one journey.

    Parameters
    ----------
    x : iterable of (device, count) pairs, e.g. [('desktop', 3), ('mobile', 2)].
        Only the "desktop" and "mobile" entries are used; any other device
        (including "tablet") is ignored.

    Returns
    -------
    float
        desktop / mobile, or NaN when the mobile count is zero. NaN is used
        instead of a numeric sentinel so that summaries such as
        Series.describe() skip undefined ratios rather than treating them
        as real values.
    """
    mobile = 0
    desktop = 0
    for item, value in x:
        # Accumulate so repeated entries for a device are summed, matching
        # how device_count treats the same pairs.
        if item == "mobile":
            mobile += value
        elif item == "desktop":
            desktop += value
    if mobile == 0:
        return float("nan")
    return desktop / mobile

In [107]:
def percent_desktop(x):
    """Return the percentage of a journey's device counts that are desktop.

    Parameters
    ----------
    x : iterable of (device, count) pairs, e.g. [('desktop', 1), ('mobile', 3)].
        "mobile" and "tablet" are pooled as non-desktop; any other device
        name is ignored.

    Returns
    -------
    float
        100 * desktop / (desktop + mobile + tablet), or NaN when all three
        counts are zero (e.g. an empty list), where the percentage is
        undefined.
    """
    non_desktop = 0
    desktop = 0
    for item, value in x:
        if item in ("mobile", "tablet"):
            non_desktop += value
        elif item == "desktop":
            desktop += value
    total = non_desktop + desktop
    if total == 0:
        # Avoid ZeroDivisionError when no recognised device is present.
        return float("nan")
    return (desktop * 100) / total

In [168]:
def device_count(x,device):
    """Sum the counts of every (name, count) pair in x whose name equals device."""
    total = 0
    for name, count in x:
        if name == device:
            total += count
    return total

In [180]:
def more_device(x,device):
    """Return True when device is the entry of x with the largest count.

    x is a non-empty sequence of (name, count) pairs. On a tie, max() keeps
    the first of the tied entries, so the result depends on the order of x.
    """
    by_count = itemgetter(1)
    top_entry = max(x, key=by_count)
    return top_entry[0] == device

In [108]:
df_relc['DeviceRatio'] =  df_relc['DeviceCategories'].map(device_ratio)

In [141]:
df_relc['DeviceRatio'][0:10]

1     NaN
2     NaN
3     NaN
4     NaN
5     0.0
6     NaN
7     NaN
10    NaN
12    0.0
13    NaN
Name: DeviceRatio, dtype: float64

In [110]:
df_relc['DeviceRatio'].describe()

count    138316.000000
mean          0.059877
std           0.958298
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         154.333333
Name: DeviceRatio, dtype: float64

In [111]:
df_relc['PercentDesktop'] = df_relc['DeviceCategories'].map(percent_desktop)

In [117]:
df_relc.PercentDesktop.describe()

count    395777.000000
mean         53.967820
std          49.484192
min           0.000000
25%           0.000000
50%         100.000000
75%         100.000000
max         100.000000
Name: PercentDesktop, dtype: float64

In [170]:
df_relc["DesktopCount"] = df_relc['DeviceCategories'].map(lambda x: device_count(x,"desktop"))
df_relc["MobileCount"] = df_relc['DeviceCategories'].map(lambda x: device_count(x,"mobile"))                                                         

In [185]:
print(df_relc['DeviceCategories'].iloc[0])
more_device(df_relc['DeviceCategories'].iloc[0],"mobile")

[('desktop', 1)]


False

### Compute "more desktop-y" user journeys

In [186]:
df_relc["MoreDesktops"] = df_relc['DeviceCategories'].map(lambda x: more_device(x,"desktop"))

In [188]:
df_relc.shape

(395777, 26)

In [193]:
df_relc[df_relc["MoreDesktops"]].num_rel_cont.describe()

count    213739.000000
mean          1.335053
std           0.794163
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          47.000000
Name: num_rel_cont, dtype: float64

In [190]:
# Percentage of journeys where desktop is the most-used device, computed from
# the data instead of numbers copied from earlier outputs, which go stale.
(df_relc["MoreDesktops"].sum()*100)/df_relc.shape[0]

54.004906803578784

### Describe `DesktopCount` metric

In [171]:
df_relc["DesktopCount"].describe()

count    395777.000000
mean          0.648827
std           5.524488
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max        2087.000000
Name: DesktopCount, dtype: float64

#### Distribution of desktop use % and average number of related links clicked

In [162]:
# Collect the related-link click counts of every journey under its desktop
# percentage. The loop variable must not be named `pd`: that would rebind the
# pandas alias and break every later `pd.` call in the notebook.
rel_counts_by_pct = {}
for pct_desktop, num_rel in zip(df_relc['PercentDesktop'], df_relc['num_rel_cont']):
    rel_counts_by_pct.setdefault(pct_desktop, []).append(num_rel)

# Mean number of related links clicked for each distinct desktop percentage,
# rounded to 3 decimal places.
pd_num_rel = {
    pct_desktop: round(sum(counts)/len(counts), 3)
    for pct_desktop, counts in rel_counts_by_pct.items()
}

In [163]:
len(pd_num_rel)

438

In [164]:
for key in sorted([key for key in pd_num_rel.keys() if key <= 50],reverse=True)[0:20]:
    print(key,pd_num_rel[key])

50.0 1.084
49.438202247191015 1.0
49.056603773584904 1.0
48.38709677419355 1.0
48.148148148148145 1.0
47.916666666666664 1.0
47.82608695652174 1.0
47.61904761904762 1.0
47.36842105263158 1.0
46.666666666666664 1.143
46.42857142857143 1.0
46.34146341463415 1.0
46.26865671641791 1.0
46.15384615384615 1.0
45.714285714285715 1.0
45.669291338582674 1.0
45.6140350877193 1.0
45.45454545454545 1.083
45.16129032258065 1.0
45.0 1.0


In [167]:
for key,value in pd_num_rel.items():
    if value >= 2:
        print(key,value)
        

91.93548387096774 2.0
63.829787234042556 2.0
86.36363636363636 2.0
93.58974358974359 2.0
77.27272727272727 2.0
39.285714285714285 2.0
88.57142857142857 3.0
26.470588235294116 2.0


In [None]:
## Where are the max related links?