In [1]:
import os 
import pandas as pd
import numpy as np
from ast import literal_eval
from collections import Counter
from operator import itemgetter

### File/dir locations

In [3]:
# Root data directory comes from the environment so no local path is hard-coded.
DATA_DIR = os.getenv("DATA_DIR")
if not DATA_DIR:
    # os.getenv returns None when the variable is unset, which would otherwise
    # surface as an opaque TypeError inside os.path.join below.
    raise EnvironmentError("DATA_DIR environment variable must be set to the data root directory")
filename = "preprocessed_taxon_pageseq_20190114_20190116.csv.gz"
# Input journey file and its "rel_"-prefixed sibling, both under processed_journey/.
df_file = os.path.join(DATA_DIR, "processed_journey", filename)
df_rel_file = os.path.join(DATA_DIR, "processed_journey", "rel_" + filename)

### Load up unfiltered data: 14-16/01/19

In [6]:
# Read the gzip-compressed, tab-separated journey file into a DataFrame,
# then display its (rows, columns) shape as the cell output.
df = pd.read_csv(
    df_file,
    compression="gzip",
    sep="\t",
)
df.shape

(5048130, 17)

In [7]:
# List the column names of the freshly loaded frame.
df.columns

Index(['Occurrences', 'DeviceCategories', 'PageSeq_Length', 'Actions_Length',
       'Dates', 'Sequence', 'PageSequence', 'Page_Event_List', 'Page_List',
       'Event_List', 'num_event_cats', 'Event_cats_agg', 'Event_cat_act_agg',
       'Taxon_List', 'Taxon_Page_List', 'Page_List_NL', 'Page_Seq_NL'],
      dtype='object')

Original dataset shape (5048130, 17)

In [8]:
# Drop the columns this analysis does not use. Reassigning (instead of
# inplace=True) together with errors="ignore" keeps the cell re-runnable:
# a second execution no longer raises KeyError for already-removed columns.
unused_columns = ['Dates', 'Page_List', 'Event_List', 'Taxon_List', 'Taxon_Page_List', 'Page_List_NL']
df = df.drop(columns=unused_columns, errors="ignore")

In [9]:
# Persist the reduced frame as a gzip-compressed CSV without the index.
# The path is relative, so the file lands in the current working directory.
# NOTE(review): this writes with the default comma separator, whereas the
# input file was read with sep="\t" — confirm that downstream readers expect this.
df.to_csv(
    "reduced_" + filename,
    index=False,
    compression="gzip",
)

In [10]:
# Total traffic volume: the sum of the Occurrences column over all journeys.
df["Occurrences"].sum()

15053255

In [13]:
def column_eval(cols, frame=None):
    """Parse stringified Python literals in the given columns back into objects.

    For each column whose first value is not already a ``list``, the column
    name is printed and every value is replaced by ``literal_eval(value)``.
    Only the first row is inspected to decide whether a column still needs
    parsing, so calling this twice on the same columns is a no-op the second time.

    Parameters
    ----------
    cols : iterable of str
        Names of the columns to convert.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df`` so
        existing ``column_eval(columns)`` calls keep working.

    Returns
    -------
    None
        The frame is mutated in place.
    """
    if frame is None:
        # Fall back to the notebook-level frame, as the original implementation did.
        frame = df
    if frame.empty:
        # Nothing to convert; also avoids IndexError from .iloc[0] below.
        return
    for column in cols:
        if not isinstance(frame[column].iloc[0], list):
            print(column)
            frame[column] = frame[column].map(literal_eval)

### Out of all journeys, what is the % (volume, sum of Occurrences) of journeys including "related content" link clicks?
Previous analysis suggested roughly 10%, but let's verify that against this dataset.

In [18]:
# Shape of the subset of journeys whose Sequence contains both the
# "Related content" label and a "relatedLinkClicked" event.
has_related_label = df["Sequence"].str.contains("Related content")
has_related_click = df["Sequence"].str.contains("relatedLinkClicked")
df[has_related_label & has_related_click].shape

(395777, 11)

In [None]:
# df_relc = df_rel[df_rel.Sequence.str.contains("Related content")].copy(deep=True)
## Sum of occurrences of journeys including "related content" link clicks (excluding "explore this topic ones")
# relc_shape = df_relc.shape[0] = 395777

In [19]:
# Percentage of total traffic volume (sum of Occurrences) that belongs to
# journeys containing both "Related content" and "relatedLinkClicked".
# The numerator was previously hard-coded as 473975, a figure that only exists
# after a later cell has run; deriving it here keeps the cell correct on a
# fresh top-to-bottom run and for any other input file.
related_mask = (
    df["Sequence"].str.contains("Related content")
    & df["Sequence"].str.contains("relatedLinkClicked")
)
related_volume = df.loc[related_mask, "Occurrences"].sum()
round((related_volume * 100) / df["Occurrences"].sum(), 2)

3.15

In [21]:
# Columns that are converted from their string form via literal_eval.
columns = [
    'Page_Event_List',
    'DeviceCategories',
    'Event_cats_agg',
    'Event_cat_act_agg',
]
column_eval(columns)

Page_Event_List
DeviceCategories
Event_cats_agg
Event_cat_act_agg


TODO: run the cell below at some point to compare the device counts of the full dataset against the "related content" link-specific dataset.
Early indications are that there are no major differences.

In [None]:
# og_df_devices = Counter()
# for item in df.DeviceCategories.values:
#     for key,value in item:
#         og_df_devices[key]+=value

### Out of all journeys coming from mobiles vs desktops, what is the % (volume, sum of Occurrences) of journeys including "related content" link clicks?

In [14]:
def more_device(x, device):
    """Return True when ``device`` has the highest count in ``x``.

    Parameters
    ----------
    x : sequence of (device_name, count) pairs
        For example ``[("desktop", 3), ("mobile", 1)]``.
    device : str
        Device name to compare against the most frequent entry.

    Returns
    -------
    bool
        True if the pair with the largest count is for ``device``. On ties,
        ``max`` keeps the first pair encountered. An empty ``x`` has no
        dominant device, so False is returned instead of letting ``max``
        raise ValueError.
    """
    if not x:
        return False
    return max(x, key=itemgetter(1))[0] == device
def device_count(x, device):
    """Add up the counts of every (name, count) pair in ``x`` whose name equals ``device``.

    Returns 0 when no pair matches or when ``x`` is empty.
    """
    total = 0
    for name, count in x:
        if name == device:
            total += count
    return total

In [22]:
# Derive one per-journey count column for each device of interest.
for device_name, count_column in (("desktop", "DesktopCount"), ("mobile", "MobileCount")):
    # Bind device_name as a default argument so each lambda keeps its own value.
    df[count_column] = df["DeviceCategories"].map(
        lambda categories, wanted=device_name: device_count(categories, wanted)
    )

In [23]:
# Summary statistics of the per-journey desktop counts.
df["DesktopCount"].describe()

count    5.048130e+06
mean     1.544602e+00
std      1.329323e+02
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.486640e+05
Name: DesktopCount, dtype: float64

### Traffic volume

In [None]:
### Test implementation

In [29]:
# First journey whose Sequence contains both "Related content" and
# "relatedLinkClicked"; used below as a positive example.
related = df.loc[
    df["Sequence"].str.contains("Related content")
    & df["Sequence"].str.contains("relatedLinkClicked")
].iloc[0]

In [37]:
# First journey that has a "relatedLinkClicked" event but does NOT contain
# "Related content"; used below as a negative example.
explore = df.loc[
    df["Sequence"].str.contains("relatedLinkClicked")
    & ~df["Sequence"].str.contains("Related content")
].iloc[0]

In [30]:
# Display the sampled journey row that contains a "Related content" link click.
related

Occurrences                                                          1
DeviceCategories                                        [(desktop, 1)]
PageSeq_Length                                                      38
Actions_Length                                                      73
Sequence             /browse/working/state-pension<<PAGE<:<NULL<:<N...
PageSequence         /browse/working/state-pension>>/state-pension-...
Page_Event_List      [(/browse/working/state-pension, PAGE<:<NULL<:...
num_event_cats                                                      12
Event_cats_agg       [(PAGE_NULL, 38), (secondLevelBrowseLinkClicke...
Event_cat_act_agg    [((PAGE_NULL, PAGE_NULL), 38), ((secondLevelBr...
Page_Seq_NL          /browse/working/state-pension>>/state-pension-...
DesktopCount                                                         1
MobileCount                                                          0
Name: 4, dtype: object

In [38]:
# Display the sampled journey row with relatedLinkClicked but no "Related content".
explore

Occurrences                                                          1
DeviceCategories                                        [(desktop, 1)]
PageSeq_Length                                                      19
Actions_Length                                                      36
Sequence             /limited-company-formation<<PAGE<:<NULL<:<NULL...
PageSequence         /limited-company-formation>>/government/public...
Page_Event_List      [(/limited-company-formation, PAGE<:<NULL<:<NU...
num_event_cats                                                       7
Event_cats_agg       [(PAGE_NULL, 19), (user_satisfaction_survey, 2...
Event_cat_act_agg    [((PAGE_NULL, PAGE_NULL), 19), ((user_satisfac...
Page_Seq_NL          /limited-company-formation>>/government/public...
DesktopCount                                                         1
MobileCount                                                          0
Name: 1, dtype: object

In [40]:
# Inspect the aggregated ((event category, event action), count) pairs of the sampled row.
explore.Event_cat_act_agg

[(('PAGE_NULL', 'PAGE_NULL'), 19),
 (('user_satisfaction_survey', 'banner_shown'), 2),
 (('External Link Clicked', 'http://resources.companieshouse.gov.uk/sic'), 3),
 (('Download Link Clicked',
   '/government/uploads/system/uploads/attachment_data/file/527619/SIC07_CH_condensed_list_en.csv/preview'),
  1),
 (('contentsClicked', 'content_item 3'), 1),
 (('contentsClicked', 'previous'), 1),
 (('contentsClicked', 'next'), 4),
 (('contentsClicked', 'content_item 4'), 1),
 (('relatedLinkClicked', '1.1 Collection'), 1),
 (('navDocumentCollectionLinkClicked', '2.1'), 1),
 (('Download Link Clicked',
   '/government/uploads/system/uploads/attachment_data/file/298358/pro_forma_of_ca_2006_memorandum_for_a_company_without_a_share_capital.pdf'),
  1),
 (('External Link Clicked',
   'https://www.tax.service.gov.uk/register-your-company/setting-up-new-limited-company'),
  1)]

In [None]:
#### Function to compute whether a journey includes a related link click

In [41]:
def is_related(x):
    """Tell whether ``x`` contains both "relatedLinkClicked" and "Related content".

    Both markers must be present; membership is tested with ``in``.
    """
    return "relatedLinkClicked" in x and "Related content" in x

In [42]:
# Sanity-check is_related on known rows. Always pass the Sequence string:
# applying `in` to a whole row (a pandas Series) tests its index labels rather
# than the journey text, so the previous is_related(explore) call did not
# exercise the function on the intended input.
print(is_related(related.Sequence))
print(is_related(df.Sequence.iloc[0]))
print(is_related(explore.Sequence))

True
False
False


In [43]:
# Flag each journey according to is_related applied to its Sequence value.
df["Has_Related"] = [is_related(sequence) for sequence in df["Sequence"]]

In [45]:
# Traffic volume (sum of Occurrences) of the journeys flagged as Has_Related.
df.loc[df["Has_Related"], "Occurrences"].sum()

473975

### Journeys per device
Counter({'desktop': 256791, 'tablet': 55546, 'mobile': 161638})

In [56]:
# Desktop volume: the DesktopCount column summed over all journeys.
vol_desk = df.DesktopCount.sum()

In [55]:
# Mobile volume: the MobileCount column summed over all journeys.
vol_mobile = df.MobileCount.sum()

In [46]:
# Independent (deep) copies of the journeys seen at least once on each device.
desktop_journeys = df.loc[df["DesktopCount"].gt(0)].copy()
mobile_journeys = df.loc[df["MobileCount"].gt(0)].copy()

In [53]:
# Desktop volume restricted to journeys flagged as Has_Related.
vol_desk_rel = desktop_journeys.loc[desktop_journeys["Has_Related"], "DesktopCount"].sum()

In [54]:
# Mobile volume restricted to journeys flagged as Has_Related.
vol_mobile_rel = mobile_journeys.loc[mobile_journeys["Has_Related"], "MobileCount"].sum()

In [59]:
# Percentage of desktop volume that comes from Has_Related journeys.
round(100 * vol_desk_rel / vol_desk, 2)

3.29

In [58]:
# Percentage of mobile volume that comes from Has_Related journeys.
round(100 * vol_mobile_rel / vol_mobile, 2)

2.67