In [None]:
import os 
import pandas as pd
import numpy as np

In [None]:
# Show the current column-width display limit. The fully qualified option
# name is used because pandas warns that shorthand keys can break if a
# similarly named option is added in a later release.
pd.get_option('display.max_colwidth')

In [None]:
# Raise the per-cell display width to 500 characters (presumably so the long
# sequence strings are readable when frames are displayed). The fully
# qualified option name avoids pandas' fragile shorthand matching.
pd.set_option('display.max_colwidth', 500)

## File/dir locations


In [None]:
# The root data directory is supplied through the DATA_DIR environment variable.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # Fail fast with an actionable message; otherwise os.path.join(None, ...)
    # below raises an opaque TypeError.
    raise EnvironmentError(
        "The DATA_DIR environment variable is not set; "
        "point it at the directory that contains 'processed_journey'.")
filename = "preprocessed_taxon_pageseq_20190114_20190116.csv.gz"
# Alternative inputs derived from `filename`, currently disabled:
# df_file = os.path.join(DATA_DIR, "processed_journey", filename)
# df_reduced_file = os.path.join(DATA_DIR, "processed_journey", "reduced_"+filename)
# df_rel_file = os.path.join(DATA_DIR, "processed_journey", "rel_"+filename)
# Path of the file that is actually loaded in this notebook.
df_doo_file = os.path.join(
    DATA_DIR, "processed_journey",
    "doo_prelim_meta_standard_with_pageseq_from_29-10_to_04-11-2018.csv.gz")

Load up a data file that isn't too large - we just want to check that the putative metrics for analysis can be derived from the data. This data was produced by an early version of the pipeline and is missing some descriptive variables, such as taxons etc. However, it contains the sequences of pages and behaviours (or events) of users on those pages, including interaction with the sidebar and the related links contained therein.

In [None]:
# Read the gzip-compressed CSV at df_doo_file into a DataFrame.
df = pd.read_csv(df_doo_file, compression="gzip")

In [None]:
# Preview the first two rows to see which columns were loaded.
df.head(2)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

## Planning

Inspecting our putative metrics for detecting a change in user navigation experience gives us three ratios. Thus we need to check that we can get the numerator and denominator from the data for each of these metrics.


## journey_click_rate
Null hypothesis: there is no difference in the proportion of journeys using at least one related link (journey_click_rate) between page variant A and page variant B.



\begin{equation*}
\frac{\text{total number of journeys including at least one click on a related link}}{\text{total number of journeys}}
\end{equation*}

### total number of journeys including at least one click on a related link
The numerator.

We need to check, within the Sequence column, whether the corresponding user journey has an Event where a related link was clicked. There is more than one level to this Event; we are specifically interested in "Related content" (this is the sidebar of the page, which holds the related links we care about).

In [None]:
def is_related(x):
    """Return True when ``x`` contains both related-link markers.

    ``x`` (presumably one journey's Sequence string) is searched for
    "relatedLinkClicked" and for "Related content". Both must be present
    somewhere in ``x``; they are not required to sit next to each other.
    """
    return "relatedLinkClicked" in x and "Related content" in x

In [None]:
# Run is_related over each value of the Sequence column (which includes pages
# and Events) and keep the per-row result in a new column.
df["Has_Related"] = df["Sequence"].apply(is_related)

In [None]:
# Has_Related holds one True/False flag per row; look at the first five.
df["Has_Related"].head(5)

In [None]:
# Numerator: total Occurrences over the rows flagged by Has_Related.
df.loc[df["Has_Related"], "Occurrences"].sum()

In [None]:
# Sense check: eyeball the first few rows that pass the Has_Related filter.
df.loc[df["Has_Related"]].head(3)

### total number of journeys
The denominator.

In [None]:
# Denominator: total Occurrences across every row.
df["Occurrences"].sum()

Given this sample, we see:

In [None]:
# journey_click_rate for this sample: flagged Occurrences / all Occurrences.
df.loc[df["Has_Related"], "Occurrences"].sum() / df["Occurrences"].sum()

The above metric considers at least one related link clicked, but on some journeys more than one related link might get clicked. We can use the following columns to help us.

In [None]:
# First three values of the Event_cats_agg column.
df['Event_cats_agg'].iloc[:3]

In [None]:
# First three values of the Event_cat_act_agg column.
df['Event_cat_act_agg'].iloc[:3]

In [None]:
import ast

In [None]:
 # Event_List values are parsed with ast.literal_eval; show the first event
 # of the row labelled 1.
 ast.literal_eval(df['Event_List'][1])[0]

In [None]:
# Empty list that will hold the first field of each parsed event.
events = list()

In [None]:
# Collect the first field of every event across all rows of Event_List.
# The list is built in a single expression rather than appended to a list
# created in another cell, so re-running this cell no longer duplicates entries.
events = [
    event[0]
    for items in df['Event_List']
    for event in ast.literal_eval(items)
]

In [None]:
# Show only a small sample: the list has one entry per event in the data,
# so displaying all of it would flood the notebook output.
events[:10]

In [None]:
from collections import Counter

In [None]:
# Tally how many times each distinct value occurs in `events`.
Counter(events)

Nav events eventCategory:
- breadcrumbClicked
- homeLinkClicked
- searchResults
- relatedLinkClicked (eventAction e.g. 1.1 Explore the topic)

Related link events eventCategory:
- relatedLinkClicked (eventAction e.g. 1.3 Related content)

### count of related links clicks pageviews

In [None]:
# Take the Event_List value of one hard-coded row (index label 2034837) as a
# worked example for the click counting below.
example_event_list = df.loc[2034837, 'Event_List']

In [None]:
# Display the raw (still unparsed) example value.
example_event_list

In [None]:
# Count the events in the example whose joined fields contain both markers.
related_clicks = 0
for event in ast.literal_eval(example_event_list):
    if 'relatedLinkClicked' in ''.join(event) and 'Related content' in ''.join(event):
        related_clicks = related_clicks + 1
        

In [None]:
# Same count as the explicit loop, written as a single sum over the events.
sum(
    'relatedLinkClicked' in ''.join(event) and 'Related content' in ''.join(event)
    for event in ast.literal_eval(example_event_list)
)

In [None]:
# Display the count produced by the explicit loop.
related_clicks

In [None]:
def count_related_clicks(event_list):
    """Count the events in ``event_list`` that are related-link clicks.

    ``event_list`` is a string that ``ast.literal_eval`` can parse into an
    iterable of events, each an iterable of strings. An event is counted
    when its fields, joined without a separator, contain both
    "relatedLinkClicked" and "Related content".

    Returns the number of such events as an int.
    """
    related_total = 0
    for event in ast.literal_eval(event_list):
        event_text = ''.join(event)
        if 'relatedLinkClicked' in event_text and 'Related content' in event_text:
            related_total += 1
    return related_total

In [None]:
# Number of related-link click events in each row's Event_List.
df['Related Clicks Count'] = df['Event_List'].apply(count_related_clicks)

In [None]:
# Preview a few rows rather than making the whole frame the cell output.
df.head()

In [None]:
# Weight each row's click count by its Occurrences value.
df['absolute related links count'] = df['Related Clicks Count'].mul(df['Occurrences'])

In [None]:
# Preview the flagged rows (with the new count columns) instead of making
# every matching row the cell output.
df[df["Has_Related"]].head()

In [None]:
# Total of the Occurrences-weighted related-link click counts.
df['absolute related links count'].sum()

### total number of pageviews
**TODO:** Should this count only pageviews of pages that actually had related links on them?

In [None]:
# Number of entries in the parsed Page_List of one hard-coded row
# (index label 2034757).
len(ast.literal_eval(df.loc[2034757, 'Page_List']))

In [None]:
def number_of_pageviews(page_list):
    """Return how many entries the parsed ``page_list`` contains.

    ``page_list`` is a string that ``ast.literal_eval`` can parse into a
    sized container; its length is returned.
    """
    pages = ast.literal_eval(page_list)
    return len(pages)

In [None]:
# Length of each row's parsed Page_List.
df['pageviews'] = df['Page_List'].apply(number_of_pageviews)

In [None]:
# Weight each row's pageview count by its Occurrences value.
df['absolute pageviews count'] = df['pageviews'].mul(df['Occurrences'])

In [None]:
# Preview a few rows, including the new pageview columns, rather than
# making the whole frame the cell output.
df.head()