In [2]:
import os 
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt

from collections import Counter

In [3]:
%matplotlib inline

In [4]:
# Some of the columns we will look at can be quite wide, but it's good to get an idea of what they contain
print(pd.get_option('max_colwidth'))
pd.set_option('max_colwidth',500)

50


## File/dir locations


In [5]:
DATA_DIR = os.getenv("DATA_DIR")
filename = "preprocessed_taxon_pageseq_20190114_20190116.csv.gz"
# df_file = os.path.join(DATA_DIR, "processed_journey", filename)
# df_reduced_file = os.path.join(DATA_DIR, "processed_journey", "reduced_"+filename)
# df_rel_file = os.path.join(DATA_DIR, "processed_journey", "rel_"+filename)
# df_doo_file = os.path.join(
#     DATA_DIR, "processed_journey",
#     "doo_prelim_meta_standard_with_pageseq_from_29-10_to_04-11-2018.csv.gz")

df_dlo_file = os.path.join(
    DATA_DIR, "processed_journey",
    "dlo_prelim_meta_standard_with_pageseq_from_29-10_to_04-11-2018.csv.gz")
df_kloo_file = os.path.join(
    DATA_DIR, "processed_journey",
    "kloo_prelim_meta_standard_with_pageseq_from_29-10_to_04-11-2018.csv.gz")

In [6]:
#the 'drop length one' data read into pandas dataframe
dlo = pd.read_csv(df_dlo_file, compression='gzip')
#the 'keep length one only' data read into pandas dataframe
kloo = pd.read_csv(df_kloo_file, compression='gzip')

In [7]:
dlo.shape

(3788851, 15)

In [8]:
kloo.shape

(890977, 15)

Load up a data from two files: dlo = drop length one journeys, kloo = keep length one journeys only 

This data was produced by an early version of the pipeline and is missing some descriptive variables, such as taxons etc. However, it contains the sequences of pages and behaviours (or events) of users on those pages, including interaction with the sidebar and the related links contained therein.

In [None]:
#get a reproducible sample of 20% of journey types from each dataframe, 
#sampled in proportion to the number of occurrences of each journey type
#then join the new samples together into a single dataframe

# df = pd.concat([dlo.sample(frac=0.2, random_state=1234, weights=dlo.Occurrences).copy(), kloo.sample(frac=0.2, random_state=1234, weights=kloo.Occurrences).copy()], ignore_index=True)


# try sampling with replacement, using occurrences as weights, but then 
# change all "occurrences" to 1, to try to create a more representative sample?
# df = pd.concat([
#     dlo.sample(
#         frac=0.4, random_state=1234, weights=dlo.Occurrences, replace=True
#     ).copy(),
#     kloo.sample(
#          frac=0.4, random_state=1234, weights=kloo.Occurrences, replace=True
#     ).copy()],
#     ignore_index=True)

# try  concatting and THEN sampling with replacement, using occurrences as
# weights, but then change all "occurrences" to 1, to try to create a more 
# representative sample?
df = pd.concat([
    dlo.copy(),
    kloo.copy()],
    ignore_index=True)
df = df.sample(
        frac=0.4, random_state=1234, weights=df.Occurrences, replace=True
    )

# # try  concatting and THEN sampling without replacement, using occurrences as
# # weights
# df = pd.concat([
#     dlo.copy(),
#     kloo.copy()],
#     ignore_index=True)
# df = df.sample(
#         frac=0.4, random_state=1234, weights=df.Occurrences
#     )

In [None]:
df.shape

## Remove tablet occurrences

In [154]:
def device_count(x, device):
    return sum([value for item, value in x if item == device])
df["TabletCount"] = df['DeviceCategories'].apply(
    ast.literal_eval).map(lambda x: device_count(x, "tablet"))
df["Occurrences"] = df["Occurrences"] - df["TabletCount"]

In [None]:
df = df[df["Occurrences"] != 0]
df.shape

In [117]:
# MAKE EACH OCCURRENCES 1
# df['Occurrences'] = 1

## journey_click_rate
There is no difference in the proportion of journeys using at least one related link (journey_click_rate) between page variant A and page variant B.



\begin{equation*}
\frac{\text{total number of journeys including at least one click on a related link}}{\text{total number of journeys}}
\end{equation*}

### total number of journeys including at least one click on a related link
The numerator.

We need to check within the Sequence column, whether the corresponding user journey has an Event where a related link was clicked. There is more than one level to this Event, we are specifically interested in "Related content" (as this is the sidebar of the page, the related links we are interested in).

In [149]:
#Compute whether a journey includes at least one related link click
def is_related(x):
    return all(cond in x for cond in ["relatedLinkClicked","Related content"])

Please note, `is_related` does not make sure that `relatedLinkClicked` and `Related content` exist in the same event in `Sequence`

In [150]:
# map across the Sequence variable, which includes pages and Events
# we want to pass all the list elements to a function one-by-one and then collect the output.
df["Has_Related"] = df["Sequence"].map(is_related)

In [151]:
# We can filter for True and sum
df[df["Has_Related"]].Occurrences.sum()

-5977143

### total number of journeys
The denominator.

In [123]:
df.Occurrences.sum()

1820703

### final metric

Given this sample, we see:

In [124]:
df[df["Has_Related"]].Occurrences.sum() / df.Occurrences.sum()

0.029691278588545193

## ratio of clicks on navigation elements vs. clicks on related links

There is no statistically significant difference in the ratio of clicks on navigation elements vs. clicks on related links between page variant A and page variant B

\begin{equation*}
\frac{\text{total number of navigation element click events from content pages}}{\text{total number of related link click events}}
\end{equation*}

### total number of related link click events

we need to check `Related content` is in the event, because the `relatedLinkClicked` is also used for "explore the topic" links at the bottom of the page, with the event action containing `Explore the topic`, e.g. `(('relatedLinkClicked', '2.1 Explore the topic'), 1)`

In [127]:
# If the event category is 'relatedLinkClicked' and the event action contains 'Related content', 
# return the count of that event
def get_number_of_events_rl(event):
    if event[0][0] == 'relatedLinkClicked' and 'Related content' in event[0][1]:
        return event[1]
    return 0

def sum_related_click_events(event_list):
    return sum([get_number_of_events_rl(event) for event in event_list])

In [129]:
# get the number of related links clicks per Sequence
df['Related Links Clicks per seq'] = df['Event_cat_act_agg'].apply(
    ast.literal_eval).map(sum_related_click_events)

# get the total number of related links clicks for that row (clicks per sequence multiplied by occurrences)
df['Related Links Clicks row total'] = df['Related Links Clicks per seq'] * df['Occurrences']

In [131]:
df['Related Links Clicks row total'].sum()

68400