In [None]:
import os 
import pandas as pd
import numpy as np
import ast
import re
from statsmodels.stats.proportion import proportions_ztest

from scipy import stats
from collections import Counter

## File/dir locations


In [None]:
# Base data directory is read from the DATA_DIR environment variable.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # os.path.join(None, ...) would otherwise fail with an opaque TypeError
    raise RuntimeError(
        "DATA_DIR environment variable is not set; "
        "point it at the directory containing 'processed_journey'")
filename = "testing_processed_sampled_taxon_ab_2019-01-21.csv.gz"
filepath = os.path.join(
    DATA_DIR, "processed_journey",
    filename)
# display the resolved path so the reader can check which file is analysed
filepath

In [None]:
# load the processed sampled journeys (tab-separated, gzip-compressed), which
# hold just the cols we need for related links, and turn the Event_cat_act_agg
# strings back into Python lists
df = (
    pd.read_csv(filepath, sep="\t", compression="gzip")
    .assign(Event_cat_act_agg=lambda frame: frame['Event_cat_act_agg'].apply(ast.literal_eval))
)
# rows with a page variant other than A or B are dropped in the next cell


In [None]:
# drop dodgy rows, where page variant is not A or B.
# .copy() makes the filtered frame independent of the one it was selected
# from, so adding columns to df in later cells does not raise
# SettingWithCopyWarning.
df = df.query('ABVariant in ["A", "B"]').copy()

## Outliers
Some rows should be removed before analysis, for example rows with journey lengths of 500 or with very high related link click rates.

## journey_click_rate
Null hypothesis: there is no difference in the proportion of journeys using at least one related link (journey_click_rate) between page variant A and page variant B. The journey_click_rate is defined as:



\begin{equation*}
\frac{\text{total number of journeys including at least one click on a related link}}{\text{total number of journeys}}
\end{equation*}

### Prepare features

In [None]:
def get_number_of_events_rl(event):
    """Return the count of a related link click event, or 0 for any other event.

    `event` is indexed as ``((category, action), count)``. The count
    (``event[1]``) is returned only when the category equals
    'relatedLinkClicked' and the action contains 'Related content'.
    """
    key = event[0]
    if key[0] != 'relatedLinkClicked':
        return 0
    if 'Related content' not in key[1]:
        return 0
    return event[1]


def sum_related_click_events(event_list):
    """Add up the related link click counts of every event in `event_list`.

    Each element is passed to `get_number_of_events_rl`, which contributes 0
    for events that are not related link clicks. An empty list gives 0.
    """
    total = 0
    for event in event_list:
        total = total + get_number_of_events_rl(event)
    return total


def is_related(x):
    """Tell whether a related link click count `x` is positive.

    Returns True when `x` is greater than zero, i.e. the journey includes at
    least one related link click, and False otherwise.
    """
    has_click = x > 0
    return has_click

In [None]:
# total related link click events for each row, summed over the row's
# aggregated (category, action) event list
df['Related Links Clicks per seq'] = df['Event_cat_act_agg'].apply(sum_related_click_events)

In [None]:
# pass each row's related link click count to is_related one-by-one and
# collect the output: True when the journey has at least one such click
df["Has_Related"] = df["Related Links Clicks per seq"].apply(is_related)

In [None]:
# total number of journeys: every row is counted Occurrences times
n = df.Occurrences.sum()
# prop of journeys with at least one related link; successes are weighted by
# Occurrences so the numerator is counted in the same units as n
# (an unweighted Has_Related.sum() counts rows, not journeys)
p = (df.Has_Related * df.Occurrences).sum() / n

In [None]:
# display the overall proportion of journeys with at least one related link click
p

In [None]:
# using Bernoulli trial terminology
# total occurrences, both A and B
# assume non- A and B were dropped
n = df.Occurrences.sum()
# a success is a journey with at least one related link click; trials are
# counted as the sum of Occurrences, so successes are weighted by Occurrences
# too (an unweighted Has_Related.sum() would count rows, not journeys)
successes = df.Has_Related * df.Occurrences
# prop of journeys with at least one related link
p = successes.sum() / n

assert (p >= 0),"Prop less than zero!"
assert (p <= 1),"Prop greater than one!"

# row masks for each page variant
is_variant_a = df.ABVariant == "A"
is_variant_b = df.ABVariant == "B"

# number of trials for page A
n_a = df.loc[is_variant_a, "Occurrences"].sum()
# number of successes for page A, at least one related link clicked
x_a = successes[is_variant_a].sum()
# prop of journeys where one related link was clicked, on A
p_a = x_a / n_a


# number of trials for page B
n_b = df.loc[is_variant_b, "Occurrences"].sum()
# number of successes for page B, at least one related link clicked
x_b = successes[is_variant_b].sum()
# prop of journeys where one related link was clicked, on B
p_b = x_b / n_b

# every trial must belong to exactly one of the two variants
assert (n == n_a + n_b), "Error in filtering by ABVariant!"

### Frequentist statistics

#### Statistical significance

In [None]:
# help(proportions_ztest)

In [None]:
# two-sample z-test for proportions, using statsmodels
# successes and number of trials per variant, both in the order [A, B]
count = np.array([x_a, x_b])
nobs = np.array([n_a, n_b])
# two-sided test of the hypothesis that the difference in proportions is 0
z, p_value = proportions_ztest(
    count=count,
    nobs=nobs,
    value=0,
    alternative='two-sided',
)
print(' z-stat = {z} \n p-value = {p_value}'.format(z=z, p_value=p_value))

#### Practical significance - uplift

In [None]:
# uplift
def compute_standard_error_prop_two_samples(x_a, n_a, x_b, n_b, alpha=0.05):
    """Standard error of the difference between two sample proportions.

    Parameters: `x_a`, `x_b` are the numbers of successes and `n_a`, `n_b`
    the numbers of trials for each sample. `alpha` is accepted but not used
    by the computation.

    Returns sqrt(p_a*(1-p_a)/n_a + p_b*(1-p_b)/n_b), i.e. the unpooled
    standard error, where p_a = x_a/n_a and p_b = x_b/n_b.
    """
    rate_a = x_a / n_a
    rate_b = x_b / n_b
    # variance contributed by each sample proportion
    variance_a = rate_a * (1 - rate_a) / n_a
    variance_b = rate_b * (1 - rate_b) / n_b
    return np.sqrt(variance_a + variance_b)
    
def zconf_interval_two_samples(x_a, n_a, x_b, n_b, alpha=0.05):
    """Normal-approximation confidence interval for the uplift p_b - p_a.

    Parameters: `x_a`, `x_b` are the numbers of successes and `n_a`, `n_b`
    the numbers of trials for each sample. `alpha` is the two-sided
    significance level, so the interval has confidence level 1 - alpha.

    Returns a `(lower, upper)` tuple centred on x_b/n_b - x_a/n_a.
    """
    rate_a = x_a / n_a
    rate_b = x_b / n_b
    standard_error = compute_standard_error_prop_two_samples(x_a, n_a, x_b, n_b)
    # two-sided critical value of the standard normal distribution
    half_width = stats.norm.ppf(1 - 0.5 * alpha) * standard_error
    difference = rate_b - rate_a
    return difference - half_width, difference + half_width




In [None]:
# Due to multiple testing we used the Bonferroni correction for alpha
ci_alpha = 0.01
ci_low,ci_upp = zconf_interval_two_samples(x_a, n_a,
                                           x_b, n_b, alpha = ci_alpha)
# the confidence level in the label is derived from ci_alpha: alpha = 0.01
# gives a 99% interval, which a hard-coded "95%" label would misreport
print(' {0:g}% Confidence Interval = ( {1:.2f}% , {2:.2f}% )'
      .format(100*(1 - ci_alpha), 100*ci_low, 100*ci_upp))

### Bayesian statistics 

In [None]:
# one entry per trial: 1 for a success, 0 for a failure
# control is page variant A, treatment is page variant B
control = np.asarray([1] * x_a + [0] * (n_a - x_a))
treatment = np.asarray([1] * x_b + [0] * (n_b - x_b))

# observed success proportion of each group, keyed as 'p_C' (control) and
# 'p_T' (treatment)
start = {
    'p_C': control.sum() / len(control),
    'p_T': treatment.sum() / len(treatment),
}

To be developed: a Bayesian approach can provide a simpler interpretation.

## ratio of clicks on navigation elements vs. clicks on related links

There is no statistically significant difference in the ratio of clicks on navigation elements vs. clicks on related links between page variant A and page variant B.

\begin{equation*}
\frac{\text{total number of navigation element click events from content pages}}{\text{total number of related link click events}}
\end{equation*}