## Import

In [None]:
import os 
import pandas as pd
import numpy as np
import ast
import re

# z test
from statsmodels.stats.proportion import proportions_ztest

# bayesian bootstrap and vis
import matplotlib.pyplot as plt
import seaborn as sns
import bayesian_bootstrap.bootstrap as bb
from astropy.utils import NumpyRNGContext

# progress bar
from tqdm import tqdm, tqdm_notebook

from scipy import stats
from collections import Counter

# set up the style for our plots
# (applied globally via seaborn, so every figure in the notebook picks it up)
sns.set(style='white', palette='colorblind', font_scale=1.3,
        rc={'figure.figsize':(12,9), 
            "axes.facecolor": (0, 0, 0, 0)})

# instantiate progress bar goodness
# (the progress_apply / progress_map calls used further down rely on this)
tqdm.pandas(tqdm_notebook)

# allow long cell contents (up to 500 characters) when displaying frames
pd.set_option('max_colwidth',500)

# the number of bootstrap means used to generate a distribution
# (passed as boot_reps / n_replications to the bootstrap calls below)
boot_reps = 10000

# alpha
# alpha = 0.05
# n_tests = 4
# Correct alpha for multiple comparisons
# alpha = alpha / n_tests
# NOTE(review): the block above is disabled; the confidence-interval cells
# below pass their own literal alpha instead -- confirm which value is intended

# reproducible
# (seed handed to the bayesian bootstrap analysis further down)
seed = 1337

## File/dir locations
### Processed journey data

In [None]:
# Locate the processed, sampled journey file: $DATA_DIR/sampled_journey/<filename>
DATA_DIR = os.getenv("DATA_DIR")
filename = "testing_processed_sampled_taxon_ab_2019-01-21.csv.gz"
filepath = os.path.join(DATA_DIR, "sampled_journey", filename)
filepath

In [None]:
# read in the processed sampled journey data (tab separated, gzip compressed)
df = pd.read_csv(filepath, sep ="\t", compression="gzip")
# these columns are stored as the string form of Python lists; parse each
# back into a real list
for list_col in ('Event_cat_act_agg', 'Page_Event_List', 'Page_List'):
    df[list_col] = df[list_col].progress_apply(ast.literal_eval)

In [None]:
df['Page_List_Length'] = df['Page_List'].progress_apply(len)


In [None]:
# drop dodgy rows, where page variant is not A or B.
# Take an explicit copy: new columns are assigned to `df` further down, and
# assigning to a filtered selection is what triggers pandas'
# SettingWithCopyWarning.
df = df.query('ABVariant in ["A", "B"]').copy()

### Nav type of page lookup - is it a finding page? if not it's a thing page

In [None]:
# NOTE: this re-binds the notebook-level `filename` to the lookup file
filename = "document_types.csv.gz"

# created a metadata dir in the DATA_DIR to hold this data
filepath = os.path.join(DATA_DIR, "metadata", filename)
print(filepath)

# one row per page path, with an `is_finding` flag (used in the next cell)
df_finding_thing = pd.read_csv(filepath, sep="\t", compression="gzip")

df_finding_thing.head()

In [None]:
# Page paths are held in sets because they are only used for membership tests
# (`path in thing_page_paths`) inside functions applied to every journey row:
# a set lookup is O(1), whereas a list is scanned end to end on each test.

# "thing" (content) pages: is_finding == 0
thing_page_paths = set(
    df_finding_thing.loc[df_finding_thing['is_finding'] == 0, 'pagePath'])

# "finding" (navigation) pages: is_finding == 1
finding_page_paths = set(
    df_finding_thing.loc[df_finding_thing['is_finding'] == 1, 'pagePath'])

## Outliers
Some rows should be removed before analysis, for example rows with journey lengths of 500 or with very high related link click rates. This may have to happen after the features have been created.

## journey_click_rate
There is no difference in the proportion of journeys using at least one related link (journey_click_rate) between page variant A and page variant B.



\begin{equation*}
\frac{\text{total number of journeys including at least one click on a related link}}{\text{total number of journeys}}
\end{equation*}

### Prepare features

#### Related link prep

In [None]:
def get_number_of_events_rl(event):
    """Return the count of `event` if it is a related link click, else 0.

    `event` is indexed as ((category, action), count). The count is returned
    when the category equals 'relatedLinkClicked' and the action contains
    'Related content'.
    """
    category_action = event[0]
    if category_action[0] != 'relatedLinkClicked':
        return 0
    if 'Related content' not in category_action[1]:
        return 0
    return event[1]


def sum_related_click_events(event_list):
    """Total the related link click counts over every event in `event_list`."""
    total_clicks = 0
    for event in event_list:
        total_clicks += get_number_of_events_rl(event)
    return total_clicks


def is_related(x):
    """Tell whether the related link click count `x` of a journey is positive."""
    has_related_click = x > 0
    return has_related_click

In [None]:
# get the number of related links clicks per Sequence
df['Related Links Clicks per seq'] = df['Event_cat_act_agg'].map(sum_related_click_events)

In [None]:
# flag each journey row that has at least one related link click
# (is_related is applied element-wise to the per-sequence click counts)
df["Has_Related"] = df["Related Links Clicks per seq"].map(is_related)

# total related link clicks for the row: clicks per sequence multiplied by occurrences
df['Related Links Clicks row total'] = df['Related Links Clicks per seq'] * df['Occurrences']


In [None]:
df.head(3)

### Frequentist statistics

In [None]:
def z_prop(df, col_name):
    """
    Compare the success proportion of ABVariant A and B with a z test.

    In Bernoulli trial terms, n is the number of trials (summed Occurrences)
    and x is the number of successes (summed Occurrences of the rows where
    `col_name` equals 1); p is x / n.

    Args:
        df: DataFrame with `ABVariant`, `Occurrences` and `col_name` columns.
        col_name: name of the success indicator column.

    Returns:
        dict holding the metric name, the method name, x / n / p pooled over
        A and B ('x_ab', 'n_ab', 'p') and per variant, the z 'test_statistic'
        and the two-sided 'p-value'.

    Raises:
        AssertionError: if p is outside [0, 1], if the A and B trial counts
            do not add up to the overall count, or if the large-sample
            assumptions of the test do not hold.
    """
    occurrences = df.Occurrences
    is_success = df[col_name] == 1

    # A & B pooled: number of trials and overall proportion of successes
    n = occurrences.sum()
    p = occurrences[is_success].sum() / n

    assert (p >= 0), "Prop less than zero!"
    assert (p <= 1), "Prop greater than one!"

    # page A: trials, successes and proportion of successes
    is_a = df.ABVariant == "A"
    n_a = occurrences[is_a].sum()
    x_a = occurrences[is_a & is_success].sum()
    p_a = x_a / n_a

    # page B: trials, successes and proportion of successes
    is_b = df.ABVariant == "B"
    n_b = occurrences[is_b].sum()
    x_b = occurrences[is_b & is_success].sum()
    p_b = x_b / n_b

    assert (n == n_a + n_b), "Error in filtering by ABVariant!"

    # The z statistic is only valid when the samples are large enough:
    # n*p and n*q (q = 1 - p, p being the pooled proportion) must both be
    # >= 5 for each variant.
    for n_variant in (n_a, n_b):
        assert (n_variant * p) >= 5, "Assumptions for z prop test invalid!"
        assert (n_variant * (1 - p)) >= 5, "Assumptions for z prop test invalid!"

    # statsmodels takes the successes and the trials as parallel arrays
    successes = np.array([x_a, x_b])
    trials = np.array([n_a, n_b])
    z, p_value = proportions_ztest(successes, trials, value=0, alternative='two-sided')

    return {
        'metric_name': col_name, 'stats_method': 'z_prop_test',
        'x_ab': x_a + x_b, 'n_ab': n, 'p': p,
        'x_a': x_a, 'n_a': n_a, 'p_a': p_a,
        'x_b': x_b, 'n_b': n_b, 'p_b': p_b,
        'test_statistic': z, 'p-value': p_value,
    }

#### Statistical significance

In [None]:
# help(proportions_ztest)

In [None]:
has_rel = z_prop(df, 'Has_Related')
has_rel

#### Practical significance - uplift

In [None]:
# uplift
def compute_standard_error_prop_two_samples(x_a, n_a, x_b, n_b):
    """
    Standard error of the difference between two independent sample proportions.

    The variance of a sample proportion p out of n trials is p * (1 - p) / n.
    For two independent samples the variance of the difference is the sum of
    the two variances (each sample adds its own sampling error), and the
    standard error is the square root of that sum.

    x_a, x_b are the numbers of successes and n_a, n_b the numbers of trials.
    """
    prop_a = x_a / n_a
    prop_b = x_b / n_b
    variance_a = prop_a * (1 - prop_a) / n_a
    variance_b = prop_b * (1 - prop_b) / n_b
    return np.sqrt(variance_a + variance_b)
    
def zconf_interval_two_samples(x_a, n_a, x_b, n_b, alpha=0.05):
    """
    Lower and upper bound of a 100 * (1 - alpha)% confidence interval for p_b - p_a.

    The interval is the observed difference in proportions plus or minus the
    two-sided normal critical value times the standard error of the
    difference between the two independent proportions.

    If the interval contains zero, the difference is not statistically
    significant at the chosen alpha (the null hypothesis is not rejected).
    """
    prop_a = x_a / n_a
    prop_b = x_b / n_b
    difference = prop_b - prop_a
    margin = (stats.norm.ppf(1 - 0.5 * alpha)
              * compute_standard_error_prop_two_samples(x_a, n_a, x_b, n_b))
    return difference - margin, difference + margin


In [None]:
# Due to multiple testing we used the Bonferroni correction for alpha
ci_alpha = 0.01
ci_low,ci_upp = zconf_interval_two_samples(has_rel['x_a'], has_rel['n_a'],
                                           has_rel['x_b'], has_rel['n_b'], alpha = ci_alpha)
# label the interval with the confidence level actually used, 100 * (1 - alpha):
# alpha = 0.01 gives a 99% interval, not a 95% one
print(' {0:g}% Confidence Interval = ( {1:.2f}% , {2:.2f}% )'
      .format(100 * (1 - ci_alpha), 100*ci_low, 100*ci_upp))

### Bayesian statistics 

In [None]:
# https://medium.com/@thibalbo/coding-bayesian-ab-tests-in-python-e89356b3f4bd

To be developed: a Bayesian approach can provide a simpler interpretation.

## ratio of clicks on navigation elements vs. clicks on related links

There is no statistically significant difference in the count of clicks on navigation elements per journey between page variant A and page variant B.

\begin{equation*}
{\text{total number of navigation element click events from content pages}}
\end{equation*}

### Related link counts

In [None]:
# get the total number of related links clicks for that row (clicks per sequence multiplied by occurrences)
df['Related Links Clicks row total'] = df['Related Links Clicks per seq'] * df['Occurrences']

### Navigation events

In [None]:
def is_nav_event(event):
    """
    Return True if `event` is a navigation event.

    An event counts as navigation when it contains 'breadcrumbClicked' or
    'homeLinkClicked', or when it contains both 'relatedLinkClicked' and
    'Explore the topic'. Containment is tested with the `in` operator, so
    `event` can be any container supporting it (presumably an
    (event category, event action) pair from Page_Event_List -- verify
    against the data).
    """
    if 'breadcrumbClicked' in event or 'homeLinkClicked' in event:
        return True
    return 'relatedLinkClicked' in event and 'Explore the topic' in event


def count_nav_events(page_event_list):
    """Count the navigation events fired from content ("thing") pages.

    Each element of `page_event_list` is indexed as (page path, event). An
    element is counted when its event is a navigation event and its page path
    is found in `thing_page_paths`.
    """
    return sum(
        1 for page_event in page_event_list
        if is_nav_event(page_event[1]) and page_event[0] in thing_page_paths)

In [None]:
# needs finding_thing_df read in from document_types.csv.gz
df['Content_Page_Nav_Event_Count'] = df['Page_Event_List'].progress_map(count_nav_events)

In [None]:
def count_search_from_content(page_list):
    """Count the searches started from a content ("thing") page.

    A search is counted when a page containing '/search?q=' is immediately
    preceded in `page_list` by a page found in `thing_page_paths`.
    """
    # pair every page (from the second one on) with the page before it
    previous_and_current = zip(page_list, page_list[1:])
    return sum(
        1 for previous_page, current_page in previous_and_current
        if '/search?q=' in current_page and previous_page in thing_page_paths)

In [None]:
df['Content_Search_Event_Count'] = df['Page_List'].progress_map(count_search_from_content)

In [None]:
df['Content_Nav_or_Search_Count'] = df['Content_Page_Nav_Event_Count'] + df['Content_Search_Event_Count']
df['Content_Nav_Search_Event_Sum_row_total'] = df['Content_Nav_or_Search_Count'] * df['Occurrences']
# required for journeys with no nav
df['Has_No_Nav_Or_Search'] = df['Content_Nav_Search_Event_Sum_row_total'] == 0

### Generate the derived metric

In [None]:
# ((nav events + search events) + 1) / (related links clicked + 1)
# add one to numerator and denominator so the ratio stays defined (no division
# by zero) for rows without any related link click
# not sure this has great utility as a proxy, seems volatile
# NOTE(review): both inputs are *row totals* (per-sequence count * Occurrences),
# so the ratio changes with how often a sequence occurred -- confirm whether the
# per-sequence counts were intended here
df['Ratio_Nav_Search_to_Rel'] = (df['Content_Nav_Search_Event_Sum_row_total'] + 1) / (df['Related Links Clicks row total'] + 1)
# distribution of the ratio across rows
sns.distplot(df['Ratio_Nav_Search_to_Rel'].values);

This derived variable is problematic; we should consider dropping it. Instead, use counts of the numerator (these could be modelled using a generalised linear model), as related link clicks are already captured by the earlier metric.

## Temporary df file in case of crash
### Save

In [None]:
# create temp file in case bootstrap below crashes
# `filename` was re-bound to "document_types.csv.gz" when the document types
# lookup was read in, so name the processed journey file explicitly here;
# otherwise the journey data would be saved under the lookup's file name.
filename = "testing_processed_sampled_taxon_ab_2019-01-21.csv.gz"
filepath = os.path.join(
    DATA_DIR, "rl_sampled_processed_journey",
    filename)



In [None]:
# df.to_csv(filepath, sep="\t", compression="gzip", index=False)

We do the above as the Bayesian bootstrap is computationally intensive...

### Bayesian bootstrap

In [None]:
def mean_bb(counter_X_keys, counter_X_vals, n_replications):
    """Simulate the posterior distribution of the mean from aggregated data.

    Parameter counter_X_keys: The distinct observed values (array like)
    Parameter counter_X_vals: One weight per key, used as the parameters of the
        Dirichlet draw (array like; the caller in this notebook passes the
        summed occurrences of each value)
    Parameter n_replications: The number of bootstrap replications to perform (positive integer)
    Returns: A list with one weighted mean of the keys per Dirichlet draw
    """
    # one row of weights (summing to 1 over the keys) per replication
    dirichlet_weights = np.random.dirichlet(counter_X_vals, n_replications)
    return [np.dot(counter_X_keys, weight_row) for weight_row in dirichlet_weights]

In [None]:
def bayesian_bootstrap_analysis(df, col_name=None, boot_reps = 10000, seed = 1337):
    """Run bayesian bootstrap on the mean of a variable of interest between Page Variants.

    For each variant the rows are aggregated to one row per distinct value of
    `col_name`, with the `Occurrences` of that value summed; `mean_bb` then
    draws the bootstrap means from those (value, occurrences) pairs.

    Args:
        df: A rl_sampled_processed pandas DataFrame with `ABVariant`,
            `Occurrences` and `col_name` columns.
        col_name: A string of the column of interest.
        boot_reps: The number of bootstrap means to draw per variant.
        seed: The seed of the numpy random state used for the draws.

    Returns:
        a_bootstrap: a vector of boot_reps n resampled means from A.
        b_bootstrap: a vector of boot_reps n resampled means from B.
        """
    def _bootstrap_for_variant(variant):
        """Draw the bootstrap means for the rows of one page variant."""
        # Sum only `Occurrences`: summing the whole frame would also aggregate
        # the list/str columns, which is slow and can raise depending on the
        # pandas version.
        occurrences_by_value = (
            df[df.ABVariant == variant]
            .groupby(col_name)['Occurrences'].sum().reset_index())
        return mean_bb(occurrences_by_value[col_name],
                       occurrences_by_value['Occurrences'],
                       boot_reps)

    # seed numpy inside the context so the draws are reproducible; A is drawn
    # before B
    with NumpyRNGContext(seed):
        a_bootstrap = _bootstrap_for_variant("A")
        b_bootstrap = _bootstrap_for_variant("B")

    return a_bootstrap, b_bootstrap

In [None]:
a_bootstrap, b_bootstrap = bayesian_bootstrap_analysis(df, col_name='Ratio_Nav_Search_to_Rel', boot_reps=boot_reps, seed = seed)

In [None]:
def bb_hdi(a_bootstrap, b_bootstrap, alpha = 0.05):
    """Calculate 1-alpha highest density intervals for A, B and their difference.

    Args:
        a_bootstrap: a list of resampled means from page A journeys.
        b_bootstrap: a list of resampled means from page B journeys.
        alpha: 1-alpha is the mass of every highest density interval returned.

    Returns:
        A dict with the keys:
        a_ci_low: the lower point of the 1-alpha highest density interval for A.
        a_ci_hi: the higher point of the 1-alpha highest density interval for A.
        b_ci_low: the lower point of the 1-alpha highest density interval for B.
        b_ci_hi: the higher point of the 1-alpha highest density interval for B.
        ypa_diff_mean: the mean of the posterior of the difference (B minus A).
        ypa_diff_ci_low: lower hdi for posterior of the difference.
        ypa_diff_ci_hi: upper hdi for posterior of the difference.
        p_value: number of values greater than 0 divided by num of obs for
            the posterior of the difference.
        """
    # 1-alpha HDI of each variant's resampled means
    a_ci_low, a_ci_hi = bb.highest_density_interval(a_bootstrap, alpha=alpha)
    b_ci_low, b_ci_hi = bb.highest_density_interval(b_bootstrap, alpha=alpha)

    # calculate the posterior for the difference between A's and B's mean of resampled means
    # ypa prefix is vestigial from blog post
    ypa_diff = np.array(b_bootstrap) - np.array(a_bootstrap)
    ypa_diff_mean = ypa_diff.mean()
    # get the hdi of the difference, at the same alpha as the per-variant
    # intervals (previously the library default was used whatever `alpha` was)
    ypa_diff_ci_low, ypa_diff_ci_hi = bb.highest_density_interval(ypa_diff, alpha=alpha)
    # We count the number of values greater than 0 and divide by the total
    # number of observations, which gives the proportion of values in the
    # distribution that are greater than 0; could act a bit like a p-value
    p_value = (ypa_diff > 0).sum() / ypa_diff.shape[0]

    return {'a_ci_low':a_ci_low, 'a_ci_hi':a_ci_hi, 'b_ci_low':b_ci_low, 'b_ci_hi':b_ci_hi, 'ypa_diff_mean':ypa_diff_mean, 'ypa_diff_ci_low':ypa_diff_ci_low, 'ypa_diff_ci_hi':ypa_diff_ci_hi, 'p_value':p_value}

In [None]:
ratio_stats = bb_hdi(a_bootstrap, b_bootstrap)
ratio_stats

In [None]:
ratio_stats

In [None]:
ax = sns.distplot(b_bootstrap, label='B')
ax.errorbar(x=[ratio_stats['b_ci_low'], ratio_stats['b_ci_hi']], y=[2, 2], linewidth=5, c='teal', marker='o', 
         label='95% HDI B')

ax = sns.distplot(a_bootstrap, label='A', ax=ax, color='salmon')
ax.errorbar(x=[ratio_stats['a_ci_low'], ratio_stats['a_ci_hi']], y=[5, 5], linewidth=5, c='salmon', marker='o', 
         label='95% HDI A')

ax.set(xlabel='Ratio of clicks on nav to clicks on related links', ylabel='Density')
sns.despine()
legend = plt.legend(frameon=True)
frame = legend.get_frame()
frame.set_facecolor('white')
plt.show();

In [None]:
# calculate the posterior for the difference between A's and B's ratio
# ypa prefix is vestigial from blog post
ypa_diff = np.array(b_bootstrap) - np.array(a_bootstrap)
# get the hdi
ypa_diff_ci_low, ypa_diff_ci_hi = bb.highest_density_interval(ypa_diff)

# the mean of the posterior
print('mean:', ypa_diff.mean())

print('low ci:', ypa_diff_ci_low, '\nhigh ci:', ypa_diff_ci_hi)

In [None]:
ax = sns.distplot(ypa_diff)
ax.plot([ypa_diff_ci_low, ypa_diff_ci_hi], [0, 0], linewidth=10, c='k', marker='o', 
         label='95% HDI')
ax.set(xlabel='Ratio of nav events to related link events', ylabel='Density', 
       title='The difference between B\'s and A\'s mean ratio')
sns.despine()
legend = plt.legend(frameon=True)
frame = legend.get_frame()
frame.set_facecolor('white')
plt.show();

In [None]:
# We count the number of values greater than 0 and divide by the total number
# of observations,
# which gives us the proportion of values in the distribution that are
# greater than 0; this could act a bit like a p-value
(ypa_diff > 0).sum() / ypa_diff.shape[0]

## proportion of journeys with a page sequence including content and related links only

There is no statistically significant difference in the proportion of journeys with a page sequence including content and related links only (including loops) between page variant A and page variant B

\begin{equation*}
\frac{\text{total number of journeys that only contain content pages and related links (i.e. no nav pages)}}{\text{total number of journeys}}
\end{equation*}

### Overall

In [None]:
# if (Content_Nav_Search_Event_Sum == 0) that's our success
# Has_No_Nav_Or_Search == 1 is a success
# the problem is symmetrical so doesn't matter too much
# use the vectorised Series.sum() rather than the builtin sum(), which walks
# the Series element by element in Python
(df.Has_No_Nav_Or_Search * df.Occurrences).sum() / df.Occurrences.sum()

In [None]:
sns.distplot(df.Content_Nav_or_Search_Count.values);

### Frequentist statistics
#### Statistical significance

In [None]:

nav = z_prop(df, 'Has_No_Nav_Or_Search')
nav

#### Practical significance - uplift

In [None]:
# function defined earlier in notebook

# Due to multiple testing we used the Bonferroni correction for alpha
ci_alpha = 0.01
ci_low,ci_upp = zconf_interval_two_samples(nav['x_a'], nav['n_a'],
                                           nav['x_b'], nav['n_b'], alpha = ci_alpha)
# label the interval with the confidence level actually used, 100 * (1 - alpha):
# alpha = 0.01 gives a 99% interval, not a 95% one
print(' {0:g}% Confidence Interval = ( {1:.2f}% , {2:.2f}% )'
      .format(100 * (1 - ci_alpha), 100*ci_low, 100*ci_upp))

## Average Journey Length (number of page views)
There is no statistically significant difference in the average page list length of journeys (including loops) between page variant A and page variant B.
### Bayesian bootstrap for non-parametric hypotheses

In [None]:
# http://savvastjortjoglou.com/nfl-bayesian-bootstrap.html

In [None]:
# let's use mean journey length (could probably model parametrically but we use it for demonstration here)
# some journeys have length 500 and should probably be removed as they are likely bots or other weirdness

In [None]:
# need to roll out the data, deaggregate on one variable of interest
# we want to repeat each row's journey length by it's occurrences
# so more common journey lengths are more likely to be sampled
print(df['Page_List_Length'].head())
print(df['Occurrences'].head())

np.repeat(df['Page_List_Length'].head(), df['Occurrences'].head())


In [None]:
a_len = np.repeat(df.loc[df.ABVariant == "A", 'Page_List_Length'], df.loc[df.ABVariant == "A", "Occurrences"])
a_len.values

b_len = np.repeat(df.loc[df.ABVariant == "B", 'Page_List_Length'], df.loc[df.ABVariant == "B", "Occurrences"])
b_len.values

In [None]:
help(bb.mean)

In [None]:
# for reproducibility, set the seed within this context
# (use the notebook-level `seed` rather than repeating the literal 1337, so
# changing the seed in one place affects every bootstrap)
with NumpyRNGContext(seed):
    a_bootstrap = bb.mean(a_len.values, n_replications=boot_reps)
    b_bootstrap = bb.mean(b_len.values, n_replications=boot_reps)

In [None]:
ax = sns.distplot(a_bootstrap, color='salmon')
ax.set(xlabel='Journey Length', ylabel='Density', title='Page Variant A Mean Journey Length')
sns.despine();

In [None]:
# Calculate a 95% HDI
a_ci_low, a_ci_hi = bb.highest_density_interval(a_bootstrap)
print('low ci:', a_ci_low, '\nhigh ci:', a_ci_hi)

In [None]:
ax = sns.distplot(a_bootstrap, color='salmon')
ax.plot([a_ci_low, a_ci_hi], [0, 0], linewidth=10, c='k', marker='o', 
         label='95% HDI')
ax.set(xlabel='Journey Length', ylabel='Density', title='Page Variant A Mean Journey Length')
sns.despine()
plt.legend();

In [None]:
# Calculate a 95% HDI
b_ci_low, b_ci_hi = bb.highest_density_interval(b_bootstrap)
print('low ci:', b_ci_low, '\nhigh ci:', b_ci_hi)

In [None]:
ax = sns.distplot(b_bootstrap)
ax.plot([b_ci_low, b_ci_hi], [0, 0], linewidth=10, c='k', marker='o', 
         label='95% HDI')
ax.set(xlabel='Journey Length', ylabel='Density', title='Page Variant B Mean Journey Length')
sns.despine()
legend = plt.legend(frameon=True)
frame = legend.get_frame()
frame.set_facecolor('white')
plt.show();

In [None]:
ax = sns.distplot(b_bootstrap, label='B')
ax = sns.distplot(a_bootstrap, label='A', ax=ax, color='salmon')
ax.set(xlabel='Journey Length', ylabel='Density')
sns.despine()
legend = plt.legend(frameon=True)
frame = legend.get_frame()
frame.set_facecolor('white')
plt.show();

We can also measure the uncertainty in the difference between the page variants' journey lengths by subtracting their posteriors.



In [None]:
# calculate the posterior for the difference between B's and A's mean journey length
ypa_diff = np.array(b_bootstrap) - np.array(a_bootstrap)
# get the hdi
ypa_diff_ci_low, ypa_diff_ci_hi = bb.highest_density_interval(ypa_diff)

In [None]:
# the mean of the posterior
ypa_diff.mean()

In [None]:
print('low ci:', ypa_diff_ci_low, '\nhigh ci:', ypa_diff_ci_hi)


In [None]:
ax = sns.distplot(ypa_diff)
ax.plot([ypa_diff_ci_low, ypa_diff_ci_hi], [0, 0], linewidth=10, c='k', marker='o', 
         label='95% HDI')
ax.set(xlabel='Journey Length', ylabel='Density', 
       title='The difference between B\'s and A\'s mean Journey Length')
sns.despine()
legend = plt.legend(frameon=True)
frame = legend.get_frame()
frame.set_facecolor('white')
plt.show();

We can actually calculate the probability that B's mean Journey Length was greater than A's mean Journey Length by measuring the proportion of values greater than 0 in the above distribution.

In [None]:
# We count the number of values greater than 0 and divide by the total number
# of observations,
# which gives us the proportion of values in the distribution that are
# greater than 0; this could act a bit like a p-value
(ypa_diff > 0).sum() / ypa_diff.shape[0]