In [None]:
import os 
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import re
import tqdm
import seaborn as sns

from collections import Counter

# progress bar
# NOTE: this rebinds the name `tqdm` from the module imported above to the
# tqdm class, which is what the call below is made on.
from tqdm import tqdm, tqdm_notebook
# instantiate progress bar goodness: this is what provides the
# `.progress_apply` method used on the dataframe columns further down
# NOTE(review): tqdm_notebook is passed positionally here; confirm this is the
# intended way to request the notebook-style bar with the installed tqdm.
tqdm.pandas(tqdm_notebook)

# print long str: allow up to 500 characters per displayed column
pd.set_option('max_colwidth',500)

In [None]:
# The page whose effect on user journeys we want to inspect.
# It is read as a global variable by the helper functions defined later on.
page_of_interest = "/help/cookies"

# The problem
We are interested in inspecting user journeys to determine whether a specific page is disruptive to those journeys; let's call it Page X. We use some pseudocode art to describe informally what we mean:

**Successful or undisrupted journey**  
A -> X -> A

**Unsuccessful or disrupted journey**  
A -> X -> NOT A  
A -> X -> Exit

Where `A` is a node in a journey immediately prior to `X` (`A` can be any page except `X`). A successful, or undisrupted, journey will be looped; that is, a user will travel to `X` from `A`, then return to `A` again, continuing their journey undisrupted. An unsuccessful or disrupted journey is considered to be anything else, including the user leaving the site.

# Using this notebook
This notebook is written for those new to Python. Accordingly, we don't always use the most Pythonic or efficient code. Instead we opt for code that is most explicit and easy to follow with lots of examples.

# File/dir locations


We use a recent processed_journey dataset derived from using this repo.

In [None]:
# Directory holding the data, supplied through the DATA_DIR environment variable.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # Fail early with a readable message; os.path.join(None, ...) would
    # otherwise raise a TypeError that does not mention the missing variable.
    raise EnvironmentError(
        "The DATA_DIR environment variable is not set; point it at the "
        "directory that contains the 'processed_journey' folder.")

filename = "full_sample_taxon_ab_2019_947858.csv.gz"
# Full path of the processed journey file: <DATA_DIR>/processed_journey/<filename>
df_file = os.path.join(DATA_DIR, "processed_journey", filename)

print(df_file)

Load up a data file that isn't too large. Here we use a sampled dataset from a week's worth of data, 21-27 Jan 2019. It includes A and B variants, but we can just merge the same journeys on the different variants.

In [None]:
# The file is gzip-compressed and tab-separated (despite the .csv.gz name).
df = pd.read_csv(df_file, compression="gzip", sep='\t', encoding='utf-8')

In [None]:
# These columns hold lists written out as strings; parse each one back
# into a real Python list (with a progress bar, as this can be slow)
for list_column in ('Event_cat_act_agg', 'Page_Event_List', 'Page_List'):
    df[list_column] = df[list_column].progress_apply(ast.literal_eval)

# number of pages in each journey
df['Page_List_Length'] = df['Page_List'].progress_apply(len)


In [None]:
df.head(2)

In [None]:
df.info()

# Page_List approach
We are interested in counting each of the different types of journey given in the problem definition.

**Successful or undisrupted journey**  
A -> X -> A

**Unsuccessful or disrupted journey**  
A -> X -> NOT A  
A -> X -> Exit

# Filtering relevant journeys

## Does a journey or Page_List include your page_of_interest?

### The problem

In [None]:
# does each journey contain page_of_interest?
df.Page_List.head(1)

### The function

Note that we don't provide page_of_interest as an argument and instead rely on the function looking for it in the global environment (we defined it at the start of the notebook).

In [None]:
def journey_of_interest(page_list):
    """Report whether a journey visits the page_of_interest.

    page_list is a single journey (a list of pages). The page we look
    for, page_of_interest, is not an argument: it is read from the
    global environment.

    Returns True if the page appears anywhere in page_list,
    otherwise False.

    """
    return page_of_interest in page_list

### Testing our function

In [None]:
journey_of_interest(["A", page_of_interest, "A"])

In [None]:
journey_of_interest(["A", page_of_interest, "B"])

In [None]:
journey_of_interest(["A", "A", "B"])

### Applying our function

In [None]:
df['contains_page_of_interest'] = df['Page_List'].apply(journey_of_interest)


### Count and proportion of journey types that contain page_of_interest
Remember this isn't a count of sessions, to calculate that we would need to consider the Occurrences variable. Note: if this data has A and B page variants then these counts could be misleading.

In [None]:
df.contains_page_of_interest.value_counts()

In [None]:
# Proportion of journey types that contain the page_of_interest.
# The mean of a True/False column is the share of True values. This avoids
# indexing value_counts() with 0 and 1, which relies on how pandas resolves
# integer keys against a boolean index and fails if either value is absent.
df.contains_page_of_interest.mean()

### Count and proportion of sessions that contain page_of_interest

In [None]:
# Weight each journey type by its Occurrences to get session counts
touched_sessions = df.loc[df.contains_page_of_interest == True, 'Occurrences'].values.sum()
untouched_sessions = df.loc[df.contains_page_of_interest == False, 'Occurrences'].values.sum()
total_sessions = df.Occurrences.sum()

print(f"The number of sessions that touched the page_of_interest: {touched_sessions}")
print(f"The number of sessions that didn't touch the page_of_interest: {untouched_sessions}")
print(f"The total sessions or Occurrences: {total_sessions}")

In [None]:
# As a proportion: sessions that touched the page over all sessions
sessions_with_page = df.loc[df.contains_page_of_interest == True, 'Occurrences'].values.sum()
sessions_without_page = df.loc[df.contains_page_of_interest == False, 'Occurrences'].values.sum()
sessions_with_page / (sessions_with_page + sessions_without_page)



## Filter for journeys that contain the page_of_interest
We reduce the number of rows we are working with. How many unique journeys does our data contain?

In [None]:
# contains_page_of_interest is a logical (True/False) column, so indexing with
# it keeps only the rows where it is True.
# The explicit .copy() makes the result an independent dataframe, so adding
# new columns to it in later cells does not raise a SettingWithCopyWarning.
df = df[df['contains_page_of_interest']].copy()

df.shape

And how many sessions occurred across these journeys?

In [None]:
df.Occurrences.sum()

## Where in the Page_List does the page_of_interest occur?


We probably want to know where it happens in a journey, so that we can extract the previous page and the next page, in order to classify the journey as undisrupted or disrupted. We do that using a Pythonic [list comprehension](https://www.digitalocean.com/community/tutorials/understanding-list-comprehensions-in-python-3) approach.

In [None]:
def where_page_of_interest(page_list):
    """Find every position at which page_of_interest appears in page_list.

    Returns a list of zero-based indices in ascending order; the list is
    empty if the page never appears. page_of_interest is read from the
    global environment.
    """
    return [position for position, page in enumerate(page_list)
            if page == page_of_interest]



In [None]:
df['where_page_of_interest'] = df['Page_List'].apply(where_page_of_interest)


In [None]:
# Python counts from zero, not one (the first item in a list is index 0)
df.head(5)[['Page_List', 'where_page_of_interest']]

## Is a journey disrupted by the page_of_interest?
We are interested in retrieving the pages before and after the `page_of_interest`. Given the newly created variable `where_page_of_interest` provides us with the index in the page list of where it was seen, we can simply extract the page at plus and minus one of this page, and then ask if it is the same page? If it is the same page, the user carried on their journey as usual and it was not disrupted, if the page were different then it was considered a disrupted journey. A user may also leave the site, so if no page exists beyond the `page_of_interest` we should probably count this as well.

### Does a journey end with the page_of_interest?
Does the max where_page_of_interest in a row equal the Page_List_Length minus one? (is it the last page in the journey aka the "exit page"; we minus one because of zero indexing in Python)

In [None]:
def is_page_of_interest_exit(page_list_length, where_page_of_interest):
    """Does the last page in a journey equal the page of interest?

    page_list_length is the number of pages in the journey and
    where_page_of_interest is the list of zero-based positions at which
    the page of interest occurs, in ascending order.

    Returns True when the last of those positions is the final index of
    the journey (page_list_length - 1). Returns False otherwise, which
    includes the case where the list of positions is empty (the page was
    never visited, so it cannot be the exit page).
    """
    # Guard against an empty list: indexing it with [-1] would raise an IndexError
    if len(where_page_of_interest) == 0:
        return False
    # minus one because Python counts positions from zero
    return where_page_of_interest[-1] == (page_list_length - 1)

In [None]:
is_page_of_interest_exit(3, [0, 2])

In [None]:
# Run the check row by row (axis = 1), giving it each journey's length and the
# positions of the page_of_interest, and store the True/False answer.
df['page_of_interest_exit'] = df.apply(lambda row: is_page_of_interest_exit(row['Page_List_Length'], row['where_page_of_interest']) , axis = 1)
# Summing a True/False column counts the True values: the number of journey
# types that end on the page_of_interest.
df.page_of_interest_exit.sum()

We can determine the proportion of users exiting a journey or their session on the page_of_interest as a proportion between zero and one. However, this is just a proportion of journey types rather than considering the number of Occurrences or sessions where this is true, thus it is misleading.

In [None]:
df.page_of_interest_exit.sum() / len(df.index)

Let's consider occurrences, as in how frequently this page_of_interest was associated with the end of a journey.

In [None]:
# create new variable: the number of sessions (Occurrences) for journeys that
# ended on the page_of_interest, and zero for every other journey
exited_on_page = df.page_of_interest_exit == True
df['page_of_interest_exit_occurrences'] = 0
df.loc[exited_on_page, 'page_of_interest_exit_occurrences'] = df.loc[exited_on_page, 'Occurrences']


From this we can calculate the number of sessions that exited on this page of interest as a proportion of all sessions that touched the page_of_interest at least once in their journey.

In [None]:
# as this data frame only includes journeys that included the page_of_interest
# we can calculate the proportion of the occurrences
df.page_of_interest_exit_occurrences.sum() / df.Occurrences.sum()

In [None]:
# Plot the distribution of Occurrences for the journeys that ended on the
# page_of_interest.
# mostly unique journeys, large density about one
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# sns.histplot or sns.displot when upgrading.
sns.distplot(df.loc[df.page_of_interest_exit == True, 'page_of_interest_exit_occurrences'].values);



### What proportion of journeys to the page_of_interest don't return to the page they were on before?
By looking at looping (and non-looping) journeys with the page_of_interest in the middle.
#### Defining a function

In [None]:
def is_disrupted(page_list, where_page_of_interest):
    """Flag each visit to the page_of_interest as a disruption or not.

    A visit is undisrupted only when the page viewed immediately before
    it and the page viewed immediately after it both exist and are the
    same page (A -> X -> A). Anything else is a disruption, including a
    visit that is the first page of the journey (there is no earlier
    page to return to) or the last page of the journey (the user left).

    page_list is the journey (a list of pages) and
    where_page_of_interest is the list of zero-based positions of the
    page of interest within it.

    Returns a list of True/False values, one per position given and in
    the same order, where True means that visit was a disruption. Use
    .count(True) or sum() on the result to count the disruptions, or
    any() to ask whether the journey was disrupted at least once.

    """
    last_index = len(page_list) - 1

    disruption = []
    for i in where_page_of_interest:
        # page_list[i - 1] with i == 0 would silently wrap around to the
        # LAST page of the journey, so check a previous page really exists
        has_previous_page = i > 0
        # likewise page_list[i + 1] would raise an IndexError at the end
        has_next_page = i < last_index

        returned_to_previous_page = (
            has_previous_page
            and has_next_page
            and page_list[i - 1] == page_list[i + 1]
        )
        disruption.append(not returned_to_previous_page)

    return disruption

#### An aside, how the list comprehension works

In [None]:
# to understand the list comprehension used, run this example
# note the i+1: it picks out the page viewed immediately AFTER each
# occurrence of the page of interest (i-1 would give the page before it)
[["/a", "/page_of_interest", "/b", "/page_of_interest", "/b", "/page_of_interest", "/c"][(i+1)] for i in [1, 3, 5]]

#### Does the function work as expected?

In [None]:
is_disrupted(["/a", "/page_of_interest", "/b", "/page_of_interest", "/b", "/page_of_interest", "/c"], [1, 3, 5])

#### Using the function on the journeys that do not have the page of interest at the end

We make a deep copy of our dataframe as we will be modifying it.

In [None]:
df_page_of_interest_not_last = df.loc[df.page_of_interest_exit == False].copy()
df_page_of_interest_not_last.shape

In [None]:
df_page_of_interest_not_last['disrupted'] = df_page_of_interest_not_last.apply(lambda row: is_disrupted(row['Page_List'], row['where_page_of_interest']) , axis = 1)
# gives the pattern of disruption to a journey involving at least one page_of_interest view
df_page_of_interest_not_last.disrupted.value_counts()

Journeys can have zero, one or many disruptions to the journey about the page_of_interest.

In [None]:
# count the disruptions
sum(x.count(True) for x in df_page_of_interest_not_last.disrupted)

In [None]:
# count the non-disruptions
sum(x.count(False) for x in df_page_of_interest_not_last.disrupted)

In [None]:
# the number of types of journeys (one row per journey type)
len(df_page_of_interest_not_last.disrupted)

In [None]:
# consider journeys / occurrences affected by at least one disruption:
# any() is True when at least one visit in the journey was flagged as disrupted
df_page_of_interest_not_last['disrupted_at_least_once'] = df_page_of_interest_not_last.apply(
    lambda journey: any(journey['disrupted']), axis=1)

# sessions (Occurrences) for the disrupted journeys, zero for the others
not_last_was_disrupted = df_page_of_interest_not_last.disrupted_at_least_once == True
df_page_of_interest_not_last['disrupted_at_least_once_occurrences'] = 0
df_page_of_interest_not_last.loc[not_last_was_disrupted, 'disrupted_at_least_once_occurrences'] = (
    df_page_of_interest_not_last.loc[not_last_was_disrupted, 'Occurrences'])

But how does this translate into user sessions affected or disrupted at least once by the page_of_interest (not at the end of the journey)?

In [None]:
sns.distplot(df_page_of_interest_not_last.loc[df_page_of_interest_not_last.disrupted_at_least_once == True, 'disrupted_at_least_once_occurrences'].values);


In [None]:
df_page_of_interest_not_last.loc[df_page_of_interest_not_last.disrupted_at_least_once == True, 'disrupted_at_least_once_occurrences'].values

#### What proportion of sessions with journeys viewing the page_of_interest don't return to the page they were on before? (ignoring those journeys that had the page_of_interest at the end of the journey)

In [None]:
print(f"The number of sessions that were disrupted at least once (not at the end) and touched the page_of_interest: {df_page_of_interest_not_last.loc[df_page_of_interest_not_last.disrupted_at_least_once == True, 'Occurrences'].values.sum()}")
print(f"The number of sessions that were not disrupted at least once (not at the end) and touched the page_of_interest: {df_page_of_interest_not_last.loc[df_page_of_interest_not_last.disrupted_at_least_once == False, 'Occurrences'].values.sum()}")
print(f"The total sessions or Occurrences for : {df_page_of_interest_not_last.loc[df_page_of_interest_not_last.disrupted_at_least_once == True, 'Occurrences'].values.sum() + df_page_of_interest_not_last.loc[df_page_of_interest_not_last.disrupted_at_least_once == False, 'Occurrences'].values.sum()}")

Of those journey sessions with the page_of_interest in but not the last page viewed, what proportion of these were disrupted at least once?

In [None]:
# sessions disrupted at least once, as a share of all sessions in this subset
not_last_disrupted_sessions = df_page_of_interest_not_last.loc[
    df_page_of_interest_not_last.disrupted_at_least_once == True, 'Occurrences'].values.sum()
not_last_undisrupted_sessions = df_page_of_interest_not_last.loc[
    df_page_of_interest_not_last.disrupted_at_least_once == False, 'Occurrences'].values.sum()
not_last_disrupted_sessions / (not_last_disrupted_sessions + not_last_undisrupted_sessions)

This gives us the answer to our main question.

## Counting the number of disruptions in a journey including exits
Another consideration is the proportion of journeys where the page_of_interest was the last in the journey, as this could give us an out-of-range IndexError. As we can't handle exceptions in list comprehensions, this is somewhat problematic. We take the shortcut of appending a made-up page to the end of every Page_List, thus dodging the error. The logic to justify this is that we assume a journey is also disrupted if it finishes on the page_of_interest. 

This is a stronger assumption than we would like to make, as a journey might end on the page_of_interest because the user found what they were looking for. 

In [None]:
# append an arbitrary placeholder page ("/exit") to the end of every page list,
# so that a journey ending on the page_of_interest still has a "next page"
# check this page does not exist on your site
#
# The placeholder is only added when it is not already the last page, so
# running this cell more than once does not keep adding "/exit" pages.
# Note: Page_List_Length is not recalculated here, so it no longer counts
# the placeholder.
df['Page_List'] = [
    pages if pages[-1:] == ["/exit"] else pages + ["/exit"]
    for pages in df['Page_List']
]
df['Page_List'].head()

### Apply to a dataframe

In [None]:
df['disrupted'] = df.apply(lambda row: is_disrupted(row['Page_List'], row['where_page_of_interest']) , axis = 1)
df.disrupted.head(3)

### Proportion of journeys that are disrupted

In [None]:
df.disrupted.value_counts()

In [None]:
# count the disruptions
sum(x.count(True) for x in df.disrupted)

In [None]:
# count the non-disruptions
sum(x.count(False) for x in df.disrupted)

In [None]:
# the number of types of journeys (one row per journey type)
len(df.disrupted)

In [None]:
# consider journeys / occurrences affected by at least one disruption:
# any() is True when at least one visit in the journey was flagged as disrupted
df['disrupted_at_least_once'] = df.apply(
    lambda journey: any(journey['disrupted']), axis=1)

# sessions (Occurrences) for the disrupted journeys, zero for the others
was_disrupted = df.disrupted_at_least_once == True
df['disrupted_at_least_once_occurrences'] = 0
df.loc[was_disrupted, 'disrupted_at_least_once_occurrences'] = df.loc[was_disrupted, 'Occurrences']

In [None]:
sns.distplot(df.loc[df.disrupted_at_least_once == True, 'disrupted_at_least_once_occurrences'].values);


In [None]:
df.loc[df.disrupted_at_least_once == True, 'disrupted_at_least_once_occurrences'].values

If we consider a journey that ends with our page_of_interest as disrupted then we get the following proportion of journeys that contain the page_of_interest as being disrupted at least once.

In [None]:
# sessions disrupted at least once, as a share of all sessions that touched the page
disrupted_sessions = df.loc[df.disrupted_at_least_once == True, 'Occurrences'].values.sum()
undisrupted_sessions = df.loc[df.disrupted_at_least_once == False, 'Occurrences'].values.sum()
disrupted_sessions / (disrupted_sessions + undisrupted_sessions)

## Conclusion
* Was the page of interest disruptive?  
* Was it often associated with users exiting?  
* Did it cause much disruption to users' journeys?  