In [None]:
import os 
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import re
import tqdm

from collections import Counter

# progress bar
from tqdm import tqdm, tqdm_notebook
# Register `progress_apply` / `progress_map` on pandas objects.
# `tqdm.pandas` is a classmethod that takes keyword arguments only in recent
# tqdm releases, so passing `tqdm_notebook` positionally raises a TypeError
# there (older releases silently treated it as the iterable to wrap).
# For widget-based bars, call `tqdm_notebook.pandas()` instead.
tqdm.pandas()

# print long str: allow up to 500 characters per column before truncating
pd.set_option('display.max_colwidth', 500)

In [None]:
# The page whose effect on user journeys is studied ("Page X" in the text).
# It is read as a global variable by the helper functions in this notebook.
page_of_interest = "/help/cookies"

# The problem
We are interested in inspecting user journeys to determine whether a specific page, which we call Page X, is disruptive to those journeys. We use some pseudocode art to describe informally what we mean:

**Successful or undisrupted journey**  
A -> X -> A

**Unsuccessful or disrupted journey**    
A -> X -> NOT A  
A -> X -> Exit

Where `A` is the node in a journey immediately prior to `X` (`A` can be any page except `X`). A successful, or undisrupted, journey is looped: the user travels from `A` to `X`, then returns to `A` again, continuing their journey undisrupted. An unsuccessful, or disrupted, journey is anything else, including the user leaving the site.

# Using this notebook
This notebook is written for those new to Python. Accordingly, we don't always use the most Pythonic or efficient code. Instead we opt for code that is most explicit and easy to follow with lots of examples.

# File/dir locations


We use a recent processed_journey dataset derived from using this repo.

In [None]:
# Root data directory, taken from the DATA_DIR environment variable.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # os.getenv returns None for an unset variable; without this check the
    # failure surfaces as an opaque TypeError inside os.path.join below.
    raise RuntimeError(
        "The DATA_DIR environment variable is not set; set it to the "
        "directory that contains the 'processed_journey' folder."
    )

filename = "full_sample_taxon_ab_2019_947858.csv.gz"
# Full path of the data file: <DATA_DIR>/processed_journey/<filename>
df_file = os.path.join(DATA_DIR, "processed_journey", filename)

print(df_file)

Load up a data file that isn't too large. Here we use a sampled dataset from a week's worth of data, 21-27 Jan 2019. It includes A and B variants, but we can just merge the same journeys on the different variants.

In [None]:
# Read the data file as a gzip-compressed, tab-separated, UTF-8 encoded table.
df = pd.read_csv(df_file, compression="gzip", sep='\t', encoding='utf-8')

In [None]:
def _parse_list_literal(value):
    """Return value parsed by ast.literal_eval, unless it is already a list.

    Passing lists through unchanged lets this cell be re-run safely:
    ast.literal_eval only accepts strings (or AST nodes), so parsing an
    already-converted column a second time would raise an error.
    """
    if isinstance(value, list):
        return value
    return ast.literal_eval(value)


# convert from str to list
for list_column in ['Event_cat_act_agg', 'Page_Event_List', 'Page_List']:
    df[list_column] = df[list_column].progress_apply(_parse_list_literal)

# number of entries in each journey's Page_List
df['Page_List_Length'] = df['Page_List'].progress_apply(len)


In [None]:
# Peek at the first two rows.
df.head(2)

In [None]:
# Summarise the DataFrame's columns.
df.info()

# Page_List approach
We are interested in counting each of the different types of journey given in the problem definition.

**Successful or undisrupted journey**  
A -> X -> A

**Unsuccessful or disrupted journey**    
A -> X -> NOT A  
A -> X -> Exit

# Filtering relevant journeys

## Does a journey or Page_List include your page_of_interest?

In [None]:
# Look at the Page_List of the first row to see what a journey looks like
# before checking journeys for page_of_interest.
df.Page_List.head(1)

In [None]:
def journey_of_interest(page_list, page=None):
    """Checks whether a page occurs in a page_list.

    Where a page_list is a journey (for example ["A", "/help/cookies", "A"]).

    Parameters
    ----------
    page_list : list
        The pages visited in the journey, in order.
    page : str, optional
        The page to look for. When omitted, the global variable
        page_of_interest is used (looked up at call time).

    Returns
    -------
    bool
        True if the page is an element of page_list, otherwise False.

    """
    if page is None:
        page = page_of_interest
    # `in` already evaluates to True or False, so return it directly.
    return page in page_list

In [None]:
# Example: the journey contains page_of_interest, so this returns True.
journey_of_interest(["A", page_of_interest, "A"])

In [None]:
# Example: the pages after page_of_interest do not matter; this returns True.
journey_of_interest(["A", page_of_interest, "B"])

In [None]:
# Example: page_of_interest is "/help/cookies", which is not in this journey,
# so this returns False.
journey_of_interest(["A", "A", "B"])

In [None]:
# Apply journey_of_interest to every Page_List and store the True/False
# result in a new column.
df['contains_page_of_interest'] = df['Page_List'].apply(journey_of_interest)


In [None]:
# Count how many rows are True and how many are False.
df.contains_page_of_interest.value_counts()

## Filter for journeys that contain the page_of_interest
We reduce the number of rows we are working with.

In [None]:
# As it's a logical variable we keep the rows that were True for
# contains_page_of_interest.
# .copy() makes the filtered result an independent DataFrame, so adding new
# columns to it later does not trigger pandas' SettingWithCopyWarning.
df = df[df['contains_page_of_interest']].copy()

df.shape

## Where in the Page_List does the page_of_interest occur?


We probably want to know where it happens in a journey, so that we can extract the previous page and the next page, in order to classify the journey as undisrupted or disrupted.

In [None]:
def where_page_of_interest(page_list, page=None):
    """Return the indices of where a page occurs in the page_list.

    Parameters
    ----------
    page_list : list
        The pages visited in the journey, in order.
    page : str, optional
        The page to look for. When omitted, the global variable
        page_of_interest is used (looked up at call time).

    Returns
    -------
    list of int
        The zero-based positions at which the page occurs, in ascending
        order. The list is empty if the page never occurs.

    """
    if page is None:
        page = page_of_interest
    # enumerate pairs each page with its position in the journey.
    indices = [i for i, x in enumerate(page_list) if x == page]
    return indices



In [None]:
# For every journey, store the list of positions at which page_of_interest
# occurs in its Page_List.
df['where_page_of_interest'] = df['Page_List'].apply(where_page_of_interest)


In [None]:
# Peek at the first two rows, which now include the where_page_of_interest column.
df.head(2)