In [10]:
import os 
import pandas as pd
import numpy as np
from ast import literal_eval
from collections import Counter
from operator import itemgetter

## 1. Read in data/drop columns to reduce size

In [11]:
# Root directory for input/output data, supplied via the DATA_DIR environment variable.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # Fail fast: os.path.join(None, ...) would otherwise raise an unhelpful TypeError.
    raise EnvironmentError("DATA_DIR environment variable is not set")
filename = "preprocessed_taxon_pageseq_20190114_20190116.csv.gz"
# Path of the full journey file and of its "reduced_" counterpart, both under processed_journey/.
df_file = os.path.join(DATA_DIR, "processed_journey", filename)
df_reduced_file = os.path.join(DATA_DIR, "processed_journey", "reduced_"+filename)
# df_rel_file = os.path.join(DATA_DIR, "processed_journey", "rel_"+filename)

### Load up unfiltered data: 14-16/01/19, one off run

Original dataset shape (5048130, 17)

In [12]:
# df = pd.read_csv(df_file, sep="\t", compression = "gzip")
# print(df.shape)
# df.drop(['Dates', 'Page_List', 'Event_List', 'Taxon_List', 'Taxon_Page_List', 'Page_List_NL'],axis=1,inplace=True)
# df.to_csv(df_reduced_file, sep="\t", compression="gzip", index=False)

## 2. User journey analysis

In [None]:
df = pd.read_csv(df_reduced_file, sep="\t", compression="gzip")

In [None]:
## Total volume of traffic
df.Occurrences.sum()

In [None]:
def column_eval(cols, frame=None):
    """Parse string-encoded columns into Python objects, in place.

    For each column in ``cols`` the first value is inspected. If it is not
    already a ``list``, the column name is printed and every value in the
    column is replaced by ``literal_eval(value)``. Columns whose first value
    is already a list are left untouched, so re-running the cell is a no-op.

    Parameters
    ----------
    cols : iterable of str
        Names of the columns to convert.
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df``.
    """
    target = df if frame is None else frame
    # With no rows there is nothing to inspect or convert; iloc[0] would raise.
    if len(target.index) == 0:
        return
    for column in cols:
        if not isinstance(target[column].iloc[0], list):
            print(column)
            target[column] = target[column].map(literal_eval)

In [None]:
columns = ['Page_Event_List', 'DeviceCategories', 'Event_cats_agg', 'Event_cat_act_agg']
column_eval(columns)

### 2.1 Out of all journeys, what is the % (volume, sum of Occurrences) of journeys including "related content" link clicks?

In [None]:
df[(df.Sequence.str.contains("Related content")) & (df.Sequence.str.contains("relatedLinkClicked"))].shape

In [None]:
"{}% of journeys that contain a \"related link\" click".format(round((473975*100)/df.Occurrences.sum(),2))

TODO: re-run this analysis on the "related content" link-specific dataset and compare the results. 
Initial indications are that there are no major differences.

### 2.2 Out of all journeys coming from mobiles vs desktops, what is the % (volume, sum of Occurrences) of journeys including "related content" link clicks?

In [None]:
def more_device(x,device):
    """Return True when `device` is the entry with the largest count in `x`.

    `x` is an iterable of (device_name, count) pairs. On ties the first
    pair with the maximal count wins, as with the built-in `max`. An empty
    `x` has no dominant device, so the result is False instead of the
    ValueError that `max` would raise.
    """
    top = max(x, key=itemgetter(1), default=None)
    return top is not None and top[0] == device
def device_count(x,device):
    """Add up the counts of every (name, count) pair in `x` whose name equals `device`."""
    total = 0
    for name, count in x:
        if name == device:
            total += count
    return total

In [None]:
df["DesktopCount"] = df['DeviceCategories'].map(lambda x: device_count(x,"desktop"))
df["MobileCount"] = df['DeviceCategories'].map(lambda x: device_count(x,"mobile"))        

In [None]:
df["TabletCount"] = df['DeviceCategories'].map(lambda x: device_count(x,"tablet"))        

In [None]:
df["DesktopCount"].describe()

### 2.3 Traffic volume
#### Test implementation

In [None]:
related = df[(df.Sequence.str.contains("Related content")) & (df.Sequence.str.contains("relatedLinkClicked"))].iloc[0]

In [None]:
related

In [None]:
explore = df[~(df.Sequence.str.contains("Related content")) & (df.Sequence.str.contains("relatedLinkClicked"))].iloc[0]

In [None]:
explore

In [None]:
explore.Event_cat_act_agg

In [None]:
# Decide whether a journey includes a related link click
def is_related(x):
    """Return True only if `x` contains both "relatedLinkClicked" and "Related content"."""
    for marker in ("relatedLinkClicked", "Related content"):
        if marker not in x:
            return False
    return True

In [None]:
# Sanity-check is_related on the example rows; every call receives a Sequence string.
print(is_related(related.Sequence))
print(is_related(df.Sequence.iloc[0]))
# `explore` is a whole row (a Series): `in` on a Series tests its index labels,
# not its values, so the Sequence text has to be passed explicitly.
print(is_related(explore.Sequence))

In [None]:
df["Has_Related"] = df["Sequence"].map(is_related)

In [None]:
df[df["Has_Related"]].Occurrences.sum()

### 2.4 Journeys per device
Counter({'desktop': 256791, 'tablet': 55546, 'mobile': 161638})

In [None]:
## Number of journeys coming from desktops
vol_desk = df["DesktopCount"].sum()
"{}% of journeys come from a desktop".format(round((vol_desk*100)/df.Occurrences.sum(),2))

In [None]:
## Number of journeys coming from mobiles
vol_mobile = df["MobileCount"].sum()
"{}% of journeys come from a mobile".format(round((vol_mobile*100)/df.Occurrences.sum(),2))

In [None]:
## Number of journeys coming from tablets
vol_tablet = df["TabletCount"].sum()
"{}% of journeys come from a tablet".format(round((vol_tablet*100)/df.Occurrences.sum(),2))

In [None]:
## Separate out desktop and mobile journeys (rows with at least one session from that device)
desktop_journeys = df[df.DesktopCount>0].copy(deep=True)
mobile_journeys = df[df.MobileCount>0].copy(deep=True)

In [None]:
## Compute number of journeys from a specific device that include related links
## Don't sum Occurrences here: it would also count sessions from the other devices,
## so sum the per-device count column instead
vol_desk_rel = desktop_journeys[desktop_journeys.Has_Related].DesktopCount.sum()
vol_mobile_rel = mobile_journeys[mobile_journeys.Has_Related].MobileCount.sum()

In [None]:
"{}% of desktop journeys include a \"Related content\" click".format(round((vol_desk_rel*100)/vol_desk,2))

In [None]:
"{}% of mobile journeys include a \"Related content\" click".format(round((vol_mobile_rel*100)/vol_mobile,2))

### 2.5 Chi-squared test
Compute observations for a contingency table

In [None]:
from scipy import stats

In [None]:
# 2x2 contingency table: rows are device (mobile, desktop); columns are
# (volume with a related-link click, volume without one).
obs = [[vol_mobile_rel,(vol_mobile-vol_mobile_rel)], [vol_desk_rel, (vol_desk-vol_desk_rel)]]
print(obs)
# Sanity check: each row must add back up to that device's total volume.
print(vol_mobile==sum(obs[0]), vol_desk==sum(obs[1]))

In [None]:
chi2, p, dof, ex = stats.chi2_contingency(obs)
chi2, p, dof, ex

In [None]:
# Same table, but lambda_="log-likelihood" selects the log-likelihood ratio
# (G-test) statistic instead of Pearson's chi-squared.
# NOTE: this rebinds `p` and `dof` from the previous cell.
g, p, dof, expctd = stats.chi2_contingency(obs, lambda_="log-likelihood")
g, p, dof, expctd

### 2.6 User journey length

In [None]:
def weight_seq_length(page_lengths, occurrences, name):
    """Expand (length, occurrences) pairs into one Series entry per occurrence.

    The occurrences are first totalled per distinct length; each length is
    then repeated that many times, in order of first appearance, and the
    result is returned as a Series called `name`.
    """
    totals = Counter()
    for seq_len, weight in zip(page_lengths, occurrences):
        totals[seq_len] += weight
    expanded = [seq_len for seq_len, weight in totals.items() for _ in range(weight)]
    return pd.Series(expanded, name=name)

In [None]:
# weight_seq_length(df.PageSeq_Length,df.Occurrences).describe().apply(lambda x: format(x, 'f'))
# weight_seq_length(df[df.Has_Related].PageSeq_Length,df[df.Has_Related].Occurrences).describe().apply(lambda x: format(x, 'f'))

### Per device

In [None]:
# weight_seq_length(desktop_journeys.PageSeq_Length, desktop_journeys.DesktopCount).describe().apply(lambda x: format(x, 'f'))
# weight_seq_length(mobile_journeys.PageSeq_Length, mobile_journeys.MobileCount).describe().apply(lambda x: format(x, 'f'))

### Per device + has_related

In [None]:
desk_rel_journeys = desktop_journeys[desktop_journeys.Has_Related]
mobile_rel_journeys = mobile_journeys[mobile_journeys.Has_Related]

In [None]:
# weight_seq_length(desk_rel_journeys.PageSeq_Length, desk_rel_journeys.DesktopCount).describe().apply(lambda x: format(x, 'f'))
# weight_seq_length(mobile_rel_journeys.PageSeq_Length, mobile_rel_journeys.MobileCount).describe().apply(lambda x: format(x, 'f'))

#### Everything together

In [None]:
def describe_dfs(to_eval):
    """Build a table of descriptive statistics, one column per entry of `to_eval`.

    Each entry is a (lengths, occurrences, name) triple. The lengths are
    weighted by occurrences via `weight_seq_length`, summarised with
    `describe()`, and every statistic is formatted to three decimal places.
    """
    summaries = {}
    for length, occ, name in to_eval:
        summary = weight_seq_length(length, occ, name).describe()
        summaries[summary.name] = summary.apply(lambda x: format(x, '.3f'))
    return pd.DataFrame(summaries)

In [None]:
list_of_cols = [[df.PageSeq_Length,df.Occurrences, "All_Journeys"],
                [df[df.Has_Related].PageSeq_Length,df[df.Has_Related].Occurrences, "All_Journeys_Related"],
                [desktop_journeys.PageSeq_Length,desktop_journeys.DesktopCount,"All_Desktop"],
                [mobile_journeys.PageSeq_Length, mobile_journeys.MobileCount, "All_Mobile"],
                [desk_rel_journeys.PageSeq_Length, desk_rel_journeys.DesktopCount, "Desktop_Related"],
                [mobile_rel_journeys.PageSeq_Length, mobile_rel_journeys.MobileCount, "Mobile_Related"]]

In [None]:
describe_dfs(list_of_cols)

#### Exclude user journeys of length 1

In [None]:
df_dlo = df[df.PageSeq_Length > 1]

In [None]:
## Separate out desktop and mobile journeys from the length > 1 subset,
## then keep those that include a related link click
desktop_journeys_dlo = df_dlo[df_dlo.DesktopCount>0].copy(deep=True)
mobile_journeys_dlo =  df_dlo[df_dlo.MobileCount>0].copy(deep=True)
desk_rel_journeys_dlo = desktop_journeys_dlo[desktop_journeys_dlo.Has_Related]
mobile_rel_journeys_dlo = mobile_journeys_dlo[mobile_journeys_dlo.Has_Related]

In [None]:
list_of_columns = [[df_dlo.PageSeq_Length,df_dlo.Occurrences],
                   [df_dlo.PageSeq_Length,df_dlo.Occurrences]
                   [desktop_journeys_dlo.PageSeq_Length,desktop_journeys_dlo.DesktopCount,"All_Desktop"],
                   [mobile_journeys_dlo.PageSeq_Length, mobile_journeys_dlo.MobileCount, "All_Mobile"],
                   [desk_rel_journeys_dlo.PageSeq_Length, desk_rel_journeys_dlo.DesktopCount, "Desktop_Related"],
                   [mobile_rel_journeys_dlo.PageSeq_Length, mobile_rel_journeys_dlo.MobileCount, "Mobile_Related"]]

In [None]:
describe_dfs(list_of_cols)