In [1]:
import numpy as np

In [44]:
def build_transition_dict(file: str) -> dict:
    '''Compute the relative frequency of each page-to-page transition.

    Each non-blank line of `file` is expected to hold exactly two
    comma-separated fields, "s,e": the start page and the end page of one
    transition.

    Args:
        file: path of the text file to read.

    Returns:
        A dictionary with key = (s, e) and value = number of (s, e)
        transitions divided by the total number of transitions leaving s, so
        the values sharing the same s sum to 1.

    Raises:
        ValueError: if a non-blank line does not split into exactly two fields.
    '''
    transitions = {}
    trans_outgoing = {}
    # The context manager closes the file even if a malformed line raises.
    with open(file) as lines:
        for line in lines:
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no transition.
                continue
            s, e = line.split(',')
            transitions[(s, e)] = transitions.get((s, e), 0) + 1
            trans_outgoing[s] = trans_outgoing.get(s, 0) + 1
    # Normalise each count over the number of outgoing transitions of its start page.
    return {
        (s, e): count / trans_outgoing[s]
        for (s, e), count in transitions.items()
    }

In [45]:
# Create the transition dictionary from the input file.
# key = (s, e), where s and e are the codes of the starting and ending page respectively.
# value = frequency of the transition, relative to the total number of transitions from s.
# Special keys:
#   s="-1" means that e is the landing page
#   e="B" means that the user left the website from page s (Bounce)
#   e="C" means that the user completed a transaction from page s (Conversion)
# Note: the loader defined in this notebook is build_transition_dict; the
# previous call used a name that is not defined here and raised NameError on a fresh kernel.
transition_dict = build_transition_dict('./data/site_data.csv')

In [58]:
# Landing page rate
# Keep only the transitions whose start code is '-1', i.e. those whose end
# page is a landing page, preserving the dictionary's iteration order.
landing_rates = [
    (page, freq)
    for (start, page), freq in transition_dict.items()
    if start == '-1'
]

for page, freq in landing_rates:
    print(f"{page}: {freq*100:.1f}%")

# max() returns the first maximal element, so seeding the candidates with
# ('', 0) reproduces a strict "greater than" scan: the seed survives unless a
# page has a frequency above 0, and ties go to the page seen first.
e_max, e_max_freq = max([('', 0)] + landing_rates, key=lambda entry: entry[1])
print(f"most frequent: {e_max} ({e_max_freq*100:.1f}%)")

8: 10.2%
2: 9.5%
5: 9.8%
9: 10.4%
0: 10.3%
6: 9.8%
7: 10.0%
1: 10.3%
4: 10.2%
3: 9.5%
most frequent: 9 (10.4%)


In [59]:
# Bouncing rate
# For every page s, print the share of its outgoing transitions that end in
# 'B' (the user left the website), and track the page with the highest share.
b_max, b_max_freq = ('', 0)
for (s, e), v in transition_dict.items():
    if e == 'B':
        print(f"{s}: {v*100:.1f}%")
        # Compare against the running maximum of this cell; the previous code
        # compared against an unrelated name that this notebook never defines.
        if v > b_max_freq:
            b_max = s
            b_max_freq = v
print(f"most frequent: {b_max} ({b_max_freq*100:.1f}%)")

1: 12.6%
2: 12.6%
8: 12.5%
6: 12.1%
7: 12.4%
3: 12.7%
4: 12.6%
5: 12.4%
0: 12.8%
9: 13.2%
most frequent: 9 (13.2%)


In [63]:
# Conversion rate
# For every page s, print the share of its outgoing transitions that end in
# 'C' (the user completed a transaction), and track the page with the highest share.
c_max, c_max_freq = ('', 0)
for (s, e), v in transition_dict.items():
    if e == 'C':
        print(f"{s}: {v*100:.1f}%")
        # Compare against the running maximum of this cell; the previous code
        # compared against an unrelated name that this notebook never defines.
        # With a strict '>' the first page reaching the maximum is kept on ties.
        if v > c_max_freq:
            c_max = s
            c_max_freq = v
print(f"most frequent: {c_max} ({c_max_freq*100:.1f}%)")

7: 12.2%
3: 12.1%
9: 12.1%
0: 12.0%
4: 12.6%
2: 12.3%
1: 12.7%
6: 12.2%
8: 12.8%
5: 12.8%
most frequent: 5 (12.8%)
