In [None]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.disable_max_rows()
from datetime import datetime
from scipy.stats import ttest_ind

# A/B test analysis

## Assignments

In [None]:
data = pd.read_csv("data/assignments.csv")

In [None]:
data.head()

In [None]:
# Try the timestamp format on the first row before converting the whole column.
first_ts = datetime.strptime(data.loc[0, 'ts'], '%Y-%m-%dT%H:%M:%SZ')
print(first_ts.strftime("%Y-%m-%d"))

In [None]:
# Derive a calendar-day column ('YYYY-MM-DD' strings) from the 'ts' column.
# The column is parsed in one vectorized call instead of a Python-level
# strptime/strftime round trip per row; the format is the one the original
# per-row parser used. Missing timestamps now become missing 'dt' values
# instead of raising inside the lambda.
data['dt'] = (
    pd.to_datetime(data['ts'], format='%Y-%m-%dT%H:%M:%SZ')
    .dt.strftime('%Y-%m-%d')
)

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data.groupby(['groupid']).count()

In [None]:
data_count = data.groupby(['groupid','dt']).count().reset_index()

In [None]:
data_count.head()

In [None]:
# Line chart of the per-day 'userid' counts, one line per group.
(
    alt.Chart(data_count)
    .mark_line(size=3)
    .encode(
        x='dt',
        y='userid',
        color='groupid:O',
        tooltip=['userid'],
    )
    .properties(width=600, height=400)
)

## Pre-test metrics

### User activity

In [None]:
data_act = pd.read_csv("data/activity_all.csv")

In [None]:
data_act.head()

In [None]:
data_act.groupby(['groupid','dt']).describe()

In [None]:
# Keep rows with positive activity, count them per (day, group), and preview.
active_rows = data_act.query('activity_level > 0')
active_rows.groupby(['dt', 'groupid']).count().reset_index().head()

In [None]:
# Number of rows with positive activity per (day, group), plotted as one line per group.
daily_active = (
    data_act.query('activity_level > 0')
    .groupby(['dt', 'groupid'])
    .count()
    .reset_index()
)
(
    alt.Chart(daily_active)
    .mark_line(size=3)
    .encode(
        x='dt',
        y='userid',
        color='groupid:O',
        tooltip=['userid'],
    )
    .properties(width=600, height=400)
)

In [None]:
(
    data_act.query('activity_level > 0 and groupid == 0 and dt >= "2021-11-01"')
    .groupby(['dt','groupid']).count().reset_index()[['groupid','activity_level']].describe()
)

In [None]:
(
    data_act.query('activity_level > 0 and groupid == 1 and dt >= "2021-11-01"')
    .groupby(['dt','groupid']).count().reset_index()[['groupid','activity_level']].describe()
)

In [None]:
data_act.query('dt >= "2021-11-01"').groupby(['groupid']).describe()

In [None]:
data_act.query('dt < "2021-11-01"').groupby('groupid').describe()

In [None]:
data_act_count = data_act.query('activity_level > 0').groupby(['groupid','dt']).count().reset_index()

In [None]:
data_act_count.head()

In [None]:
# Daily count of active users, one line per group.
(
    alt.Chart(data_act_count)
    .mark_line(size=3)
    .encode(
        x='dt',
        y='userid',
        color='groupid:O',
        tooltip=['userid'],
    )
    .properties(width=600, height=400)
)

### Comparing the activity between the groups

By the activity levels

In [None]:
# Activity levels of group 0 as a plain NumPy array (the input shape used for the t-test).
group0_rows = data_act.query('groupid == 0')
group0_rows['activity_level'].to_numpy()

In [None]:
res = ttest_ind(data_act.query('groupid == 0 and dt >= "2021-11-01"')['activity_level'].to_numpy(),
                data_act.query('groupid == 1 and dt >= "2021-11-01"')['activity_level'].to_numpy()).pvalue

print(res)

In [None]:
"{:.100f}".format(res)

By the number of active users

In [None]:
before = data_act_count.query('dt < "2021-11-01"')

In [None]:
after = data_act_count.query('dt >= "2021-11-01"')

In [None]:
before.head()

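Check for pre-test bias in activity: before 2021-11-01, the two groups should show no significant difference in the daily number of active users.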

In [None]:
np.mean(before.query('groupid == 0')['userid'].to_numpy())

In [None]:
np.mean(before.query('groupid == 1')['userid'].to_numpy())

In [None]:
res = ttest_ind(before.query('groupid == 0')['userid'].to_numpy(), before.query('groupid == 1')['userid']
                .to_numpy()).pvalue

print(res)

In [None]:
"{:.100f}".format(res)

In [None]:
np.mean(after.query('groupid == 0')['userid'].to_numpy())

In [None]:
np.mean(after.query('groupid == 1')['userid'].to_numpy())

In [None]:
res = ttest_ind(after.query('groupid == 0')['userid'].to_numpy(), after.query('groupid == 1')['userid']
                .to_numpy()).pvalue

print(res)

In [None]:
"{:.100f}".format(res)

### Click through rate (CTR)

In [None]:
data_ctr = pd.read_csv("data/ctr_all.csv")

In [None]:
data_ctr.head()

In [None]:
# Mean of every numeric column per (group, day); the chart below reads 'ctr'.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently and raises TypeError instead, while older
# versions already behaved as if this flag were set.
# NOTE(review): whether data_ctr actually has a non-numeric, non-key column
# (e.g. a string user id) is not visible here - confirm against the CSV.
data_ctr_avg = (
    data_ctr.groupby(['groupid', 'dt'])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Daily mean CTR, one line per group.
(
    alt.Chart(data_ctr_avg)
    .mark_line(size=5)
    .encode(
        x='dt',
        y='ctr',
        color='groupid:O',
        tooltip=['ctr'],
    )
    .properties(width=600, height=400)
)

In [None]:
before = data_ctr.query('dt < "2021-11-01"')[['groupid', 'ctr']]

In [None]:
after = data_ctr.query('dt >= "2021-11-01"')[['groupid', 'ctr']]

In [None]:
after

In [None]:
before.query('groupid == 0')['ctr'].to_numpy().mean()

In [None]:
before.query('groupid == 1')['ctr'].to_numpy().mean()

In [None]:
after.query('groupid == 0')['ctr'].to_numpy().mean()

In [None]:
after.query('groupid == 1')['ctr'].to_numpy().mean()

In [None]:
before.query('groupid == 0')['ctr'].to_numpy().std()

In [None]:
before.query('groupid == 1')['ctr'].to_numpy().std()

In [None]:
after.query('groupid == 0')['ctr'].to_numpy().std()

In [None]:
after.query('groupid == 1')['ctr'].to_numpy().std()

In [None]:
res = ttest_ind(before.query('groupid == 0')['ctr'].to_numpy(), before.query('groupid == 1')['ctr']
                .to_numpy()).pvalue

print(res)

In [None]:
res = ttest_ind(after.query('groupid == 0')['ctr'].to_numpy(), after.query('groupid == 1')['ctr']
                .to_numpy()).pvalue
print(res)

In [None]:
"{:.100f}".format(res)