In [1]:
%%markdown
# A/B testing Data Generation and Experiments 

# A/B testing Data Generation and Experiments 


In [2]:
import pandas as pd 
import numpy as np 
from random import random
import scipy.stats as scs
import pandas as pd
import datetime
import math
import statsmodels.api as sm
import scipy.stats.distributions as dist

In [3]:
%%markdown
## Data Generation

## Data Generation


In [4]:
# Seed NumPy's global random generator so the random draws in the cells below are repeatable.
np.random.seed(10)

In [5]:
def random_dates(start, end, n=10):
    """Return `n` random timestamps drawn between `start` and `end`.

    `start` and `end` must expose a nanosecond `.value` (as pandas Timestamps
    do). Both bounds are truncated to whole epoch seconds, integers are drawn
    with `np.random.randint` (lower bound inclusive, upper bound exclusive),
    and the result is converted back with `pd.to_datetime(..., unit='s')`.
    """
    lo_sec, hi_sec = (bound.value // 10**9 for bound in (start, end))
    epoch_seconds = np.random.randint(lo_sec, hi_sec, n)
    return pd.to_datetime(epoch_seconds, unit='s')

In [6]:
# Simulate 10,000 customers with random visit times inside a two-day window.
start = pd.to_datetime('2020-01-01')
end = pd.to_datetime('2020-01-03')
dates = random_dates(start, end, n=10000)
customer_ids = list(range(1, 10001))

# Build the frame in one go and order it chronologically.
df = pd.DataFrame({'Customer Id': customer_ids, 'Date': dates}).sort_values('Date')

# Assign each row to a group with one uniform draw per row: 'A' when the draw
# is below 0.501, otherwise 'B'.
group = ['A' if 0.501 > np.random.random() else 'B' for _ in range(len(df))]
df['Group'] = group

# Every row starts with a flat 5% conversion rate.
df['Conversion Rate'] = 0.05
# Time-of-day component of 'Date', used for the time-window filters.
df['Time'] = df['Date'].dt.time
# Placeholder; the actual outcome is simulated in a later cell.
df['Conversion Status'] = 0

In [7]:
# Conversion probabilities used by the simulation.
base_conversion = 0.05  # not referenced in the visible cells; 'Conversion Rate' is initialised with the literal 0.05
morning_conversion = 0.02  # assigned to group B rows timed from 08:00 up to (not including) 16:00
midnight_conversion = 0.05  # assigned to group B rows timed from 00:00 up to (not including) 08:00
evening_conversion = 0.1  # assigned to group B rows in the evening window starting at 16:00

In [8]:
# Only the treatment group (B) gets a time-of-day dependent conversion rate;
# group A rows keep whatever rate is already stored in 'Conversion Rate'.
is_treatment = df['Group'] == 'B'
time_of_day = df['Time']

in_morning = (time_of_day >= datetime.time(8,0,0)) & (time_of_day < datetime.time(16,0,0))
in_midnight = (time_of_day >= datetime.time(0,0,0)) & (time_of_day < datetime.time(8,0,0))
# No upper bound, so rows stamped exactly 23:59:59 belong to the evening
# window instead of falling outside every window.
in_evening = time_of_day >= datetime.time(16,0,0)

df.loc[is_treatment & in_morning, 'Conversion Rate'] = morning_conversion
df.loc[is_treatment & in_midnight, 'Conversion Rate'] = midnight_conversion
df.loc[is_treatment & in_evening, 'Conversion Rate'] = evening_conversion

# One uniform draw per row; a row converts (1) when its rate exceeds the draw.
draws = np.random.random(len(df))
status = (df['Conversion Rate'].to_numpy() > draws).astype('int64')
df['Conversion Status'] = status
# 'Time' stays in the frame because the per-window experiments filter on it.

In [9]:
# Display the simulated dataset (the stored output below is the truncated view of the frame).
df

Unnamed: 0,Customer Id,Date,Group,Conversion Rate,Time,Conversion Status
8850,8851,2020-01-01 00:00:21,B,0.05,00:00:21,0
6045,6046,2020-01-01 00:00:50,A,0.05,00:00:50,0
1973,1974,2020-01-01 00:00:50,B,0.05,00:00:50,0
6181,6182,2020-01-01 00:01:28,A,0.05,00:01:28,0
8107,8108,2020-01-01 00:01:46,A,0.05,00:01:46,0
...,...,...,...,...,...,...
7268,7269,2020-01-02 23:59:12,B,0.10,23:59:12,0
2012,2013,2020-01-02 23:59:15,B,0.10,23:59:15,0
9062,9063,2020-01-02 23:59:19,A,0.05,23:59:19,0
9203,9204,2020-01-02 23:59:36,A,0.05,23:59:36,0


In [10]:
%%markdown
# Experiment 1
## A/B testing across all days and complete population. 

# Experiment 1
## A/B testing across all days and complete population. 


In [11]:
# Per-group summary in a single pass: number of conversions, sample size and
# conversion rate. String aggregators are used instead of NumPy callables,
# which recent pandas versions warn about. 'count' equals the row count here
# because 'Conversion Status' is assigned a 0/1 value for every row.
df_summary = df.groupby('Group')['Conversion Status'].agg(
    **{'Conversion Status': 'sum', 'total': 'count', 'rate': 'mean'}
)
df_summary

Unnamed: 0_level_0,Conversion Status,total,rate
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,247,4938,0.05002
B,285,5062,0.056302


In [12]:
%%markdown
## Our two groups currently follow binomial distributions.

## Our two groups currently follow binomial distributions.


In [13]:
# Inputs of the two-proportion z-test, read from the per-group summary table.
p1 = df_summary.loc['A', 'rate']   # proportion of conversion of the control group
p2 = df_summary.loc['B', 'rate']   # proportion of conversion of the treatment group
n1 = df_summary.loc['A', 'total']
n2 = df_summary.loc['B', 'total']
# Pooled proportion of conversion over the full population.
p = (df_summary.loc['A', 'Conversion Status'] + df_summary.loc['B', 'Conversion Status'])/(n1+n2)
q1 = 1 - p1
q2 = 1 - p2
# Standard error of the difference in proportions, based on the pooled proportion.
se = np.sqrt(p*(1-p)*(1/n1 + 1/n2))
print(q1,q2,se,p)

0.9499797488861887 0.9436981430264717 0.004488986791931093 0.0532


In [14]:
%%markdown
### Setting Significance Level = 0.01 

### Null Hypothesis: p1 - p2 = 0 
This is assuming that there is no significant difference between the proportions of the two groups. 

### Alternate Hypothesis: p2 != p1 
We are performing a two-tailed test to see if the conversion rate of the treatment group is statistically different from the control group's.

### Setting Significance Level = 0.01 

### Null Hypothesis: p1 - p2 = 0 
This is assuming that there is no significant difference between the proportions of the two groups. 

### Alternate Hypothesis: p2 != p1 
We are performing a two-tailed test to see if the conversion rate of the treatment group is statistically different from the control group's.


In [15]:
# Best Estimate
be = p1 - p2 

# Test Statistic
ts = be/se
print(ts)

-1.3993371223564544


In [16]:
# Two-tailed p-value: twice the standard-normal mass below -|ts|.
lower_tail = dist.norm.cdf(-np.abs(ts))
p_value = 2*lower_tail
print(p_value)

0.16171191256845063


In [17]:
%%markdown
Our p-value is much greater than the significance level.
### Therefore, we fail to reject our null hypothesis.
### Inference: The two proportions are not significantly different, so this test gives no evidence that the treatment changes the overall conversion rate.

Our p-value is much greater than the significance level.
### Therefore, we fail to reject our null hypothesis.
### Inference: The two proportions are not significantly different, so this test gives no evidence that the treatment changes the overall conversion rate.


In [18]:
%%markdown
# Experiment 2
## A/B testing across all days and only morning 

# Experiment 2
## A/B testing across all days and only morning 


In [19]:
# Keep only rows whose time of day lies in the half-open window [08:00, 16:00).
morning_start = datetime.time(8,0,0)
morning_end = datetime.time(16,0,0)
df_mornings = df[df['Time'].ge(morning_start) & df['Time'].lt(morning_end)]

In [20]:
# Per-group summary of the morning subset in a single pass: number of
# conversions, sample size and conversion rate. String aggregators are used
# instead of NumPy callables, which recent pandas versions warn about.
# 'count' equals the row count as long as 'Conversion Status' has no missing values.
df_summary = df_mornings.groupby('Group')['Conversion Status'].agg(
    **{'Conversion Status': 'sum', 'total': 'count', 'rate': 'mean'}
)
df_summary

Unnamed: 0_level_0,Conversion Status,total,rate
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,92,1707,0.053896
B,41,1735,0.023631


In [21]:
# Inputs of the two-proportion z-test, read from the per-group summary table.
p1 = df_summary.loc['A', 'rate']   # proportion of conversion of the control group
p2 = df_summary.loc['B', 'rate']   # proportion of conversion of the treatment group
n1 = df_summary.loc['A', 'total']
n2 = df_summary.loc['B', 'total']
# Pooled proportion of conversion over the full population.
p = (df_summary.loc['A', 'Conversion Status'] + df_summary.loc['B', 'Conversion Status'])/(n1+n2)
q1 = 1 - p1
q2 = 1 - p2
# Standard error of the difference in proportions, based on the pooled proportion.
se = np.sqrt(p*(1-p)*(1/n1 + 1/n2))
print(q1,q2,se,p)

0.9461042765084944 0.9763688760806917 0.006570558267313205 0.03864032539221383


In [22]:
%%markdown
### Setting Significance Level = 0.01 

### Null Hypothesis: p1 - p2 = 0 
This is assuming that there is no significant difference between the proportions of the two groups. 

### Alternate Hypothesis: p2 != p1 
We are performing a two-tailed test to see if the conversion rate of the treatment group is statistically different from the control group's.

### Setting Significance Level = 0.01 

### Null Hypothesis: p1 - p2 = 0 
This is assuming that there is no significant difference between the proportions of the two groups. 

### Alternate Hypothesis: p2 != p1 
We are performing a two-tailed test to see if the conversion rate of the treatment group is statistically different from the control group's.


In [23]:
# Best Estimate
be = p1 - p2 

# Test Statistic
ts = be/se
print(ts)

4.606092563360348


In [24]:
# Two-tailed p-value: twice the standard-normal mass below -|ts|.
lower_tail = dist.norm.cdf(-np.abs(ts))
p_value = 2*lower_tail
# Fixed-point formatting so a very small value is not shown in scientific notation.
print('{:f}'.format(p_value))

0.000004


In [25]:
%%markdown
Our p-value is less than the significance level.
### Therefore, we reject our null hypothesis in favor of the alternative hypothesis.
### Inference: The two proportions are significantly different, thus the treatment does change the proportion of conversions in this time window.

Our p-value is less than the significance level.
### Therefore, we reject our null hypothesis in favor of the alternative hypothesis.
### Inference: The two proportions are significantly different, thus the treatment does change the proportion of conversions in this time window.


In [26]:
%%markdown
# Experiment 3
## A/B testing across all days and only Evenings. 

# Experiment 3
## A/B testing across all days and only Evenings. 


In [27]:
# Evening window: 16:00:00 through the end of the day. No upper bound is
# applied, so rows stamped exactly 23:59:59 are part of the subset.
df_evenings = df[df['Time'] >= datetime.time(16,0,0)]

In [28]:
# Per-group summary of the evening subset in a single pass: number of
# conversions, sample size and conversion rate. String aggregators are used
# instead of NumPy callables, which recent pandas versions warn about.
# 'count' equals the row count as long as 'Conversion Status' has no missing values.
df_summary = df_evenings.groupby('Group')['Conversion Status'].agg(
    **{'Conversion Status': 'sum', 'total': 'count', 'rate': 'mean'}
)
df_summary

Unnamed: 0_level_0,Conversion Status,total,rate
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,82,1635,0.050153
B,171,1681,0.101725


In [29]:
# Inputs of the two-proportion z-test, read from the per-group summary table.
p1 = df_summary.loc['A', 'rate']   # proportion of conversion of the control group
p2 = df_summary.loc['B', 'rate']   # proportion of conversion of the treatment group
n1 = df_summary.loc['A', 'total']
n2 = df_summary.loc['B', 'total']
# Pooled proportion of conversion over the full population.
p = (df_summary.loc['A', 'Conversion Status'] + df_summary.loc['B', 'Conversion Status'])/(n1+n2)
q1 = 1 - p1
q2 = 1 - p2
# Standard error of the difference in proportions, based on the pooled proportion.
se = np.sqrt(p*(1-p)*(1/n1 + 1/n2))
print(q1,q2,se,p)

0.9498470948012232 0.8982748364069006 0.009221121890509346 0.07629674306393244


In [30]:
%%markdown
### Setting Significance Level = 0.01 

### Null Hypothesis: p1 - p2 = 0 
This is assuming that there is no significant difference between the proportions of the two groups. 

### Alternate Hypothesis: p2 != p1 
We are performing a two-tailed test to see if the conversion rate of the treatment group is statistically different from the control group's.

### Setting Significance Level = 0.01 

### Null Hypothesis: p1 - p2 = 0 
This is assuming that there is no significant difference between the proportions of the two groups. 

### Alternate Hypothesis: p2 != p1 
We are performing a two-tailed test to see if the conversion rate of the treatment group is statistically different from the control group's.


In [31]:
# Best Estimate
be = p1 - p2 

# Test Statistic
ts = be/se
print(ts)

-5.592839895913565


In [32]:
# Two-tailed p-value: twice the standard-normal mass below -|ts|.
lower_tail = dist.norm.cdf(-np.abs(ts))
p_value = 2*lower_tail
# Ten decimal places so the small value is still visible in fixed-point form.
print('{:.10f}'.format(p_value))

0.0000000223


In [33]:
%%markdown
Our p-value is less than the significance level.
### Therefore, we reject our null hypothesis in favor of the alternative hypothesis.
### Inference: The two proportions are significantly different, thus the treatment does change the proportion of conversions in this time window.

Our p-value is less than the significance level.
### Therefore, we reject our null hypothesis in favor of the alternative hypothesis.
### Inference: The two proportions are significantly different, thus the treatment does change the proportion of conversions in this time window.


In [34]:
# Display the dataset without the helper columns; the result is not assigned, so `df` itself is unchanged.
helper_columns = ['Time', 'Conversion Rate']
df.drop(columns=helper_columns)

Unnamed: 0,Customer Id,Date,Group,Conversion Status
8850,8851,2020-01-01 00:00:21,B,0
6045,6046,2020-01-01 00:00:50,A,0
1973,1974,2020-01-01 00:00:50,B,0
6181,6182,2020-01-01 00:01:28,A,0
8107,8108,2020-01-01 00:01:46,A,0
...,...,...,...,...
7268,7269,2020-01-02 23:59:12,B,0
2012,2013,2020-01-02 23:59:15,B,0
9062,9063,2020-01-02 23:59:19,A,0
9203,9204,2020-01-02 23:59:36,A,0
