In [211]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import norm
import scipy.stats as stats
import statsmodels.stats.api as sms

In [212]:
# Location of the A/B test export, kept in one named constant so the load
# call does not embed the path directly.
DATA_PATH = "/Users/amastikbayev/Downloads/ab_data.csv"
df = pd.read_csv(DATA_PATH)

In [213]:
# Count missing values per column; a 0 means that column has no missing entries.
# isna() flags every kind of missing value pandas recognises (NaN, None, NaT).
df.isna().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [214]:
# isnull() is an alias of isna(), so this repeats the same per-column
# missing-value count as df.isna().sum(); it does not test for a different kind of null.
df.isnull().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [215]:
df.shape # (number of rows, number of columns) of the data as loaded, before any cleaning

(294478, 5)

In [216]:
# preview the first 10 rows to see the columns and what typical values look like
df.head(10)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1
5,936923,2017-01-10 15:20:49.083499,control,old_page,0
6,679687,2017-01-19 03:26:46.940749,treatment,new_page,1
7,719014,2017-01-17 01:48:29.539573,control,old_page,0
8,817355,2017-01-04 17:58:08.979471,treatment,new_page,1
9,839785,2017-01-15 18:11:06.610965,treatment,new_page,1


In [217]:
# user_id: a unique number identifying each user
# timestamp: the time at which the user visited the page
# group: the experiment arm; "treatment" is meant to see the new page and "control" the old page
# landing_page: the page the user actually saw (old_page or new_page)
# converted: 1 means the user purchased the product, 0 means they did not

In [218]:
# cross-tabulate experiment arm (rows) against the page actually shown (columns)
pd.crosstab(index=df['group'], columns=df['landing_page'])

landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,1928,145274
treatment,145311,1965


In [219]:
# we can see a misalignment between group and landing page: control users
# should only see old_page and treatment users should only see new_page,
# yet 1928 control rows show new_page and 1965 treatment rows show old_page,
# so we should clean the data

In [220]:
# data cleaning
# discard rows where the group and the page shown disagree:
# control users who saw new_page and treatment users who saw old_page

control_saw_new = (df.group == 'control') & (df.landing_page == "new_page")
treatment_saw_old = (df.group == 'treatment') & (df.landing_page == "old_page")
# filtering with the combined mask removes the same rows as dropping them by
# index label, given the unique default index produced by read_csv
df = df[~(control_saw_new | treatment_saw_old)]

df.shape

(290585, 5)

In [221]:
# the mismatched rows are gone; re-run the cross-tab to confirm that the
# control/new_page and treatment/old_page cells are now zero

pd.crosstab(index=df['group'], columns=df['landing_page'])

landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,0,145274
treatment,145311,0


In [222]:
# now check for duplicates: count the distinct values in each column so the
# user_id count can be compared with the number of rows reported by df.shape
df.nunique()

user_id         290584
timestamp       290585
group                2
landing_page         2
converted            2
dtype: int64

In [223]:
# one user_id appears twice; keep that user's first row and discard the repeat
df = df.drop_duplicates(subset="user_id", keep="first")
df.shape

(290584, 5)

In [224]:
#this is our final dataset to work with

# let's define the problem

# I am a product analyst for a website that sells a product, and I need to check the conversion rate

# of the old page and see whether the new page has a better conversion rate

# and whether that improvement is statistically significant

# I set the significance level at the standard 5% and require a minimum lift of 2 percentage points

# in conversion rate in order to invest in the new page

# we also set the statistical power at 0.8 as a default value

In [225]:
# the old page conversion rate is 12%, so the new one should be at least 14%

# let's randomly select a sample from each group and see whether there is a difference of at least

# 2 percentage points; with this information we can calculate the sample size we need

In [226]:
# Effect size for comparing the expected baseline rate with the minimum
# target rate; it comes out negative because the first rate is the smaller one.
baseline_rate = 0.12
target_rate = 0.14

effect_size = sms.proportion_effectsize(baseline_rate, target_rate)

effect_size

-0.05951079608252363

In [227]:
from math import ceil

# Solve for the per-group sample size needed to detect `effect_size` with
# 80% power at a 5% significance level and equally sized groups (ratio=1).
power_analysis = sms.NormalIndPower()
required_n = ceil(
    power_analysis.solve_power(effect_size=effect_size, alpha=0.05, power=0.8, ratio=1)
)  # round up to a whole number of users

print(required_n)

4433


In [228]:
import random

# fixed seed so that the same rows are drawn on every run
seed = 18

# draw required_n users from each experiment arm
is_control = df['group'] == 'control'
is_treatment = df['group'] == 'treatment'
old_group = df[is_control].sample(n=required_n, random_state=seed)
new_group = df[is_treatment].sample(n=required_n, random_state=seed)

In [229]:
# stack the two samples row-wise and confirm each arm contributes required_n rows
df_sample = pd.concat([old_group, new_group])
df_sample['group'].value_counts()

group
control      4433
treatment    4433
Name: count, dtype: int64

In [230]:
# Conversion rate per group = mean of the 0/1 `converted` flag.
# The built-in groupby mean returns the same values as agg(np.mean) but does not
# trigger the pandas FutureWarning about passing a NumPy callable, so there is
# no need to silence every warning for the rest of the notebook.
conversion_rates = df_sample.groupby('group')['converted'].mean()
conversion_rates

group
control      0.118656
treatment    0.123844
Name: converted, dtype: float64

In [231]:
# lets analyse the results

In [232]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

# per-arm series of 0/1 conversion flags taken from the sample
old_results = df_sample.loc[df_sample['group'] == 'control', 'converted']
new_results = df_sample.loc[df_sample['group'] == 'treatment', 'converted']

In [267]:
# number of observations in each arm, in (old, new) order
n_old, n_new = old_results.count(), new_results.count()
nobs = [n_old, n_new]

In [234]:
# number of conversions in each arm, in the same (old, new) order as nobs
old_bought = old_results.sum()
new_bought = new_results.sum()
bought = (old_bought, new_bought)
bought

(526, 549)

In [261]:
# so 526 for old page group and 549 for new page group, both out of 4433

In [273]:
# z-test on the two conversion proportions (old vs new)
z_stat, pval = proportions_ztest(count=bought, nobs=nobs)

# 95% confidence interval for each arm's conversion rate; the call returns the
# lower bounds and the upper bounds as two sequences in (old, new) order
ci_lower, ci_upper = proportion_confint(count=bought, nobs=nobs, alpha=0.05)
lower_old, lower_new = ci_lower
upper_old, upper_new = ci_upper

In [285]:
# report the z statistic returned by proportions_ztest
print(f"The Z statistic of the research is: {z_stat}")

The Z statistic of the research is: -0.7483260525413273


In [287]:
# report the p-value returned by proportions_ztest
print(f"The p-value of the research is: {pval}")

The p-value of the research is: 0.45426351388883957


In [289]:
# the f-string formats the pair as a tuple: (lower bound, upper bound) for the old page
print(f"The 95% confidence interval of the old page is: {lower_old, upper_old}")

The 95% confidence interval of the old page is: (0.10913599562771406, 0.12817508039303938)


In [291]:
# the f-string formats the pair as a tuple: (lower bound, upper bound) for the new page
print(f"The 95% confidence interval of the new page is: {lower_new, upper_new}")

The 95% confidence interval of the new page is: (0.11414712396675486, 0.133540672108138)


In [None]:
# interpretation and conclusion

In [None]:
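# the p-value of 0.454 is well above the 0.05 significance level, so
# we fail to reject the null hypothesis
# which means that:
# this sample gives no statistically significant evidence that the new design
# converts differently from the old design (this is not proof that there is no difference);
# the two 95% confidence intervals overlap, and the observed lift of about
# 0.5 percentage points is below the 2-point minimum we required,
# so on this evidence we should not invest in creating the new design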