In [2]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore') 
random.seed(42)

# Part 1: Statistics 

**Loading Data**

In [31]:
df=pd.read_csv('/kaggle/input/ab-test/ab_data.csv')

In [32]:
df.shape

(294478, 5)

In [33]:
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [34]:
# Getting information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [35]:
#show unique values
df.nunique()

user_id         290584
timestamp       294478
group                2
landing_page         2
converted            2
dtype: int64

In [36]:
# The proportion of users converted in the data
df['converted'].value_counts()[1] / (df['converted'].value_counts()[0] + df['converted'].value_counts()[1])

0.11965919355605512

In [37]:
# The number of times the "new_page" and "treatment" don't line up.
df.query("group == 'treatment' and landing_page != 'new_page'").count()[0] + \
df.query("group != 'treatment' and landing_page == 'new_page'").count()[0]

3893

In [38]:
# Regrouping  with new variables in the dataset

df.drop(df.query("group == 'treatment' and landing_page != 'new_page'").index, inplace = True)
df.drop(df.query("group != 'treatment' and landing_page == 'new_page'").index, inplace = True)
df2 = df
df2.shape

(290585, 5)

In [39]:
# Check if all of the correct rows are removed - that should be 0

df2[((df2['group'] == 'treatment') == (df2['landing_page'] == 'new_page')) == False].shape[0]

0

In [40]:
# Check again for unique values
df2.nunique()

user_id         290584
timestamp       290585
group                2
landing_page         2
converted            2
dtype: int64

In [45]:
# Find out a duplicated from "user_id" column
duplicate=df2[df2['user_id'].duplicated()]
duplicate

Unnamed: 0,user_id,timestamp,group,landing_page,converted
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [47]:
# Remove the duplicated
df2.drop(duplicate.index, inplace = True)

In [48]:
# The probability converted in the control group
control = df2.query("group == 'control' and converted == 1").count()[4] / df2[df2['group'] == 'control'].count()[4]
control

0.1203863045004612

In [49]:
# The probability converted in the treatment group
treatment = df2.query("group == 'treatment' and converted == 1").count()[4] / df2[df2['group'] == 'treatment'].count()[4]
treatment

0.11880806551510564

In [50]:
#The probability received the new page

df2.query("landing_page == 'new_page'").count()[3] / df2['landing_page'].count()

0.5000619442226688

# Part 2 : A/B Testing

In [51]:
# P_new under the null

p_new = df2.query("converted == 1").count()[4]/ df2["converted"].count()
p_new

0.11959708724499628

In [52]:
# P_old under the null

p_old = df2.query("converted == 1").count()[4]/ df2["converted"].count()
p_old

0.11959708724499628

In [53]:
n_new = df2.query('group == "treatment"').count()[0]
n_new

145310

In [54]:
n_old = df2.query('group == "control"').count()[0]
n_old

145274

In [55]:
p_dif = p_new - p_old
p_dif

0.0

In [56]:
# Store 1, 0 in new_page_converted 
new_page_converted = np.random.choice([0,1], n_new, p=(p_new, 1-p_new))
new_page_converted

array([1, 1, 1, ..., 1, 1, 1])

In [57]:
# Store 1, 0 in old_page_converted 
old_page_converted = np.random.choice([0,1], n_old, p=(p_old, 1-p_old))
old_page_converted

array([1, 1, 1, ..., 1, 1, 1])

In [58]:
new_page_converted.mean() - old_page_converted.mean()

0.0024867355060183005

In [59]:
# Apply A/B testing with statistics package

import statsmodels.api as sm

convert_old = old_page_converted.mean()
convert_new = new_page_converted.mean()

n_old = n_old
n_new = n_new

In [60]:
z_score, p_value = sm.stats.proportions_ztest(np.array([convert_new,convert_old]),np.array([n_new,n_old]), alternative = 'larger')
z_score, p_value

(0.0017095419631798037, 0.4993179917629658)

In [61]:
from scipy.stats import norm

# Telling how significant the z-score is
norm.cdf(z_score)

0.5006820082370342

In [62]:
# Tell what the critical value at 95% confidence is

norm.ppf(1-(0.05/2))

1.959963984540054

# Part 3 : A regression approach

In [63]:
# Logistic Regression approach

df2['intercept'] = 1
df2 = df2.join(pd.get_dummies(df2['landing_page']))
df2['ab_page'] = pd.get_dummies(df2['group']) ['treatment']

In [64]:
df2.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted,intercept,new_page,old_page,ab_page
0,851104,2017-01-21 22:11:48.556739,control,old_page,0,1,False,True,False
1,804228,2017-01-12 08:01:45.159739,control,old_page,0,1,False,True,False
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0,1,True,False,True
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0,1,True,False,True
4,864975,2017-01-21 01:52:26.210827,control,old_page,1,1,False,True,False


In [65]:
print(df2.dtypes)

user_id          int64
timestamp       object
group           object
landing_page    object
converted        int64
intercept        int64
new_page          bool
old_page          bool
ab_page           bool
dtype: object


In [66]:
df2['converted'] = pd.to_numeric(df2['converted'])
df2['intercept'] = pd.to_numeric(df2['intercept'])
df2['ab_page'] = pd.to_numeric(df2['ab_page'])

In [67]:
df2.isnull().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
intercept       0
new_page        0
old_page        0
ab_page         0
dtype: int64

In [68]:
 np.asarray(df2)

array([[851104, '2017-01-21 22:11:48.556739', 'control', ..., False,
        True, False],
       [804228, '2017-01-12 08:01:45.159739', 'control', ..., False,
        True, False],
       [661590, '2017-01-11 16:55:06.154213', 'treatment', ..., True,
        False, True],
       ...,
       [734608, '2017-01-22 11:45:03.439544', 'control', ..., False,
        True, False],
       [697314, '2017-01-15 01:20:28.957438', 'control', ..., False,
        True, False],
       [715931, '2017-01-16 12:40:24.467417', 'treatment', ..., True,
        False, True]], dtype=object)

In [69]:
df2['ab_page'] = df2['ab_page'].astype(int)  # Convert to integer

In [70]:
logit = sm.Logit(df2['converted'],df2[['intercept','ab_page']])

In [71]:
# Summary: Logistic Regression model  

results = logit.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.366118
         Iterations 6


0,1,2,3
Dep. Variable:,converted,No. Observations:,290584.0
Model:,Logit,Df Residuals:,290582.0
Method:,MLE,Df Model:,1.0
Date:,"Thu, 28 Mar 2024",Pseudo R-squ.:,8.077e-06
Time:,04:17:26,Log-Likelihood:,-106390.0
converged:,True,LL-Null:,-106390.0
Covariance Type:,nonrobust,LLR p-value:,0.1899

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-1.9888,0.008,-246.669,0.000,-2.005,-1.973
ab_page,-0.0150,0.011,-1.311,0.190,-0.037,0.007
