This project was inspired by an Indonesian-language article titled "Pengertian, Cara Kerja, dan Penerapan A/B Testing" ("Definition, How It Works, and Application of A/B Testing").

Here is the link to the article: https://softscients.com/2022/01/14/pengertian-cara-kerja-dan-penerapan-a-b-testing/

## Problem Statement

...

In [29]:
# Import Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm

In [8]:
# Load the hotel site-visit data; decimal="," makes pandas treat a comma as the decimal mark.
# NOTE(review): `pendapatan` still comes back as object dtype (see df.info()), presumably
# because its values also use "." as a thousands separator — TODO confirm before using it numerically.
df = pd.read_csv("Hotel Site Visit_csv.csv", decimal=",")
df

Unnamed: 0,variasi,menginap,hari,pendapatan
0,A,TIDAK,0.0,000
1,A,TIDAK,0.0,000
2,A,TIDAK,0.0,000
3,A,TIDAK,0.0,000
4,A,TIDAK,0.0,000
...,...,...,...,...
1446,B,YA,4.0,"22.281,00"
1447,B,TIDAK,0.0,000
1448,B,TIDAK,0.0,000
1449,B,TIDAK,0.0,000


In [6]:
# Inspect the column dtypes and non-null counts of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1451 entries, 0 to 1450
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   variasi     1451 non-null   object 
 1   menginap    1451 non-null   object 
 2   hari        1451 non-null   float64
 3   pendapatan  1451 non-null   object 
dtypes: float64(1), object(3)
memory usage: 45.5+ KB


In [9]:
# Only the variant label and the stay flag are needed for the conversion analysis
analysis_columns = ["variasi", "menginap"]
df = df.loc[:, analysis_columns]
df.head()

Unnamed: 0,variasi,menginap
0,A,TIDAK
1,A,TIDAK
2,A,TIDAK
3,A,TIDAK
4,A,TIDAK


In [11]:
# Number of non-null `variasi` entries (equals the row count when the column has no nulls)
df["variasi"].count()

1451

In [20]:
# Re-check the schema after narrowing the frame to `variasi` and `menginap`
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1451 entries, 0 to 1450
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   variasi   1451 non-null   object
 1   menginap  1451 non-null   object
dtypes: object(2)
memory usage: 22.8+ KB


In [23]:
# Rows of variant "A" whose `menginap` flag is "YA".
# Summing the boolean mask avoids `.count()[0]`, which indexes a string-labelled
# Series by integer position (deprecated positional fallback in pandas >= 2.1).
((df["variasi"] == "A") & (df["menginap"] == "YA")).sum()

20

In [24]:
def conversion_rate(variant, data=None):
    """Summarise conversions for one A/B variant.

    Parameters
    ----------
    variant : str
        Value of the ``variasi`` column that identifies the variant (e.g. "A").
    data : pandas.DataFrame, optional
        Frame containing ``variasi`` and ``menginap`` columns. When omitted,
        the notebook-level ``df`` is used, matching the original behaviour.

    Returns
    -------
    dict
        ``conversion``: rows of the variant whose ``menginap`` equals "YA";
        ``visitor``: all rows of the variant;
        ``rate``: ``conversion / visitor`` (NaN when the variant has no rows).
    """
    frame = df if data is None else data

    # Boolean masks summed directly: no reliance on `.count()[0]`, whose integer
    # lookup on a string-labelled Series is a deprecated positional fallback.
    is_variant = frame["variasi"] == variant
    stayed = frame["menginap"] == "YA"

    visitor = int(is_variant.sum())
    conversion = int((is_variant & stayed).sum())

    # An unknown/empty variant yields NaN instead of raising on division by zero.
    rate = conversion / visitor if visitor else float("nan")

    return {
        "conversion": conversion,
        "visitor": visitor,
        "rate": rate
    }

In [25]:
# Conversion summary (conversion, visitor, rate) for variant "A"
variant_A = conversion_rate("A")

In [26]:
# Display the summary dict for variant "A"
variant_A

{'conversion': 20, 'visitor': 721, 'rate': 0.027739251040221916}

In [27]:
# Conversion summary (conversion, visitor, rate) for variant "B"
variant_B = conversion_rate("B")

In [28]:
# Display the summary dict for variant "B"
variant_B

{'conversion': 37, 'visitor': 730, 'rate': 0.050684931506849315}

In [31]:
# Two-proportion z-test of variant B against variant A (5% significance level)

# Pooled conversion proportion across both variants
conversions_total = variant_A["conversion"] + variant_B["conversion"]
visitors_total = variant_A["visitor"] + variant_B["visitor"]
pool = conversions_total / visitors_total

# Standard error of the difference, based on the pooled proportion
inverse_sizes = (1 / variant_A["visitor"]) + (1 / variant_B["visitor"])
se_pool = np.sqrt(pool * (1 - pool) * inverse_sizes)

# Absolute difference in rates (B minus A) and its z statistic
diff_proportion = variant_B["rate"] - variant_A["rate"]
z_score = diff_proportion / se_pool

# Upper-tail probability of the observed z (cdf evaluated at -z).
# NOTE(review): this p-value is one-tailed, while the margin of error below uses
# the 0.975 quantile (two-sided 95%) — confirm which convention is intended.
pvalue = norm.cdf(-z_score)
margin_err = se_pool * norm.ppf(0.975)

# Relative uplift of B over A, in percent
increased = ((variant_B["rate"] / variant_A["rate"]) - 1) * 100

In [33]:
# Pair each label with its statistic so names and values cannot drift out of order
summary_rows = [
    ("Estimated Difference", diff_proportion),
    ("Relative Uplift (%)", increased),
    ("Pooled Sample Proportion", pool),
    ("Standard Error of Difference", se_pool),
    ("Z-score", z_score),
    ("P-value", pvalue),
    ("Margin of Error", margin_err),
]
metric_names, metric_values = zip(*summary_rows)

# Two-column summary table: one row per statistic
result = pd.DataFrame({
    "metric": list(metric_names),
    "value": list(metric_values)
})
result

Unnamed: 0,metric,value
0,Estimated Difference,0.022946
1,Relative Uplift (%),82.719178
2,Pooled Sample Proportion,0.039283
3,Standard Error of Difference,0.0102
4,Z-score,2.249546
5,P-value,0.012239
6,Margin of Error,0.019992
