## A/B Test
### Dataset
https://www.kaggle.com/datasets/zhangluyuan/ab-testing?select=ab_data.csv


## Import

In [45]:
import pandas as pd
import numpy as np
import math 
from statsmodels.stats import api as sms
from scipy.stats import chi2_contingency
import lux
from lux.vis.VisList import VisList

## Loading

In [46]:
df_raw=pd.read_csv('C:/Users/Utente77/repos/AB_Test/dataset/ab_data.csv')

In [47]:
df_raw.shape

(294478, 5)

In [48]:
df_raw.head()

Lux detects that the attribute 'timestamp' may be temporal.
To display visualizations for these attributes accurately, please convert temporal attributes to Datetime objects.
For example, you can convert a Year attribute (e.g., 1998, 1971, 1982) using pd.to_datetime by specifying the `format` as '%Y'.

Here is a starter template that you can use for converting the temporal fields:
	df['timestamp'] = pd.to_datetime(df['timestamp'], format='<replace-with-datetime-format>')

See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
If timestamp is not a temporal attribute, please use override Lux's automatically detected type:
	df.set_data_type({'timestamp':'quantitative'})
Lux detects that the attribute 'timestamp' may be temporal.
To display visualizations for these attributes accurately, please convert temporal attributes to Datetime objects.
For example, you can convert a Year attribute (e.g., 1998, 1971, 1982) using pd.to_datetime by specifying t

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [49]:
VisList(["converted","landing_page",],df_raw)

LuxWidget(recommendations=[{'action': 'Vis List', 'description': 'Shows a vis list defined by the intent', 'vs…

## DOE

### Hypothesis

In [50]:
# H0: A conversao da nova pagina è de 13%
# H1: A conversao è diferente de 13%

### Parametres

In [67]:
## Confidence_level:
confidence_level = 0.95
    
## Significance_level:
significance_level= 0.05

## webpage conversion
p1=0.13
p2=0.15

                   
## Effect Size
effect_size=sms.proportion_effectsize(p1,p2)

## Statistic Power
statistic_power=0.8


### Sampling - Simple random

In [52]:
## sample size
sample_n=sms.NormalIndPower().solve_power( ## metade dos dados
    effect_size,
    power=statistic_power,
    alpha=significance_level
    )

sample_n=math.ceil(sample_n)

sample_n ## controle

4720

In [53]:
sample_total= 2*sample_n # samplig de p1 e p2
sample_total

9440

In [54]:
#sabendo que temos 10% de abertura mail para obter a amostra necessaria precisamos enviar 110% a mais
n_invio_mail=sample_n/p
n_invio_mail

47200.0

## Data prepare

In [55]:
df_aux=df_raw[['user_id','group']].groupby('user_id').count().reset_index().query('group >1')
df3=df_raw[~df_raw['user_id'].isin(df_aux['user_id'])]

## Sampling

In [56]:
df_control_sample=df3[df3['group']=='control'].sample(n=sample_n, random_state=32)
print('Size of control Group:{}'.format(df_control_sample.shape[0]))

Size of control Group:4720


In [57]:
df_treatment_sample=df3[df3['group']=='treatment'].sample(n=sample_n, random_state=32)
print('Size of treatment Group:{}'.format(df_treatment_sample.shape[0]))

Size of treatment Group:4720


## Conversion Tax 

In [58]:
converted=df_control_sample.loc[df_control_sample['converted']==1,'converted'].sum()
conversion_rate_control=converted/len(df_control_sample['converted'])
print('Conversion Rate - Control Group:{}'.format(conversion_rate_control))

converted=df_treatment_sample.loc[df_treatment_sample['converted']==1,'converted'].sum()
conversion_rate_treatment=converted/len(df_treatment_sample['converted'])
print('Conversion Rate - Treatment Group:{}'.format(conversion_rate_treatment))

df_ab=pd.concat([df_control_sample,df_treatment_sample])


Conversion Rate - Control Group:0.11864406779661017
Conversion Rate - Treatment Group:0.11970338983050847


## Hypothesis test

In [59]:
df_table=df_ab[['group','converted']].groupby('group').agg({'converted':['sum','count']})
df_table.columns=['converted','non_converted']#so pra melhorar a disposiçao da tabela
chi_val, pval, dof, expected = chi2_contingency(df_table)
print('p-value:{:.2f}:'.format(pval))
if pval < significance_level:
    print('Rejeita hipotese nula')
else:
    print('Falha em rejeitar a hipotese nula' )
df_table

p-value:0.91:
Falha em rejeitar a hipotese nula


Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

## Results conversion $

In [60]:
# current page =13%
# new page = 15%

# buyers= n_visits daily * % current page 
# GMV=buyers*tiket (4500)

In [61]:
df4=df3.copy()

In [62]:
df4['timestamp']=pd.to_datetime(df4['timestamp']).apply(lambda x: x.strftime('%Y-%m-%d'))
df5=df4[['user_id','timestamp']].groupby('timestamp').count().reset_index()

In [63]:
# Current GMV
df5['current_purchase']=np.ceil(df5['user_id']*0.13).astype(int)
df5['current_GMV']=df5['current_purchase']*4500
current_gmv=df5['current_GMV'].sum()


In [64]:
# New GMV
df5['new_purchase']=np.ceil(df5['user_id']*0.15).astype(int)
df5['new_GMV']=df5['new_purchase']*4500
new_GMV=df5['new_GMV'].sum()


In [65]:
lift_abs=new_GMV-current_gmv
lift=100*(new_GMV-current_gmv)/current_gmv

In [66]:
print('GMV on period:{}'.format(current_gmv))
print('New GMV on period:{}'.format(new_GMV))
print('Abs Lift: {}'.format(lift_abs))
print('Expected Lift:{:.2f}%'.format(lift))


GMV on period:167760000
New GMV on period:193563000
Abs Lift: 25803000
Expected Lift:15.38%
