### Imports

In [1]:
import numpy as np
from scipy import stats
import pandas as pd
from scipy import misc
import matplotlib.pyplot as plt

### Read Data

In [2]:
data_path = '../advertisement_clicks.csv'
df = pd.read_csv(data_path)

In [3]:
df.head()

Unnamed: 0,advertisement_id,action
0,B,1
1,B,1
2,A,0
3,B,0
4,A,1


### Explore Data

In [4]:
# Size of sample
len(df)

2000

In [5]:
# Size of each dataset
df['advertisement_id'].value_counts()

B    1000
A    1000
Name: advertisement_id, dtype: int64

In [6]:
# Size of each dataset
df['action'].value_counts()

0    1324
1     676
Name: action, dtype: int64

In [7]:
a = list(df[df['advertisement_id'] == 'A']['action'])

In [8]:
b = list(df[df['advertisement_id'] == 'B']['action'])

In [9]:
N = len(a)

In [10]:
N == len(b)

True

### SciPy Methods

The main public methods for continuous RVs are:

- rvs: Random Variates

- pdf: Probability Density Function

- cdf: Cumulative Distribution Function

- sf: Survival Function (1-CDF)

- ppf: Percent Point Function (Inverse of CDF)

- isf: Inverse Survival Function (Inverse of SF)

- stats: Return mean, variance, (Fisher’s) skew, or (Fisher’s) kurtosis

- moment: non-central moments of the distribution

### Manual Calcs

In [11]:
manual_mean_a = sum(a)/N

In [12]:
manual_mean_b = sum(b)/N

In [13]:
manual_var_a = 1/(N-1)*sum([(i - manual_mean_a)**2 for i in a])

In [14]:
manual_var_b = 1/(N-1)*sum([(i - manual_mean_b)**2 for i in b])

In [15]:
manual_var_a

0.21179579579579275

In [16]:
manual_var_b

0.23384984984985163

In [17]:
# Pooled stdev
manual_s = np.sqrt((manual_var_a + manual_var_b) / 2)

In [18]:
# T stat
manual_t = (manual_mean_a - manual_mean_b) / (manual_s * np.sqrt(2.0/N))

In [19]:
# Degrees of freedom
df = 2*N - 2

In [20]:
# P Value
manual_p = 1 - stats.t.cdf(manual_t, df=df)

In [21]:
print("t: {}, p: {}".format(manual_t, 2*manual_p))

t: -3.221173213801983, p: 1.9987028094532875


### Method 2: np Manual Calcs

In [25]:
var_a = np.var(a, ddof=1)
var_b = np.var(b, ddof=1)

In [29]:
# Variance a
var_a

0.2117957957957958

In [33]:
# Variance b
var_b

0.23384984984984983

In [34]:
# Pooled st dev
s = np.sqrt((var_a + var_b)/2)

In [36]:
# T stat
t = (np.mean(a) - np.mean(b)) / (s*np.sqrt(2.0/N))

In [37]:
t

-3.221173213801978

In [38]:
p = 1 - stats.t.cdf(t, df=df)

In [41]:
p

0.9993514047266437

### Builtin Calcs

In [42]:
t2, p2 = stats.ttest_ind(a, b)

In [43]:
print("t2: {}, p2: {}".format(t2, p2))

t2: -3.2211732138019786, p2: 0.0012971905467125246


### LP Solution

In [133]:
lp_df = pd.read_csv(data_path)

In [134]:
a = lp_df[lp_df['advertisement_id'] == 'A']
b = lp_df[lp_df['advertisement_id'] == 'B']

In [135]:
a = a['action']
b = b['action']

In [136]:
lp_t, lp_p = stats.ttest_ind(a, b)

In [137]:
print("lp_t: {}, lp_p: {}".format(lp_t, lp_p))

lp_t: -3.2211732138019786, lp_p: 0.0012971905467125246


In [138]:
# Welchs t test
lp_w_t, lp_w_p = stats.ttest_ind(a, b, equal_var=False)

In [139]:
print("lp_w_t: {}, lp_W_p: {}".format(lp_w_t, lp_w_p))

lp_w_t: -3.2211732138019786, lp_W_p: 0.0012972410374001632
