# Hypothesis Testing

In [28]:
import numpy as np

# Fix the seed of NumPy's global (legacy) random generator so every
# random draw made through np.random.* below is repeatable from a fresh kernel.
SEED = 42
np.random.seed(SEED)

In [29]:
# Simulate the daily sales history of two different stores.
# Each store gets one normally distributed draw per day; store B is given a
# slightly lower true mean than store A, with the same spread.

days = 365

mean_A, std_A = 20, 5
mean_B, std_B = 19.5, 5

# Draw store A first, then store B, so the global random stream is consumed
# in the same order on every run.
shop_A_sales = np.random.normal(loc=mean_A, scale=std_A, size=days)
shop_B_sales = np.random.normal(loc=mean_B, scale=std_B, size=days)

In [30]:
# Significance level: the probability of rejecting the null hypothesis
# when it is actually true (a false positive).
alpha = 0.05

# Sample mean of each store and the test statistic (store A minus store B).
shop_A_mean = np.mean(shop_A_sales)
shop_B_mean = np.mean(shop_B_sales)
observed_means_difference = shop_A_mean - shop_B_mean

# One value per line: mean of A, mean of B, then their difference.
print(shop_A_mean, shop_B_mean, observed_means_difference, sep="\n")

20.04973201106029
19.309929401404304
0.7398026096559853


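If the null hypothesis is true (both stores have the same average sales), it should not matter which store each day's sales figure is labelled with. We can test this by shuffling the labels:

Combine the sales data from both stores into 1 vector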

Mix up the data into a different, random order

Assign the first half of the new vector as the sales of 1 store
and assign the second half of the new vector as the sales of the 2nd store

Calculate the mean of each half and take the difference between the two means

Compare this result to the previous result

In [31]:
# Pool every daily sales figure from both stores into one vector.
both_sales = np.concatenate([shop_A_sales, shop_B_sales])

# Shuffle the pooled data (np.random.permutation returns a shuffled copy,
# so both_sales itself keeps its original order).
sales_permutation_both = np.random.permutation(both_sales)

# Deal the shuffled values back out: the first len(shop_A_sales) values play
# the role of store A, the remainder play the role of store B.
sales_permutation_A, sales_permutation_B = (
    sales_permutation_both[:len(shop_A_sales)],
    sales_permutation_both[len(shop_A_sales):],
)

# Difference in means for this one relabelling of the data.
observed_means_difference_permutation = np.mean(sales_permutation_A) - np.mean(sales_permutation_B)

In [32]:
# Show the difference from the real data first, then the difference from the
# single shuffled relabelling, one per line.
for means_difference in (observed_means_difference, observed_means_difference_permutation):
    print(means_difference)

0.7398026096559853
0.21098789154327235


There is a difference between the 1st and 2nd results, but we only did 1 trial so that could just be luck.

Let's do 1000 trials and store the result of each trial in a list

In [33]:
# Repeat the shuffle-and-split experiment 1000 times and record the
# difference in means from every trial.
permutation_replication_means = []

for i in range(1000):
    # Fresh shuffle of the pooled sales for this trial.
    sales_permutation_replication = np.random.permutation(both_sales)

    # Split at the size of store A's real sample, as in the single trial.
    sales_permutation_A, sales_permutation_B = (
        sales_permutation_replication[:len(shop_A_sales)],
        sales_permutation_replication[len(shop_A_sales):],
    )

    # Difference in means under this relabelling.
    observed_means_difference_replication = np.mean(sales_permutation_A) - np.mean(sales_permutation_B)
    permutation_replication_means.append(observed_means_difference_replication)

Compute P-Value

The P-value is the chance that you would observe a result as extreme or more extreme than the result you got given that the null hypothesis is true.

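i.e. A small P-value means a difference this large would rarely come from "dumb luck" alone if the two stores really had the same average sales. (It is not the probability that the null hypothesis is true.)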

In [51]:
# This calculates the two-sided P-value. Explained in the box below.
# Both sides of the comparison use absolute values: the simulated differences
# AND the observed difference. Without the abs() on the observed difference,
# a negative observed value (e.g. if store B outsold store A) would make every
# comparison True and the P-value would always come out as 1.0.
p_value = np.sum(
    np.abs(permutation_replication_means) >= np.abs(observed_means_difference)
) / len(permutation_replication_means)

# print the result
print('p-value =', p_value)

p-value = 0.043


In [50]:
# This takes the absolute value of all the mean differences in our simulation
# and counts the ones that are at least as extreme as the one from our actual data.
# The observed difference is also wrapped in np.abs so the comparison stays
# two-sided even when the observed difference is negative.
# The >= converts the list to True or False based on whether each value is that extreme or not
# The np.sum counts 1 for every 'True'
# Then we divide the number of extreme results by the total number of results
# (len(...) rather than a hard-coded 1000, so it stays right if the trial count changes)

np.sum(
    np.abs(permutation_replication_means) >= np.abs(observed_means_difference)
) / len(permutation_replication_means)

0.043