In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, chi2 #Chi-square and p-value
from IPython.display import display, HTML
#Inject a CSS rule so the notebook's .container element uses 100% of the available width
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
#Load the "campaign_data" sheet of the grocery workbook into a DataFrame
campaign_data = pd.read_excel(
    io="grocery_database.xlsx",
    sheet_name="campaign_data",
)

In [3]:
#Preview the campaign data exactly as it was loaded from the workbook
campaign_data

Unnamed: 0,customer_id,campaign_name,campaign_date,mailer_type,signup_flag
0,74,delivery_club,2020-07-01,Mailer1,1
1,524,delivery_club,2020-07-01,Mailer1,1
2,607,delivery_club,2020-07-01,Mailer2,1
3,343,delivery_club,2020-07-01,Mailer1,0
4,322,delivery_club,2020-07-01,Mailer2,1
...,...,...,...,...,...
865,372,delivery_club,2020-07-01,Mailer2,1
866,104,delivery_club,2020-07-01,Mailer1,1
867,393,delivery_club,2020-07-01,Mailer2,1
868,373,delivery_club,2020-07-01,Control,0


In [4]:
#Filter data
#Keep every row whose mailer_type is anything other than "Control"
campaign_data = campaign_data[campaign_data["mailer_type"].ne("Control")]

In [5]:
#The rows with mailer_type "Control" have now been dropped - only the mailer rows should remain
campaign_data

Unnamed: 0,customer_id,campaign_name,campaign_date,mailer_type,signup_flag
0,74,delivery_club,2020-07-01,Mailer1,1
1,524,delivery_club,2020-07-01,Mailer1,1
2,607,delivery_club,2020-07-01,Mailer2,1
3,343,delivery_club,2020-07-01,Mailer1,0
4,322,delivery_club,2020-07-01,Mailer2,1
...,...,...,...,...,...
863,765,delivery_club,2020-07-01,Mailer2,1
864,466,delivery_club,2020-07-01,Mailer1,1
865,372,delivery_club,2020-07-01,Mailer2,1
866,104,delivery_club,2020-07-01,Mailer1,1


In [6]:
#Cross tabs
#Count rows for every combination of the two columns of interest:
#mailer_type becomes the rows, signup_flag becomes the columns, and we keep only the array of counts
observed_values = pd.crosstab(
    index=campaign_data["mailer_type"],
    columns=campaign_data["signup_flag"],
).values

In [7]:
observed_values
#We have a 2x2 array, which is what we want: one row per mailer type and one column per signup_flag value

array([[252, 123],
       [209, 127]], dtype=int64)

In [8]:
#Sign-up rate for each mailer = sign-ups / total customers who received that mailer.
#The rates are derived from observed_values instead of hand-copied counts, so they
#stay correct if the underlying data or the filtering changes.
#Layout of observed_values (from the crosstab): one row per mailer type (Mailer1, then Mailer2),
#column 0 = signup_flag 0 (no sign-up), column 1 = signup_flag 1 (signed up).
mailer1_signup_rate = observed_values[0, 1] / observed_values[0].sum()
mailer2_signup_rate = observed_values[1, 1] / observed_values[1].sum()

print(mailer1_signup_rate, mailer2_signup_rate)


0.328 0.37797619047619047


In [10]:
#Mailer 2 appears to convert better than mailer 1 - roughly 38% vs. 33%
#Set the acceptance criteria (significance threshold) and state both hypotheses before running the test

acceptance_criteria = 0.05

null_hypothesis = (
    "There is no relationship between mailer type and sign up rate. "
    "They are independent."
)

alternate_hypothesis = (
    "There is a relationship between mailer type and signup rate. "
    "They are not independent."
)

In [11]:
#Run the chi-square test of independence: returns the statistic, its p-value,
#the degrees of freedom and the expected frequencies under the null hypothesis.
#correction=False turns off Yates' continuity correction, which scipy would
#otherwise apply by default when the degrees of freedom equal 1 (as in a 2x2 table).
chi2_statistic, p_value, dof, expected_values = chi2_contingency(
    observed_values,
    correction=False,
)
print(chi2_statistic, p_value)

1.9414468614812481 0.16351152223398197


In [12]:
#Chi-square is 1.94, p-value is 0.16 - the p-value is above 0.05, so we fail to reject the null hypothesis. No significant difference here.

In [13]:
#Critical value for our test
#chi2.ppf is the percent point function (inverse of the CDF): it returns the point on the
#chi-square distribution with dof degrees of freedom below which a fraction of
#(1 - acceptance_criteria) of the distribution lies
critical_value = chi2.ppf(1 - acceptance_criteria, df=dof)
print(critical_value)

3.841458820694124


In [14]:
#We need a chi-square statistic of at least 3.84 for the result to be significant at our acceptance criteria of 0.05.

In [17]:
#Report the outcome of comparing the chi-square statistic with the critical value
#Note: a statistic below the critical value means there is not enough evidence to reject the null hypothesis
print(
    (
        f"As our chi-square statistic of {chi2_statistic} is higher than our critical value of {critical_value}"
        f" - we reject the null hypothesis, and conclude that: {alternate_hypothesis}"
    )
    if chi2_statistic >= critical_value
    else (
        f"As our chi-square statistic of {chi2_statistic} is lower than our critical value of {critical_value}"
        f" - we accept the null hypothesis, and conclude that: {null_hypothesis}"
    )
)


As our chi-square statistic of 1.9414468614812481 is lower than our critical value of 3.841458820694124 - we accept the null hypothesis, and conclude that: There is no relationship between mailer type and sign up rate. They are independent.


In [19]:
#Print the results
#Same decision as the critical-value check, expressed through the p-value instead
#Note: a p-value above the acceptance criteria means there is not enough evidence to reject the null hypothesis
print(
    (
        f"As our p-value of {p_value} is lower than our acceptance criteria of {acceptance_criteria}"
        f" - we reject the null hypothesis, and conclude that: {alternate_hypothesis}"
    )
    if p_value <= acceptance_criteria
    else (
        f"As our p-value of {p_value} is higher than our acceptance criteria {acceptance_criteria}"
        f" - we accept the null hypothesis, and conclude that: {null_hypothesis}"
    )
)


As our p-value of 0.16351152223398197 is higher than our acceptance criteria 0.05 - we accept the null hypothesis, and conclude that: There is no relationship between mailer type and sign up rate. They are independent.


In [20]:
#Since the difference in sign-up rate is not statistically significant, ABC Grocery could either run further tests or favour the cheaper mailer (mailer 1) over the more expensive mailer (mailer 2).
#We should be careful about drawing any hard conclusions here.