In [1]:
import numpy as np
import pandas as pd


# z-test

### one-sample proportion z-test (right-tailed)

#### 1. Define the hypotheses

The late_shipments dataset contains supply chain data on the delivery of medical supplies. Each row represents one delivery of a part. The late columns denotes whether or not the part was delivered late. A value of `Yes` means that the part was delivered late, and a value of `No` means the part was delivered on time.

- Null hypothesis (H₀): The true proportion of late shipments is 6%.
- Alternative hypothesis (H₁): The true proportion of late shipments is greater than 6%.

#### 2. Collect sample data

In [2]:
df = pd.read_feather("data/late_shipments.feather")


In [3]:
df.describe()


Unnamed: 0,id,late_delivery,unit_of_measure_per_pack,line_item_quantity,line_item_value,pack_price,unit_price,weight_kilograms,freight_cost_usd,line_item_insurance_usd
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,986.0,966.0
mean,39239.957,0.061,78.675,15923.379,154227.2,36.57083,1.09405,1947.682,10896.676856,245.143209
std,24878.296938,0.23945,91.124569,37793.936877,216051.7,47.611183,3.356263,3125.348148,14267.588869,357.305275
min,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,30.0,0.0
25%,17055.0,0.0,30.0,600.0,11811.6,6.4975,0.11,162.75,1946.4975,16.845
50%,38283.5,0.0,60.0,2992.0,63600.0,21.0,0.38,909.0,5806.75,100.7
75%,62036.5,0.0,100.0,11733.0,217826.2,70.0,0.89,2526.0,14821.7925,330.0
max,82105.0,1.0,1000.0,515000.0,2458454.0,400.0,24.5,38681.0,161962.32,3446.75


In [4]:
df.head()


Unnamed: 0,id,country,managed_by,fulfill_via,vendor_inco_term,shipment_mode,late_delivery,late,product_group,sub_classification,...,line_item_quantity,line_item_value,pack_price,unit_price,manufacturing_site,first_line_designation,weight_kilograms,freight_cost_usd,freight_cost_groups,line_item_insurance_usd
0,36203.0,Nigeria,PMO - US,Direct Drop,EXW,Air,1.0,Yes,HRDT,HIV test,...,2996.0,266644.0,89.0,0.89,"Alere Medical Co., Ltd.",Yes,1426.0,33279.83,expensive,373.83
1,30998.0,Botswana,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test,...,25.0,800.0,32.0,1.6,"Trinity Biotech, Plc",Yes,10.0,559.89,reasonable,1.72
2,69871.0,Vietnam,PMO - US,Direct Drop,EXW,Air,0.0,No,ARV,Adult,...,22925.0,110040.0,4.8,0.08,Hetero Unit III Hyderabad IN,Yes,3723.0,19056.13,expensive,181.57
3,17648.0,South Africa,PMO - US,Direct Drop,DDP,Ocean,0.0,No,ARV,Adult,...,152535.0,361507.95,2.37,0.04,"Aurobindo Unit III, India",Yes,7698.0,11372.23,expensive,779.41
4,5647.0,Uganda,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test - Ancillary,...,850.0,8.5,0.01,0.0,Inverness Japan,Yes,56.0,360.0,reasonable,0.01


#### 3. Compute standard error under H₀

$$
\hat{p} = \frac{\text{successes}}{n}
$$

In [5]:
# compute the sample proportion of late shipments
(df["late"]=="Yes").mean()


0.061

$$
SE = \sqrt{\frac{\hat{p} (1 - \hat{p})}{n}}
$$

In [6]:
# compute the standard error of the sample proportion; it measures the variability of the sample proportion
p_hat = (df["late"]=="Yes").mean()
n = len(df)
se = np.sqrt(p_hat * (1 - p_hat) / n) # standard error formula for a proportion
print(se)


0.0075682891065286355


#### 4. Compute z-score

$$
z = \frac{\hat{p} - p_0}{SE}
$$

In [7]:
# compute z-score; it measures how many standard errors the sample proportion is away from the hypothesized population proportion
p_0 = 0.06 # hypothesized population proportion
z = (p_hat - p_0) / se
print(z)


0.132130259022131


#### 5. Compute p-value

$$
\text{p-value} = P(\text{test statistic at least as extreme as observed} \mid H_0)
$$


$$
\text{p-value} = P(Z > z_{\text{obs}})
$$


In [8]:
# compute p-value; it measures the probability of observing a sample proportion as extreme as the one observed, assuming the null hypothesis is true
from scipy import stats
p_value = 1 - stats.norm.cdf(z) # right-tailed test
print(p_value)


0.44744063124605615


#### 6. Make hypothesis decision

In [9]:
# conclusion
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis: there is evidence that the proportion of late shipments is greater than 6%.")
else:
    print("Fail to reject the null hypothesis: there is not enough evidence that the proportion of late shipments is greater than 6%.")


Fail to reject the null hypothesis: there is not enough evidence that the proportion of late shipments is greater than 6%.


#### 7. Compute confidence interval

$$
\hat{p} \pm z \cdot SE
$$

In [10]:
# compute 95% confidence interval for the proportion of late shipments
# z_critical = stats.norm.ppf(0.975) # two-tailed 95% confidence interval; split alpha/2 on each tail
# z_critical = stats.norm.ppf(0.05) # left-tailed 95% confidence interval; alpha = 0.05 on the left tail
z_critical = stats.norm.ppf(0.95) # right-tailed 95% confidence interval; alpha = 0.05 on the right tail

margin_of_error = z_critical * se
ci_lower = p_hat - margin_of_error
ci_upper = p_hat + margin_of_error
print(f"95% Confidence Interval: ({ci_lower:.4f}, {ci_upper:.4f})")

# When we compute a metric from data (like a conversion rate), the result varies due to random sampling.
# Instead of reporting a single number, a confidence interval gives a range: an interval within which we expect the true population parameter to lie, with a certain level of confidence (e.g., 95%).


95% Confidence Interval: (0.0486, 0.0734)


### two-sample proportion z-test

You may wonder if the amount paid for freight affects whether or not the shipment was late. Recall that in the `late_shipments` dataset, whether or not the shipment was late is stored in the `late` column. Freight costs are stored in the `freight_cost_group` column, and the categories are `expensive` and `reasonable`.

#### 1. Define hypotheses

$$
H_0: p_{\text{expensive}} - p_{\text{reasonable}} = 0
$$

$$
H_A: p_{\text{expensive}} - p_{\text{reasonable}} > 0
$$


#### 2. Split data into groups

In [11]:
expensive = df[df["freight_cost_groups"] == "expensive"]
reasonable = df[df["freight_cost_groups"] == "reasonable"]


In [12]:
df["freight_cost_groups"].value_counts()


freight_cost_groups
expensive     531
reasonable    455
Name: count, dtype: int64

#### 3. Compute sample proportions

$$
\hat{p}_1 = \frac{x_1}{n_1}, \quad
\hat{p}_2 = \frac{x_2}{n_2}
$$


In [13]:
p1_hat = (expensive["late"]=="Yes").mean()
p2_hat = (reasonable["late"]=="Yes").mean()
diff = p1_hat - p2_hat
print(f"Difference in sample proportions (expensive - reasonable): {diff:.4f}")


Difference in sample proportions (expensive - reasonable): 0.0439


#### 4. Compute pooled proportion

Under H₀ we assume both groups share one rate.

$$
\hat{p} =
\frac{x_1 + x_2}{n_1 + n_2}
$$


In [14]:
pooled_hat = ( (expensive["late"]=="Yes").sum() + (reasonable["late"]=="Yes").sum() ) / (len(expensive) + len(reasonable))


#### 5. Compute standard error

$$
SE =
\sqrt{
\hat{p}(1-\hat{p})
\left(
\frac{1}{n_1} + \frac{1}{n_2}
\right)
}
$$


In [15]:
se_diff = np.sqrt( pooled_hat * (1 - pooled_hat) * (1/len(expensive) + 1/len(reasonable)) )


#### 6. Compute z-score

$$
z =
\frac{\hat{p}_1 - \hat{p}_2}{SE}
$$


In [16]:
z_diff = diff / se_diff
print(f"Z-score for difference in proportions: {z_diff:.4f}")


Z-score for difference in proportions: 2.9226


In [17]:
df.head()


Unnamed: 0,id,country,managed_by,fulfill_via,vendor_inco_term,shipment_mode,late_delivery,late,product_group,sub_classification,...,line_item_quantity,line_item_value,pack_price,unit_price,manufacturing_site,first_line_designation,weight_kilograms,freight_cost_usd,freight_cost_groups,line_item_insurance_usd
0,36203.0,Nigeria,PMO - US,Direct Drop,EXW,Air,1.0,Yes,HRDT,HIV test,...,2996.0,266644.0,89.0,0.89,"Alere Medical Co., Ltd.",Yes,1426.0,33279.83,expensive,373.83
1,30998.0,Botswana,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test,...,25.0,800.0,32.0,1.6,"Trinity Biotech, Plc",Yes,10.0,559.89,reasonable,1.72
2,69871.0,Vietnam,PMO - US,Direct Drop,EXW,Air,0.0,No,ARV,Adult,...,22925.0,110040.0,4.8,0.08,Hetero Unit III Hyderabad IN,Yes,3723.0,19056.13,expensive,181.57
3,17648.0,South Africa,PMO - US,Direct Drop,DDP,Ocean,0.0,No,ARV,Adult,...,152535.0,361507.95,2.37,0.04,"Aurobindo Unit III, India",Yes,7698.0,11372.23,expensive,779.41
4,5647.0,Uganda,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test - Ancillary,...,850.0,8.5,0.01,0.0,Inverness Japan,Yes,56.0,360.0,reasonable,0.01


In [18]:
df.product_group.value_counts()



product_group
ARV     583
HRDT    409
ANTM      5
ACT       3
Name: count, dtype: int64

$$
MDE =
(z_{1-\alpha/2} + z_{power})
\sqrt{\frac{2p(1-p)}{n}}
$$
