# Chapter 12 Analysis of Single Factor Experiments

In [1]:
import json

import polars as pl
from polars import col, lit
from scipy import stats
import numpy as np
import altair as alt

# random number generator
RNG = np.random.default_rng()

## Section 12.1

### 1.

In [2]:
with open('Ex12-1.json') as file:
    data = pl.from_dict(json.load(file))
print(data)

shape: (3, 6)
┌──────┬──────────┬─────────┬──────────┬─────────┬──────────┐
│ n    ┆ SugarAvg ┆ SugarSD ┆ FiberAvg ┆ FiberSD ┆ Location │
│ ---  ┆ ---      ┆ ---     ┆ ---      ┆ ---     ┆ ---      │
│ f64  ┆ f64      ┆ f64     ┆ f64      ┆ f64     ┆ str      │
╞══════╪══════════╪═════════╪══════════╪═════════╪══════════╡
│ 20.0 ┆ 4.8      ┆ 2.138   ┆ 1.68     ┆ 1.166   ┆ Shelf1   │
│ 20.0 ┆ 9.85     ┆ 1.985   ┆ 0.95     ┆ 1.162   ┆ Shelf2   │
│ 20.0 ┆ 6.1      ┆ 1.865   ┆ 2.17     ┆ 1.277   ┆ Shelf3   │
└──────┴──────────┴─────────┴──────────┴─────────┴──────────┘


Transform to a long format that is a bit easier to work with.

In [3]:
data = (
    data
    .select(
        factor='Location',
        n=col('n').cast(int),
        container=pl.concat_list(
            pl.struct(dataset=lit('sugar'), μ='SugarAvg', s='SugarSD'),
            pl.struct(dataset=lit('fiber'), μ='FiberAvg', s='FiberSD')))
    .explode('container')
    .unnest('container')
    .sort('dataset', 'factor'))
print(data)

shape: (6, 5)
┌────────┬─────┬─────────┬──────┬───────┐
│ factor ┆ n   ┆ dataset ┆ μ    ┆ s     │
│ ---    ┆ --- ┆ ---     ┆ ---  ┆ ---   │
│ str    ┆ i64 ┆ str     ┆ f64  ┆ f64   │
╞════════╪═════╪═════════╪══════╪═══════╡
│ Shelf1 ┆ 20  ┆ fiber   ┆ 1.68 ┆ 1.166 │
│ Shelf2 ┆ 20  ┆ fiber   ┆ 0.95 ┆ 1.162 │
│ Shelf3 ┆ 20  ┆ fiber   ┆ 2.17 ┆ 1.277 │
│ Shelf1 ┆ 20  ┆ sugar   ┆ 4.8  ┆ 2.138 │
│ Shelf2 ┆ 20  ┆ sugar   ┆ 9.85 ┆ 1.985 │
│ Shelf3 ┆ 20  ┆ sugar   ┆ 6.1  ┆ 1.865 │
└────────┴─────┴─────────┴──────┴───────┘


#### (a)

For (a), remember to use the _pooled_ $\hat s$.
$$
\text{CI} = \hat\mu_\text{group} \pm t_{N-a,\ \alpha/2} \frac{\hat{s}_\text{pool}}{\sqrt{n_\text{group}}}
$$

In [4]:
def CI(data: pl.DataFrame, α: float=0.05) -> pl.DataFrame:
    """ 
    returns the studentized CIs for each group in the data using a pooled s.

    data columns:
    - dataset: string id of dataset
    - factor: distinct for different treatment groups
    - n: group size
    - μ: group mean
    - s: group standard deviation
    """
    dof = col('n').sum() - pl.len()
    t_crit = dof.map_elements(lambda v: stats.t.ppf(1-α/2, v))
    s_pool = ((col('s')**2 * (col('n') - 1)).sum() / dof).sqrt()
    
    e = t_crit * s_pool / col('n').sqrt()
    return (
        data
        .group_by('dataset')
        .agg(
            'factor',
            CI=pl.concat_list(col('μ') - e, col('μ') + e))
        .explode('factor', 'CI'))
            
print(ans := CI(data))

shape: (6, 3)
┌─────────┬────────┬───────────────────────┐
│ dataset ┆ factor ┆ CI                    │
│ ---     ┆ ---    ┆ ---                   │
│ str     ┆ str    ┆ list[f64]             │
╞═════════╪════════╪═══════════════════════╡
│ sugar   ┆ Shelf1 ┆ [3.904862, 5.695138]  │
│ sugar   ┆ Shelf2 ┆ [8.954862, 10.745138] │
│ sugar   ┆ Shelf3 ┆ [5.204862, 6.995138]  │
│ fiber   ┆ Shelf1 ┆ [1.141407, 2.218593]  │
│ fiber   ┆ Shelf2 ┆ [0.411407, 1.488593]  │
│ fiber   ┆ Shelf3 ┆ [1.631407, 2.708593]  │
└─────────┴────────┴───────────────────────┘


Let's visualize the CIs.

In [5]:
(
    alt.Chart(
        ans.select('dataset', 'factor',
            low=col('CI').list.first(),
            high=col('CI').list.last()))
    .mark_line(strokeWidth=2)
    .encode(
        alt.X('low').title('CI'),
        alt.X2('high'),
        alt.Y('factor').title(None),
        alt.Row('dataset').title(None))
    .resolve_scale(x='independent')
)

- Sugar: The CIs of Shelf1 and Shelf3 overlap, so no obvious differences. But Shelf2 stands out from the rest.
- Fiber: By the same token, Shelf2 and Shelf3 are different becuuse their CIs do not overlap.

#### (b)

In [14]:
def ANOVA(data: pl.DataFrame) -> pl.DataFrame:
    """ 
    returns the 1-way ANOVA table as a DataFrame.

    assuming the correct conditions of data:
    - normality assumption
    - constant variance

    data columns:
    - dataset: string id of dataset
    - n: group size
    - μ: group mean
    - s: group standard deviation
    """
    def item(expr: pl.Expr) -> float:
        return data.select(expr).item()

    μ_total = (col('μ') * col('n')).sum() / col('n').sum()
    
    # factor A sum of squares
    ssa = ((col('μ') - μ_total)**2 * col('n')).sum()
    dof_ssa = pl.len() - 1
    
    # error sum of squares
    sse = (col('s')**2 * (col('n') - 1)).sum()
    dof_sse = col('n').sum() - pl.len()
    
    msa = ssa / dof_ssa # factor A mean square
    mse = sse / dof_sse # error mean square

    return (
        data
        .group_by('dataset')
        .agg(ssa=ssa, dof_ssa=dof_ssa, sse=sse, dof_sse=dof_sse, msa=msa, mse=mse)
        .select(
            'dataset', 
            source=pl.concat_list(lit('factor'), lit('error'), lit('total')),
            SS=pl.concat_list('ssa', 'sse', col('ssa') + col('sse')),
            dof=pl.concat_list('dof_ssa', 'dof_sse', col('dof_ssa') + col('dof_sse')),
            MS=pl.concat_list('msa', 'mse', None),
            F=pl.concat_list(col('msa') / col('mse'), None, None))
        .explode('source', 'SS', 'dof', 'MS', 'F'))


print(ans := ANOVA(data))

shape: (6, 6)
┌─────────┬────────┬────────────┬─────┬────────────┬───────────┐
│ dataset ┆ source ┆ SS         ┆ dof ┆ MS         ┆ F         │
│ ---     ┆ ---    ┆ ---        ┆ --- ┆ ---        ┆ ---       │
│ str     ┆ str    ┆ f64        ┆ i64 ┆ f64        ┆ f64       │
╞═════════╪════════╪════════════╪═════╪════════════╪═══════════╡
│ sugar   ┆ factor ┆ 275.033333 ┆ 2   ┆ 137.516667 ┆ 34.409292 │
│ sugar   ┆ error  ┆ 227.800386 ┆ 57  ┆ 3.996498   ┆ null      │
│ sugar   ┆ total  ┆ 502.833719 ┆ 59  ┆ null       ┆ null      │
│ fiber   ┆ factor ┆ 15.076     ┆ 2   ┆ 7.538      ┆ 5.209964  │
│ fiber   ┆ error  ┆ 82.470051  ┆ 57  ┆ 1.446843   ┆ null      │
│ fiber   ┆ total  ┆ 97.546051  ┆ 59  ┆ null       ┆ null      │
└─────────┴────────┴────────────┴─────┴────────────┴───────────┘


On the other hand the 95% critical value $f(2, 57)$ is

In [15]:
stats.f(2, 57).ppf(0.95)

3.1588427192606465

Therefore both the sugar and fibert content F ratios (34.4 and 5.2 respectively) are greater than the critical value, meaning there are significant differences among the shelves.

#### (c)

Shelf 2 contains cereals that are high in suger and low in fiber (refer to the line chart in (b)), in other words, "taste good". The grocery store's strategy is to place those where grade schoolers can easily see.

### 2.

In [16]:
with open('Ex12-2.json') as file:
    data = pl.from_dict(json.load(file))
print(data)

shape: (30, 2)
┌───────┬─────────┐
│ mg    ┆ taprate │
│ ---   ┆ ---     │
│ f64   ┆ f64     │
╞═══════╪═════════╡
│ 0.0   ┆ 242.0   │
│ 0.0   ┆ 245.0   │
│ 0.0   ┆ 244.0   │
│ 0.0   ┆ 248.0   │
│ 0.0   ┆ 247.0   │
│ …     ┆ …       │
│ 200.0 ┆ 250.0   │
│ 200.0 ┆ 246.0   │
│ 200.0 ┆ 248.0   │
│ 200.0 ┆ 245.0   │
│ 200.0 ┆ 250.0   │
└───────┴─────────┘


In [19]:
data = (
    data.cast({'mg': int, 'taprate': int})
    .select(
        dose=pl.format('{} mg', col('mg')),
        taprate='taprate'))
print(data)

shape: (30, 2)
┌────────┬─────────┐
│ dose   ┆ taprate │
│ ---    ┆ ---     │
│ str    ┆ i64     │
╞════════╪═════════╡
│ 0 mg   ┆ 242     │
│ 0 mg   ┆ 245     │
│ 0 mg   ┆ 244     │
│ 0 mg   ┆ 248     │
│ 0 mg   ┆ 247     │
│ …      ┆ …       │
│ 200 mg ┆ 250     │
│ 200 mg ┆ 246     │
│ 200 mg ┆ 248     │
│ 200 mg ┆ 245     │
│ 200 mg ┆ 250     │
└────────┴─────────┘


#### (a)

In [29]:
(
    alt.Chart(data)
    .mark_boxplot()
    .encode(
        alt.Y('dose'),
        alt.X('taprate').scale(zero=False))
)

The chart seems to indicate different effects among the doses.

#### (b)

In [35]:
ans = ANOVA(
    data
    .group_by('dose')
    .agg(
        n=pl.len(),
        μ=col('taprate').mean(),
        s=col('taprate').std())
    .with_columns(dataset=pl.lit('caffeine'))
)
print(ans)

shape: (3, 6)
┌──────────┬────────┬───────┬─────┬──────────┬──────────┐
│ dataset  ┆ source ┆ SS    ┆ dof ┆ MS       ┆ F        │
│ ---      ┆ ---    ┆ ---   ┆ --- ┆ ---      ┆ ---      │
│ str      ┆ str    ┆ f64   ┆ u32 ┆ f64      ┆ f64      │
╞══════════╪════════╪═══════╪═════╪══════════╪══════════╡
│ caffeine ┆ factor ┆ 61.4  ┆ 2   ┆ 30.7     ┆ 6.181208 │
│ caffeine ┆ error  ┆ 134.1 ┆ 27  ┆ 4.966667 ┆ null     │
│ caffeine ┆ total  ┆ 195.5 ┆ 29  ┆ null     ┆ null     │
└──────────┴────────┴───────┴─────┴──────────┴──────────┘


On the other hand:

In [36]:
stats.f(2, 27).ppf(0.9)

2.5106086665585408

2.51 < 6.18, so yes, there are significant differences.

#### (c)

In [73]:
chart_base = (
    alt.Chart(
        data
        .with_columns(
            residual=col('taprate') - col('taprate').mean().over('dose'))
        .with_columns(
            normal_score=(col('residual').rank() / (pl.len() + 1)).map_batches(
                lambda x: stats.norm.ppf(x))))
    .mark_circle())

(
    chart_base.encode(
        x='residual', 
        y='normal_score')
    | chart_base.encode(
        x='dose',
        y='residual'))

From the charts: the residuals are fairly normally distributed, and the variance is consistant across different doses.

### 3.

In [74]:
with open('Ex12-3.json') as file:
    data = pl.from_dict(json.load(file))
print(data)

shape: (75, 2)
┌────────┬─────────┐
│ AvgEgg ┆ Group   │
│ ---    ┆ ---     │
│ f64    ┆ str     │
╞════════╪═════════╡
│ 35.4   ┆ Control │
│ 27.4   ┆ Control │
│ 19.3   ┆ Control │
│ 41.8   ┆ Control │
│ 20.3   ┆ Control │
│ …      ┆ …       │
│ 15.1   ┆ Suscept │
│ 31.0   ┆ Suscept │
│ 16.9   ┆ Suscept │
│ 16.1   ┆ Suscept │
│ 10.8   ┆ Suscept │
└────────┴─────────┘


#### (a)

## Section 12.2