# Ch 3. Describing Data

### Imports

In [None]:
import DataFrames as Dfs
import VegaLite as Vl
import Plots as Plts
import Statistics as Stats
import PyPlot as Pyplt
import Seaborn as Sns

## Data, functions, exploratory analysis, etc.

### Tab 3.1

In [None]:
# Table 3.1: 25 paired observations of age and PImax.
tab31 = Dfs.DataFrame(
    age=[7, 7, 8, 8, 8, 9, 11, 12, 12, 13, 13, 14, 14, 15, 16, 17, 17, 17, 17, 19, 19, 20, 23, 23, 23],
    piMax=[80, 85, 110, 95, 95, 100, 45, 95, 130, 75, 80, 70, 80, 100, 120, 110, 125, 75, 100, 40, 75, 110, 150, 75, 95]
)
# Preview the first three rows.
Dfs.first(tab31, 3)

In [None]:
# Per-column summary of tab31.
tab31 |> Dfs.describe

In [None]:
# Fig. 3.15: scatter plot of PImax (y) against age (x) from tab31.
tab31 |> Vl.@vlplot(
    # Explicit `type=` form, consistent with every other plot in this notebook.
    mark={type="circle", size=200},
    title="Fig. 3.15. Scatter diagram of PImax by age",
    encoding={
        x={field="age", axis={title="Age [Years]"}},
        # Axis label spelled "PImax" to agree with the figure title.
        y={field="piMax", axis={title="PImax [cm H₂O]"}},
    },
    background="white",
    width=600,
    height=300,
)

### Tab 3.2

In [None]:
# Table 3.2: IgM concentration levels and the number of children observed at each level.
# The levels run 0.1, 0.2, ..., 1.8 followed by six irregularly spaced higher values.
tab32 = Dfs.DataFrame(
    igm=vcat(0.1:0.1:1.8, [2.0, 2.1, 2.2, 2.5, 2.7, 4.5]),
    numOfChildren=[3, 7, 19, 27, 32, 35, 38, 38, 22, 16, 16, 6, 7, 9, 6, 2, 3, 3, 3, 2, 1, 1, 1, 1]
)
# Preview the first three rows.
Dfs.first(tab32, 3)

In [None]:
# Per-column summary of tab32.
tab32 |> Dfs.describe

In [None]:
# Fig. 3.7 (a): bar chart of the number of children at each IgM level.
tab32 |> Vl.@vlplot(
    width=600,
    height=300,
    background="white",
    title=["Fig. 3.7. (a) Concentration of IgM in", "298 children aged 6 months to 6 years"],
    mark={type="bar"},
    encoding={
        x={field="igm", axis={title="IgM [g/l]"}},
        y={field="numOfChildren", axis={title="Number of children"}},
    },
)

In [None]:
# Fig. 3.7 (b): the same counts as in (a), drawn as a line instead of bars.
tab32 |> Vl.@vlplot(
    width=600,
    height=300,
    background="white",
    title=["Fig. 3.7. (b) Concentration of IgM in", "298 children aged 6 months to 6 years"],
    mark={type="line"},
    encoding={
        x={field="igm", axis={title="IgM [g/l]"}},
        y={field="numOfChildren", axis={title="Number of children"}},
    },
)

In [None]:
# Cumulative frequency as a percentage of the total:
# running sum of the counts, divided by its last element, times 100.
let runningTotal = cumsum(tab32.numOfChildren)
    tab32.cumFreq = runningTotal ./ runningTotal[end] .* 100
end;

In [None]:
# Fig. 3.10 (a): bar chart of the cumulative frequency (%) at each IgM level.
# Labelled "(a)" so it is distinguishable from the line version, which is "(b)";
# this mirrors the (a) bar / (b) line labelling used for Fig. 3.7.
tab32 |> Vl.@vlplot(
    mark={type="bar"},
    title=["Fig. 3.10. (a) Concentration of IgM in", "298 children aged 6 months to 6 years"],
    encoding={
        x={field="igm", axis={title="IgM [g/l]"}},
        y={field="cumFreq", axis={title="Cumulative frequency [%]"}},
    },
    background="white",
    width=600,
    height=300,
)

In [None]:
# Fig. 3.10 (b): cumulative frequency (%) against IgM level, drawn as a line.
tab32 |> Vl.@vlplot(
    width=600,
    height=300,
    background="white",
    title=["Fig. 3.10. (b) Concentration of IgM in", "298 children aged 6 months to 6 years"],
    mark={type="line"},
    encoding={
        x={field="igm", axis={title="IgM [g/l]"}},
        y={field="cumFreq", axis={title="Cumulative frequency [%]"}},
    },
)

### Tab 3.3

In [None]:
# Table 3.3: the age groups have different widths (e.g. 0-4, 25-59, > 60).
# `age` is used below as the upper end of each group's bar; the open-ended
# last group is assumed to end at 80. `freq` is the count for each group.
tab33 = Dfs.DataFrame(
    :age => [4, 9, 15, 16, 17, 19, 24, 59, 80],
    :freq => [28, 46, 58, 20, 31, 64, 149, 316, 103]
)

In [None]:
# Lower edge of each bin: 0 for the first bin, and the previous bin's
# upper limit plus 0.001 for every later bin.
tab33.binStart = vcat(0, tab33.age[1:end-1] .+ 0.001);

In [None]:
# Fig. 3.5: bars span binStart..age on the x axis and use the raw
# frequency as their height.
tab33 |> Vl.@vlplot(
    width=600,
    height=300,
    background="white",
    title=["Fig. 3.5. Incorrect histogram", "of road accident data (tab 3.3)"],
    mark={type="bar"},
    encoding={
        x={
            field="binStart",
            bin={binned=true, step=2},
            axis={title="Age [years]", values=collect(0:10:80)},
        },
        x2={field="age"},  # bin end
        y={field="freq", axis={title="Number of accidents"}},
    },
)

In [None]:
# Frequency divided by the drawn bin width (upper limit minus binStart).
# NOTE(review): with binStart = 0 for the first bin its width is 4, while the
# later bins come out as (group span - 0.001) - confirm the first group is
# meant to be 4 rather than 5 years wide.
tab33[!, :freqPerYearOfAge] = tab33[!, :freq] ./ (tab33[!, :age] .- tab33[!, :binStart]);

In [None]:
# Fig. 3.6: same bins as Fig. 3.5, but the bar height is the frequency
# per year of age (freqPerYearOfAge) instead of the raw frequency.
tab33 |> Vl.@vlplot(
    mark={type="bar"},
    title=["Fig. 3.6. Correct histogram", "of road accident data (tab 3.3)"],
    encoding={
        x={
            field="binStart",
            bin={binned=true, step=2},
            axis={title="Age [years]", values=collect(0:10:80)},
        },
        x2={field="age"},  # bin end
        # The axis title names the plotted quantity: accidents per year of age,
        # not the raw accident count.
        y={field="freqPerYearOfAge", axis={title="Number of accidents per year of age"}},
    },
    background="white",
    width=600,
    height=300,
)

### Tab 3.6

In [None]:
# Table 3.6: bread consumption in grams per person per week.
# The book's table is slightly reshaped here: one row per (breadType, year) pair,
# with the five years listed in order within each bread type.
tab36 = Dfs.DataFrame(
    breadType=repeat(["white", "brown", "wholemeal", "other"], inner=5),
    year=repeat([1960, 1965, 1970, 1975, 1980], outer=4),
    consumption=[
        1040, 975, 915, 785, 620,
        70, 80, 70, 75, 115,
        25, 20, 15, 20, 45,
        155, 80, 85, 75, 105
    ]
)

### Figure 3.16

In [None]:
# Fig. 3.16 (a): grouped bars - one bar per bread type within each year.
tab36 |> Vl.@vlplot(
    mark={type="bar"},
    title=["Fig. 3.16. (a)"],
    encoding={
        x={field="year", type="ordinal", axis={title="Year", labelAngle=0}},
        # Axis title typo fixed: "weak" -> "week".
        y={field="consumption", axis={title=["Consumption of bread", "(g per person per week)"]}},
        # xOffset places the bread types side by side inside each year.
        xOffset={field="breadType", sort=["white", "brown", "wholemeal", "other"]},
        color={field="breadType"},
    },
    background="white",
    width=600,
    height=300,
)

In [None]:
# Fig. 3.16 (b): one bar per year, with consumption summed over the rows of
# that year and coloured by bread type.
tab36 |> Vl.@vlplot(
    mark={type="bar"},
    title=["Fig. 3.16. (b)"],
    encoding={
        x={field="year", type="ordinal", axis={title="Year", labelAngle=0}},
        # Axis title typo fixed: "weak" -> "week".
        y={field="consumption", aggregate="sum", axis={title=["Consumption of bread", "(g per person per week)"]}},
        color={field="breadType", type="nominal"},
    },
    background="white",
    width=600,
    height=300,
)

In [None]:
# 1960 consumption of each bread type, repeated five times so that it lines up
# with the five consecutive rows tab36 holds for that bread type.
tab36.intake1960 = repeat(tab36.consumption[tab36.year .== 1960], inner=5)
# Consumption expressed as a percentage of the same bread type's 1960 value.
tab36.perc1960intake = tab36.consumption ./ tab36.intake1960 .* 100;

In [None]:
# Fig. 3.16 (c): one line per bread type showing consumption as a percentage
# of that type's 1960 value.
tab36 |> Vl.@vlplot(
    mark={type="line", point=true},
    title=["Fig. 3.16. (c)"],
    encoding={
        x={field="year", type="ordinal", axis={title="Year", labelAngle=0}},
        # The underlying consumption figures are per person per week (see the
        # tab36 data and the other Fig. 3.16 panels), so the axis says "per week".
        y={field="perc1960intake", aggregate="mean", axis={title=["Intake per person per week", "(% of 1960 intake)"]}},
        color={field="breadType", type="nominal"},
    },
    background="white",
    width=600,
    height=300,
)