# Ch 3. Describing Data

### Imports

In [None]:
import DataFrames as Dfs
import VegaLite as Vl
import Plots as Plts
import Statistics as Stats
import PyPlot as Pyplt
import Seaborn as Sns

## Data, functions, exploratory analysis, etc.

### Tab 3.1

In [None]:
# Table 3.1: 25 paired observations of age and piMax, one row per observation.
tab31 = Dfs.DataFrame(
    age = [7, 7, 8, 8, 8, 9, 11, 12, 12, 13, 13, 14, 14, 15, 16, 17, 17, 17, 17,
           19, 19, 20, 23, 23, 23],
    piMax = [80, 85, 110, 95, 95, 100, 45, 95, 130, 75, 80, 70, 80, 100, 120,
             110, 125, 75, 100, 40, 75, 110, 150, 75, 95],
)
# Preview the first three rows.
Dfs.first(tab31, 3)

In [None]:
# Per-column summary statistics of tab31.
tab31 |> Dfs.describe

In [None]:
# Fig. 3.15: scatter diagram of tab31 with age on the x axis and piMax on the y axis.
tab31 |> Vl.@vlplot(
    title="Fig. 3.15. Scatter diagram of PImax by age",
    width=600,
    height=300,
    background="white",
    mark={"circle", size=200},
    encoding={
        x={field="age", axis={title="Age [Years]"}},
        y={field="piMax", axis={title="PiMax [cm H₂O]"}},
    },
)

### Tab 3.2

In [None]:
# Table 3.2: frequency table of IgM concentration. Each row is one IgM level
# (`igm`) together with the number of children observed at that level
# (`numOfChildren`). The first 18 levels are 0.1, 0.2, ..., 1.8.
# DataFrames is imported under the alias `Dfs` (there is no `pd` in this file).
tab32 = Dfs.DataFrame((
        igm=vcat(collect(0.1:0.1:1.8), [2.0, 2.1, 2.2, 2.5, 2.7, 4.5]),
        numOfChildren=[3,7,19,27,32,35,38,38,22,16,16,6,7,9,6,2,3,3,3,2,1,1,1,1]))
# Preview the first three rows.
first(tab32, 3)

In [None]:
# Per-column summary of the frequency table. Note that this summarises the
# table's columns as they are; the `igm` statistics are not weighted by
# `numOfChildren`.
Dfs.describe(tab32)

In [None]:
# Fig. 3.7 (a): histogram of IgM levels in 0.1-wide bins, each level weighted
# by its number of children. Plots is imported under the alias `Plts`.
Plts.histogram(tab32[:, :igm], bins=0.0:0.1:4.7, weights=tab32[:, :numOfChildren], legend=false)
Plts.title!("Fig. 3.7. (a) Concentration of IgM in\n298 children aged 6 months to 6 years")
Plts.xlabel!("IgM [g/l]")
Plts.ylabel!("Number of children")

In [None]:
# Fig. 3.7 (b): the same frequencies drawn as a line (number of children
# against IgM level). Plots is imported under the alias `Plts`.
Plts.plot(tab32[:, :igm], tab32[:, :numOfChildren], legend=false)
Plts.title!("Fig. 3.7. (b) Concentration of IgM in\n298 children aged 6 months to 6 years")
Plts.xlabel!("IgM [g/l]")
Plts.ylabel!("Number of children")

In [None]:
# Running total of children up to and including each IgM level ...
cumFreq = cumsum(tab32[:, :numOfChildren])
# ... rescaled to percent of the grand total, so the last entry is 100.
cumFreq = cumFreq ./ cumFreq[end] .* 100
# Fig. 3.10 (a): cumulative percentage drawn as bars in 0.1-wide bins.
# Plots is imported under the alias `Plts`.
Plts.histogram(tab32[:, :igm], bins=0.0:0.1:4.7, weights=cumFreq, legend=false)
Plts.title!("Fig 3.10 (a) Concentration of IgM in\n298 children aged 6 months to 6 years.")
Plts.xlabel!("IgM [g/l]")
Plts.ylabel!("Cumulative frequency [%]")

In [None]:
# Fig. 3.10 (b): cumulative percentage (`cumFreq`, computed in an earlier cell)
# against IgM level, drawn as a line. Plots is imported under the alias `Plts`.
Plts.plot(tab32[:, :igm], cumFreq, legend=false)
Plts.title!("Fig 3.10 (b) Concentration of IgM in\n298 children aged 6 months to 6 years.")
Plts.xlabel!("IgM [g/l]")
Plts.ylabel!("Cumulative frequency [%]")

### Tab 3.3

In [None]:
# Table 3.3: road accident counts by age group. The groups have different
# widths (e.g. 0-4, 25-59, > 60), so each group is recorded by its upper age
# limit; the open-ended last group is assumed to stop at 80.
ages = [4, 9, 15, 16, 17, 19, 24, 59, 80]
# Number of accidents in each age group, in the same order as `ages`.
frequencies = [28, 46, 58, 20, 31, 64, 149, 316, 103];

In [None]:
# Fig. 3.5: bar height = raw frequency, ignoring the unequal group widths
# (which is what makes this histogram "incorrect").
#
# `ages` holds the upper limit of each group, so the bin edges are 0 followed
# by those limits. Each group is fed to the histogram as its midpoint instead
# of its upper limit: a value lying exactly on an edge is counted in the bin to
# its right (bins are left-closed), which would shift every frequency by one
# group and drop the last group entirely.
ageEdges = vcat([0], ages)
ageMidpoints = (ageEdges[1:end-1] .+ ageEdges[2:end]) ./ 2
# Plots is imported under the alias `Plts`.
Plts.histogram(ageMidpoints, bins=ageEdges, weights=frequencies, legend=false)
Plts.title!("Fig. 3.5. Incorrect histogram\nof road accident data (tab 3.3)")
Plts.xlabel!("Age [years]")
Plts.ylabel!("Number of accidents")

In [None]:
# Fig. 3.6: same data as Fig. 3.5, but with `normalize=true` so that bar
# heights account for the unequal group widths.
#
# `ages` holds the upper limit of each group, so the bin edges are 0 followed
# by those limits. Each group is fed to the histogram as its midpoint instead
# of its upper limit: a value lying exactly on an edge is counted in the bin to
# its right (bins are left-closed), which would shift every frequency by one
# group and drop the last group entirely.
ageEdges = vcat([0], ages)
ageMidpoints = (ageEdges[1:end-1] .+ ageEdges[2:end]) ./ 2
# Plots is imported under the alias `Plts`.
Plts.histogram(ageMidpoints, bins=ageEdges, weights=frequencies, legend=false, normalize=true)
Plts.title!("Fig. 3.6. Correct histogram\nof road accident data (tab 3.3)")
Plts.xlabel!("Age [years]")
Plts.ylabel!("Frequency per year of age")

### Tab 3.6

In [None]:
# Table 3.6: bread consumption in grams per person per week.
# The table from the book is reshaped slightly into long format: one row per
# (bread_type, year) pair. Rows are ordered by bread type, with the five years
# listed within each type.
# DataFrames is imported under the alias `Dfs` (there is no `pd` in this file).
tab36 = Dfs.DataFrame((
    bread_type = repeat(["white", "brown", "wholemeal", "other"], inner=5),
    year = repeat([1960, 1965, 1970, 1975, 1980], outer=4),
    consumption = [1040, 975, 915, 785, 620, 70, 80, 70, 75, 115, 25, 20, 15, 20, 45, 155, 80, 85, 75, 105]
))
# Preview the first four rows.
first(tab36, 4)

### Figure 3.16

In [None]:
# Figure 3.16 (a): consumption per year, one bar per bread type.
# Seaborn and PyPlot are imported under the aliases `Sns` and `Pyplt`.
Sns.barplot(x=tab36[!, "year"], y=tab36[!, "consumption"],
            hue=tab36[!, "bread_type"])
Pyplt.legend(title="Bread type")
Pyplt.title("Figure 3.16 (a)")
Pyplt.xlabel("Year")
Pyplt.ylabel("Consumption of bread\n(g per person per week)");

In [None]:
# Figure 3.16 (b): same bar plot as (a) but with `dodge=false`, so the bars of
# the different bread types share one x position per year.
# Seaborn and PyPlot are imported under the aliases `Sns` and `Pyplt`.
Sns.barplot(x=tab36[!, "year"], y=tab36[!, "consumption"],
    hue=tab36[!, "bread_type"], dodge=false)
Pyplt.legend(title="Bread type")
Pyplt.title("Figure 3.16 (b)")
Pyplt.xlabel("Year")
Pyplt.ylabel("Consumption of bread\n(g per person per week)");

In [None]:
# Baseline column: the 1960 consumption of each bread type, repeated five times
# so it lines up with that type's five yearly rows (this relies on tab36 being
# ordered by bread type, five rows per type).
tab36.intake_1960 = repeat(tab36[tab36.year .== 1960, :consumption], inner=5)
# Consumption expressed as a percentage of the same bread type's 1960 value.
tab36.perc_1960_intake = tab36.consumption ./ tab36.intake_1960 .* 100;

In [None]:
# Figure 3.16 (c): intake relative to 1960, one line per bread type, with the
# individual data points overlaid as markers.
# Seaborn and PyPlot are imported under the aliases `Sns` and `Pyplt`.
Sns.lineplot(x=tab36[!, "year"], y=tab36[!, "perc_1960_intake"], 
    hue=tab36[!, "bread_type"])
# legend=false: the line plot already provides the legend entries.
Sns.scatterplot(x=tab36[!, "year"], y=tab36[!, "perc_1960_intake"], 
    hue=tab36[!, "bread_type"], legend=false)
Pyplt.legend(title="Bread type")
Pyplt.ylim((0, 200))
# Put ticks only at the five years present in the table.
Pyplt.xticks(ticks=[1960, 1965, 1970, 1975, 1980])
Pyplt.title("Figure 3.16 (c)")
Pyplt.xlabel("Year")
Pyplt.ylabel("Intake per person per year\n(% of 1960 intake)");