# 1. Get motivated!

Question 1.1

The most important message is that technological progress is happening faster than expected, and that binary thinking (dividing the world into two groups of countries) is no longer a useful way to describe today's society. The most effective visualization is the dynamic bubble chart: it shows income levels rising and converging across countries, giving a visual sense of both global progress and the inequalities that still exist.

# 2. The Gapminder bubble chart

Question 2.1

In [None]:
import pandas as pd
import altair as alt

# Load the Gapminder table; `parse_dates` asks pandas to parse `year` as dates.
DATA = "https://raw.githubusercontent.com/UofTCoders/workshops-dc-py/master/data/processed/world-data-gapminder.csv"
gapminder = pd.read_csv(DATA, parse_dates=["year"])

# Keep the 1962 rows that have a value for every field the chart encodes.
required_1962 = ["children_per_woman", "life_expectancy", "region", "population"]
is_1962 = gapminder["year"] == "1962-01-01"
gap_1962 = gapminder.loc[is_1962].dropna(subset=required_1962)

# Hover details, in the order they are listed in the tooltip.
tooltip_1962 = [
    alt.Tooltip("country:N", title="Country"),
    alt.Tooltip("region:N", title="Region"),
    alt.Tooltip("children_per_woman:Q", title="Children per woman", format=".2f"),
    alt.Tooltip("life_expectancy:Q", title="Life expectancy", format=".1f"),
    alt.Tooltip("population:Q", title="Population", format=",.0f"),
]

# Bubble chart: fertility on x, life expectancy on y, colour by region,
# bubble size by population.
bubbles_1962 = alt.Chart(gap_1962).mark_circle(opacity=0.7)
bubbles_1962 = bubbles_1962.encode(
    x=alt.X("children_per_woman:Q", title="Children per woman"),
    y=alt.Y("life_expectancy:Q", title="Life expectancy"),
    color=alt.Color("region:N", title="Region"),
    size=alt.Size("population:Q", title="Population"),
    tooltip=tooltip_1962,
)
chart_1962 = bubbles_1962.properties(
    width=650,
    height=420,
    title="Gapminder-style bubble chart (1962)",
)

chart_1962


Question 2.2

In [None]:
# %pip install rpy2
# %load_ext rpy2.ipython

In [None]:
%%R -w 550 -h 375

library(readr)
library(dplyr)
library(ggplot2)

# Read the Gapminder table into the R session; later %%R cells reuse this
# `gapminder` object.
gapminder <- read_csv("https://raw.githubusercontent.com/UofTCoders/workshops-dc-py/master/data/processed/world-data-gapminder.csv")

# Bubble chart for 1962: children per woman vs. life expectancy, coloured by
# region and sized by population.
gapminder %>%
  # Compare against a number instead of the string "1962" so the filter does
  # not rely on implicit numeric-to-character coercion (the later 1962 chart
  # in this notebook already filters with `year == 1962`).
  filter(year == 1962) %>%
  ggplot(aes(
    x = children_per_woman,
    y = life_expectancy,
    color = region,
    size = population
  )) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Gapminder-style bubble chart (1962)",
    x = "Children per woman",
    y = "Life expectancy",
    # Legend titles match the Altair version of this chart.
    color = "Region",
    size = "Population"
  ) +
  theme_minimal()

# 3. Education balance

Question 3.1


In [None]:
import pandas as pd
import altair as alt

# Ratio of women's to men's years in school, stored as a new column on a copy
# of the table.
gapminder = gapminder.assign(
    edu_ratio_women_men=gapminder["years_in_school_women"] / gapminder["years_in_school_men"]
)

# Restrict to 1970-2015 and to rows where both the ratio and the income group
# are present.
in_window = gapminder["year"].dt.year.between(1970, 2015)
df_edu = gapminder.loc[in_window].dropna(subset=["edu_ratio_women_men", "income_group"])

# Average ratio for every (income group, year) pair.
edu_mean = (
    df_edu.groupby(["income_group", "year"], as_index=False)["edu_ratio_women_men"]
    .mean()
    .rename(columns={"edu_ratio_women_men": "mean_ratio"})
)

# Encodings shared by the line layer and the square-marker layer.
hover_fields = [
    alt.Tooltip("income_group:N", title="Income group"),
    alt.Tooltip("year:T", title="Year"),
    alt.Tooltip("mean_ratio:Q", title="Mean ratio", format=".3f"),
]
base = alt.Chart(edu_mean).encode(
    x=alt.X("year:T", title="Year"),
    y=alt.Y("mean_ratio:Q", title="Mean (women's years / men's years)"),
    color=alt.Color("income_group:N", title="Income group"),
    tooltip=hover_fields,
)

line = base.mark_line()
points = base.mark_square(size=60)

# Layer the markers on top of the lines.
chart = alt.layer(line, points).properties(
    width=700,
    height=420,
    title="Education balance over time",
)

chart


Question 3.2

In [None]:
%%R -w 550 -h 375

library(readr)
library(dplyr)
library(ggplot2)

# Mean women-to-men schooling ratio per income group and year, 1970-2015.
edu_mean <- gapminder %>%
  mutate(edu_ratio_women_men = years_in_school_women / years_in_school_men) %>%
  filter(year >= 1970 & year <= 2015) %>%
  # Drop rows without a ratio or without an income group before aggregating,
  # as the Python version of this chart and the bootstrap-CI R cell do.
  # Otherwise all-missing groups yield NaN means and a missing income group
  # would be summarised as a group of its own.
  filter(!is.na(edu_ratio_women_men), !is.na(income_group)) %>%
  group_by(income_group, year) %>%
  summarise(
    mean_ratio = mean(edu_ratio_women_men),
    .groups = "drop"
  )

# One line per income group, with square markers (shape 15) on each year.
ggplot(
  edu_mean,
  aes(x = year, y = mean_ratio, color = income_group, group = income_group)
) +
  geom_line() +
  geom_point(shape = 15, size = 2.5) +
  labs(
    title = "Education balance over time",
    x = "Year",
    y = "Mean",
    # Legend title matches the Altair version of this chart.
    color = "Income group"
  ) +
  theme_minimal()


Question 3.3 (Optional)

In [None]:
import numpy as np
import altair as alt

def bootstrap_mean_ci(x: np.ndarray, n_boot: int = 300, ci: float = 0.95, seed: int = 0):
    """Percentile bootstrap confidence interval for the mean of ``x``.

    NaN entries are discarded first. The remaining values are resampled with
    replacement ``n_boot`` times (each resample has as many values as remain),
    and the interval is read off the quantiles of the resample means.

    Args:
        x: Numeric array; NaN entries are ignored.
        n_boot: Number of bootstrap resamples.
        ci: Confidence level, e.g. 0.95 for a 95% interval.
        seed: Seed for the random generator, so results are reproducible.

    Returns:
        ``(lower, upper)`` bounds, or ``(nan, nan)`` when no non-NaN values
        remain.
    """
    observed = x[~np.isnan(x)]
    n_obs = len(observed)
    if n_obs == 0:
        return (np.nan, np.nan)

    generator = np.random.default_rng(seed)
    # One row per resample; averaging each row gives the bootstrap means.
    resamples = generator.choice(observed, size=(n_boot, n_obs), replace=True)
    boot_means = resamples.mean(axis=1)

    # Split the excluded probability mass evenly between the two tails.
    tail = (1 - ci) / 2
    lower = np.quantile(boot_means, tail)
    upper = np.quantile(boot_means, 1 - tail)
    return (lower, upper)

# One bootstrap interval per (income group, year) cell of `df_edu`.
ci_fields = ("income_group", "year", "ci_lower", "ci_upper")
ci_rows = [
    dict(
        zip(
            ci_fields,
            (
                *group_key,
                *bootstrap_mean_ci(
                    cell["edu_ratio_women_men"].to_numpy(), n_boot=300, ci=0.95, seed=0
                ),
            ),
        )
    )
    for group_key, cell in df_edu.groupby(["income_group", "year"])
]

edu_ci = pd.DataFrame(ci_rows)

# Attach the interval bounds to the per-group means computed earlier.
edu_summary = pd.merge(edu_mean, edu_ci, how="left", on=["income_group", "year"])

# Encodings shared by the band, the line and the markers.
hover_fields = [
    alt.Tooltip("income_group:N", title="Income group"),
    alt.Tooltip("year:T", title="Year"),
    alt.Tooltip("mean_ratio:Q", title="Mean ratio", format=".3f"),
    alt.Tooltip("ci_lower:Q", title="CI lower", format=".3f"),
    alt.Tooltip("ci_upper:Q", title="CI upper", format=".3f"),
]
base = alt.Chart(edu_summary).encode(
    x=alt.X("year:T", title="Year"),
    color=alt.Color("income_group:N", title="Income group"),
    tooltip=hover_fields,
)

# Shaded band between the interval bounds, with the mean drawn on top.
mean_field = "mean_ratio:Q"
band = base.mark_area(opacity=0.2).encode(
    y=alt.Y("ci_lower:Q", title="Mean"),
    y2="ci_upper:Q",
)
line = base.mark_line().encode(y=mean_field)
points = base.mark_square(size=60).encode(y=mean_field)

chart_ci = (band + line + points).properties(
    width=700,
    height=420,
    title="Education balance over time with 95% bootstrap CI",
)

chart_ci


Question 3.4 (Optional)

In [15]:
#%%R
#install.packages("Hmisc", repos = "https://cloud.r-project.org")

In [None]:
%%R -w 550 -h 375

library(readr)
library(dplyr)
library(ggplot2)
library(Hmisc)

# Country-level schooling ratio for 1970-2015, keeping only rows that have
# both a ratio and an income group.
edu_df <- gapminder %>%
  mutate(edu_ratio_women_men = years_in_school_women / years_in_school_men) %>%
  filter(
    year >= 1970 & year <= 2015,
    !is.na(edu_ratio_women_men),
    !is.na(income_group)
  )

# ggplot computes the summaries itself: a ribbon from mean_cl_boot, plus the
# per-year mean drawn as a line with square markers (shape 15).
ggplot(
  edu_df,
  aes(
    x = year,
    y = edu_ratio_women_men,
    color = income_group,
    fill = income_group,
    group = income_group
  )
) +
  geom_ribbon(
    stat = "summary",
    fun.data = mean_cl_boot,
    alpha = 0.2,
    color = NA
  ) +
  stat_summary(fun = mean, geom = "line", linewidth = 1) +
  stat_summary(fun = mean, geom = "point", shape = 15, size = 2.5) +
  labs(
    title = "Education balance over time with 95% bootstrap CI",
    x = "Year",
    y = "Mean"
  ) +
  theme_minimal()


# 4. Family planning

Question 4.1

In [None]:
import pandas as pd
import altair as alt

# Years to show, one facet panel each. `gapminder["year"]` holds datetimes in
# this part of the notebook, so the integers are converted before matching.
years_keep = [1918, 1938, 1958, 1978, 1998, 2018]
years_keep_dt = pd.to_datetime(years_keep, format="%Y")

# Keep the selected years and drop rows missing any field the chart uses.
df_fp = gapminder[gapminder["year"].isin(years_keep_dt)].copy()
df_fp = df_fp.dropna(subset=["children_per_woman", "child_mortality", "income_group", "year"])

chart_41 = (
    alt.Chart(df_fp)
    .mark_circle(filled=True, size=60, opacity=0.75)
    .encode(
        x=alt.X("children_per_woman:Q", title="Children per woman"),
        y=alt.Y("child_mortality:Q", title="Child mortality"),
        color=alt.Color("income_group:N", title="Income group"),
        tooltip=[
            alt.Tooltip("country:N", title="Country"),
            alt.Tooltip("income_group:N", title="Income group"),
            alt.Tooltip("children_per_woman:Q", title="Children per woman", format=".2f"),
            alt.Tooltip("child_mortality:Q", title="Child mortality", format=".1f"),
            alt.Tooltip("year:T", title="Year"),
        ],
    )
    .facet(
        # Format the panel headers explicitly as a four-digit year instead of
        # relying on the default rendering of a temporal value.
        facet=alt.Facet("year:T", title=None, header=alt.Header(format="%Y")),
        columns=3,
    )
    # Capitalised to match the ggplot2 version of this chart and the other
    # chart titles in the notebook.
    .properties(title="Children per woman and child mortality")
)

chart_41

Question 4.2

In [None]:
%%R -w 800 -h 450

library(readr)
library(dplyr)
library(ggplot2)

# Years to show, one facet panel each.
years_keep <- c(1918, 1938, 1958, 1978, 1998, 2018)

# Keep the selected years and drop rows missing any plotted variable.
df_fp <- gapminder %>%
  filter(
    year %in% years_keep,
    !is.na(children_per_woman),
    !is.na(child_mortality),
    !is.na(income_group)
  )

# Open-circle scatter (shape 1) of fertility against child mortality,
# laid out three panels per row.
ggplot(
  df_fp,
  aes(x = children_per_woman, y = child_mortality, color = income_group)
) +
  geom_point(shape = 1, size = 2.2, alpha = 0.8) +
  facet_wrap(vars(year), ncol = 3) +
  labs(
    title = "Children per woman and child mortality",
    x = "Children per woman",
    y = "Child mortality",
    color = "Income group"
  ) +
  theme_minimal()

# 5. Carbon dioxide emissions

Question 5.1

In [None]:
import pandas as pd
import altair as alt

# Most recent year that has any CO2-per-capita observation.
has_co2 = gapminder["co2_per_capita"].notna()
latest_year = gapminder.loc[has_co2, "year"].max()

# Countries with a CO2 value in that year, then the 40 largest emitters per capita.
df_latest = gapminder[(gapminder["year"] == latest_year) & has_co2]
top40 = df_latest.nlargest(40, "co2_per_capita")

# Show only the calendar year in the title. `latest_year` is a Timestamp when
# `year` was parsed as dates; fall back to the raw value otherwise.
year_label = getattr(latest_year, "year", latest_year)

chart_51 = (
    alt.Chart(top40)
    .mark_bar()
    .encode(
        x=alt.X("co2_per_capita:Q", title="CO₂ emissions per capita"),
        y=alt.Y(
            "country:N",
            sort=alt.SortField(field="co2_per_capita", order="ascending"),
            title="Country",
        ),
        color=alt.Color("region:N", title="Region"),
        tooltip=[
            alt.Tooltip("country:N", title="Country"),
            alt.Tooltip("region:N", title="Region"),
            alt.Tooltip("co2_per_capita:Q", title="CO₂ per capita", format=".2f"),
        ],
    )
    .properties(
        width=700,
        height=600,
        # The original f-string had no placeholder; include the year the data
        # refers to, as the ggplot2 version of this chart does.
        title=f"Top 40 countries by CO2 emissions per capita ({year_label})",
    )
)

chart_51


Question 5.2

In [None]:
%%R -w 500 -h 800

library(dplyr)
library(ggplot2)

# Most recent year that has any CO2-per-capita observation.
latest_year <- gapminder %>%
  filter(!is.na(co2_per_capita)) %>%
  summarise(max_year = max(year)) %>%
  pull(max_year)

# Countries with a CO2 value in that year.
df_latest <- gapminder %>%
  filter(
    year == latest_year,
    !is.na(co2_per_capita)
  )

# slice_max() keeps ties by default and could return more than 40 rows;
# with_ties = FALSE guarantees at most 40, like pandas nlargest(40) in the
# Python version of this chart.
top40 <- df_latest %>%
  slice_max(order_by = co2_per_capita, n = 40, with_ties = FALSE) %>%
  arrange(co2_per_capita)

# Horizontal bars; the bar order comes from reorder() on the y aesthetic.
ggplot(
  top40,
  aes(
    x = co2_per_capita,
    y = reorder(country, -co2_per_capita),
    fill = region
  )
) +
  geom_col(alpha = 0.8) +
  labs(
    title = paste0(
      "Top 40 Countries by CO2 Emissions per Capita (",
      latest_year,
      ")"
    ),
    x = "CO₂ Emissions per Capita",
    y = "Country",
    fill = "Region"
  ) +
  theme_minimal(base_size = 14)


Question 5.3

In [None]:
import pandas as pd
import altair as alt

# Total emissions per country-year: per-capita value times population.
gapminder["co2_total"] = gapminder["co2_per_capita"].mul(gapminder["population"])

# Sum the totals within each (year, region), ignoring rows with no total.
region_co2 = (
    gapminder.dropna(subset=["co2_total"])
    .groupby(["year", "region"], as_index=False)["co2_total"]
    .sum()
)

# Stacked area chart, one coloured band per region.
area_tooltip = [
    alt.Tooltip("region:N"),
    alt.Tooltip("co2_total:Q", format=",.2f"),
]
stacked_areas = alt.Chart(region_co2).mark_area().encode(
    x=alt.X("year:T", title="Year"),
    y=alt.Y("co2_total:Q", title="Total CO₂ emissions", stack="zero"),
    color=alt.Color("region:N", title="Region"),
    tooltip=area_tooltip,
)
area_chart = stacked_areas.properties(
    width=700,
    height=400,
    title="Total CO2 emissions by region over time",
)

area_chart

Question 5.4


In [None]:
%%R -w 550 -h 375

library(readr)
library(dplyr)
library(ggplot2)

# Total emissions per country-year: per-capita value times population.
gapminder <- mutate(gapminder, co2_total = co2_per_capita * population)

# Sum the totals within each (year, region), skipping rows with no total.
region_co2 <- gapminder %>%
  filter(!is.na(co2_total)) %>%
  group_by(year, region) %>%
  summarise(co2_total = sum(co2_total), .groups = "drop")

# Stacked area chart, one filled band per region.
ggplot(
  region_co2,
  aes(x = year, y = co2_total, fill = region)
) +
  geom_area(position = "stack") +
  labs(
    title = "Total CO2 emissions by region over time",
    x = "Year",
    y = "Total CO₂ emissions",
    fill = "Region"
  ) +
  theme_minimal()


# 6. Income distribution

Question 6.1 (Optional)


In [None]:
import pandas as pd
import altair as alt

# Years to compare, one histogram panel each.
years_keep = [1979, 1991, 2003, 2015]

# Integer calendar year, derived from the datetime `year` column.
gapminder["year_int"] = gapminder["year"].dt.year

# Income values for the selected years, without missing incomes.
in_selected_years = gapminder["year_int"].isin(years_keep)
df_income = gapminder[in_selected_years][["year_int", "income"]].dropna(subset=["income"])

# Histogram of income with up to 30 bins; bar height is the number of rows
# (countries) in each bin.
income_bars = alt.Chart(df_income).mark_bar().encode(
    x=alt.X("income:Q", bin=alt.Bin(maxbins=30), title="Income"),
    y=alt.Y("count():Q", title="Count of countries"),
    tooltip=[alt.Tooltip("count():Q", title="Count")],
)
hist = income_bars.properties(width=170, height=220)

# One column per year, in the order given by `years_keep`.
year_columns = alt.Column("year_int:O", title=None, sort=years_keep)
facet_hist = hist.facet(column=year_columns).properties(
    title="Income distribution across years"
)

facet_hist


From 1979 to 2015, the income distribution shifted significantly to the right: the number of low-income countries decreased, while the number of middle- and higher-income countries increased. This trend is consistent with Rosling's 2003 prediction of the world income distribution in 2015. Because a plain histogram is used here instead of a logarithmic density plot, the shape of the distribution is less clear.

Question 6.2 (Optional)

In [None]:
%%R -w 1000 -h 250

library(readr)
library(dplyr)
library(ggplot2)

# Re-read the Gapminder table, silencing readr's column-type message.
gapminder <- read_csv(
  "https://raw.githubusercontent.com/UofTCoders/workshops-dc-py/master/data/processed/world-data-gapminder.csv",
  show_col_types = FALSE
)

# Years to compare, one histogram panel each.
years_keep <- c(1979, 1991, 2003, 2015)

# Derive an integer year whether `year` was read as a number or as a date,
# then keep the selected years with a known income.
gap_income <- gapminder %>%
  mutate(
    year_int = if (is.numeric(year)) {
      as.integer(year)
    } else {
      as.integer(format(as.Date(year), "%Y"))
    }
  ) %>%
  filter(
    year_int %in% years_keep,
    !is.na(income)
  )

# 30-bin income histograms, all panels side by side in a single row.
ggplot(gap_income, aes(x = income)) +
  geom_histogram(bins = 30) +
  facet_wrap(vars(year_int), nrow = 1) +
  labs(
    title = "Income distribution across years",
    x = "Income",
    y = "Count of countries"
  ) +
  theme_minimal()


This trend is consistent with the prediction. From 1979 to 2015, the income distribution shifted to the right: the number of low-income countries decreased and the number of high-income countries increased. The overall direction of change supports the predicted trend.

# 7. Chart beautification


Question 7.1

In [None]:
import pandas as pd
import altair as alt

# Reload the table, this time coercing `year` to a number (unparseable
# values become NaN).
DATA = "https://raw.githubusercontent.com/UofTCoders/workshops-dc-py/master/data/processed/world-data-gapminder.csv"
gapminder = pd.read_csv(DATA)
gapminder["year"] = pd.to_numeric(gapminder["year"], errors="coerce")

# Rows for the chosen year that have every field the chart uses.
year_to_plot = 1962
needed_columns = ["children_per_woman", "life_expectancy", "region", "population", "country"]
df = gapminder.loc[gapminder["year"] == year_to_plot].dropna(subset=needed_columns)

# Axis domains are pinned to the observed range of each variable.
x_min, x_max = (float(v) for v in df["children_per_woman"].agg(["min", "max"]))
y_min, y_max = (float(v) for v in df["life_expectancy"].agg(["min", "max"]))

x_axis = alt.X(
    "children_per_woman:Q",
    title="Children per Woman",
    scale=alt.Scale(domain=[x_min, x_max]),
)
y_axis = alt.Y(
    "life_expectancy:Q",
    title="Life Expectancy",
    scale=alt.Scale(domain=[y_min, y_max]),
)
# Widen the bubble size range from the default.
bubble_size = alt.Size(
    "population:Q",
    title="Population",
    scale=alt.Scale(range=[80, 3000]),
)
hover_fields = [
    alt.Tooltip("country:N", title="Country"),
    alt.Tooltip("region:N", title="Region"),
    alt.Tooltip("children_per_woman:Q", title="Children per Woman", format=".2f"),
    alt.Tooltip("life_expectancy:Q", title="Life Expectancy", format=".1f"),
    alt.Tooltip("population:Q", title="Population", format=",.0f"),
]

chart = (
    alt.Chart(df)
    .mark_circle(opacity=0.7)
    .encode(
        x=x_axis,
        y=y_axis,
        color=alt.Color("region:N", title="Region"),
        size=bubble_size,
        tooltip=hover_fields,
    )
    .properties(
        title=f"Children per Woman vs Life Expectancy (Gapminder Style, {year_to_plot})",
        width=650,
        height=420,
    )
    # Larger fonts for axis titles, chart title and legend.
    .configure_axis(titleFontSize=18, labelFontSize=12)
    .configure_title(fontSize=22)
    .configure_legend(titleFontSize=14, labelFontSize=12)
)

chart

Question 7.2

In [None]:
%%R -w 550 -h 375

library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
# Re-read the Gapminder table for this cell.
gapminder <- read_csv(
  "https://raw.githubusercontent.com/UofTCoders/workshops-dc-py/master/data/processed/world-data-gapminder.csv"
)

# 1962 rows with every plotted variable present.
df_1962 <- gapminder %>%
  filter(year == 1962) %>%
  filter(
    !is.na(children_per_woman),
    !is.na(life_expectancy),
    !is.na(region),
    !is.na(population)
  )

# Bubble chart with an explicit size range, readable labels and larger fonts.
ggplot(
  df_1962,
  aes(
    x = children_per_woman,
    y = life_expectancy,
    color = region,
    size = population
  )
) +
  geom_point(alpha = 0.6) +
  scale_size(range = c(2, 14)) +
  labs(
    title = "Children per Woman vs Life Expectancy (1962)",
    x = "Children per Woman",
    y = "Life Expectancy",
    color = "Region",
    size = "Population"
  ) +
  theme_bw() +
  # Font sizes: base text 14, title 18, axis titles 16, legend 14/12.
  theme(
    text = element_text(size = 14),
    plot.title = element_text(size = 18),
    axis.title = element_text(size = 16),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )
