### Introduction

This demonstration uses 2011 Census data, available [here](https://www.ons.gov.uk/census/2011census/2011censusdata/2011censusdatacatalogue).

I have chosen several sub-datasets and, for each one, demonstrate visualisations that suit the data.

1. [Population and household estimates](#section1)

    1.1 [Gender ratio](#section1_1)

    1.2 [Gender by region](#section1_2)

    1.3 [Population by outward postcode using choropleth map](#section1_3)

    1.4 [Population by postcode parent area using choropleth map](#section1_4)

    1.5 [Male/Female ratio by postcode parent area using choropleth map](#section1_5)

### Preparation

In [None]:
# Install required packages
%pip install pandas plotly geojson shapely

In [None]:
# Import required packages
import pandas as pd
import json
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import re
import shapely.geometry
from shapely.ops import unary_union
import geojson

# Make Plotly the backend used by pandas' built-in .plot() API
pd.set_option("plotting.backend", "plotly")

<a id='section1'></a>
### 1. Population and household estimates

In [None]:
# Load data
df1 = pd.read_csv("../data/census2011/population_household.csv")

# Split postcode into outward and inward parts, e.g. LE3 9QP => LE3 and 9QP.
# The slicing assumes the outward code occupies the first four characters
# (space-padded when shorter) -- TODO confirm against the data file.
df1["postcode_out"] = df1["Postcode"].str[:4].str.strip()
df1["postcode_in"] = df1["Postcode"].str[4:].str.strip()

# Keep only the leading non-digit part of the outward code as the parent
# region, e.g. LE3 => LE. The pattern is a raw string so that the backslash
# is passed to the regex engine instead of being read as a string escape.
df1["postcode_region"] = df1["postcode_out"].str.extract(r"^(\D*)", expand=False)

# Remove column [Postcode] to save memory
df1 = df1.drop(columns=["Postcode"])

<a id='section1_1'></a>
#### 1.1 Gender ratio

In [None]:
# Build a two-row table holding the overall male and female head counts
gender_counts = pd.DataFrame(
    {
        "Gender": ["Male", "Female"],
        "Count": [df1[column].sum() for column in ("Males", "Females")],
    }
)

# Draw the split as a pie chart
pie_fig = px.pie(
    gender_counts,
    names="Gender",
    values="Count",
    title="National gender ratio (England and Wales)",
)
pie_fig.show()

# Remove variables to save memory
del gender_counts, pie_fig

<a id='section1_2'></a>
#### 1.2 Gender by region

In [None]:
# Columns shown for every region; these are also the only columns summed
bar_x = ["Total", "Males", "Females"]

# Sum the plotted columns within each postcode region, descending order by
# column [Total]. Selecting the columns before summing keeps the string
# postcode columns out of the aggregation (pandas >= 2.0 no longer drops
# non-numeric columns automatically).
df_temp = df1.groupby("postcode_region")[bar_x].sum().sort_values(by="Total", ascending=False)

# Generate grouped bar chart: one trace per region for the 10 largest regions
bar_data = [
    go.Bar(name=region, x=bar_x, y=counts, text=counts, textposition="auto")
    for region, counts in df_temp.head(10).iterrows()
]
fig_temp = go.Figure(data=bar_data)
fig_temp.update_layout(barmode='group', title="Population by regions (Top 10)")
fig_temp.show()

# Remove variables to save memory
del df_temp, bar_x, bar_data, fig_temp

<a id='section1_3'></a>
#### 1.3 Population by outward postcode using choropleth map
Concatenate the individual postcode GeoJSON files into a single feature collection, `geojson_uk_granulated`.

In [None]:
# Create parent geojson collection object
geojson_uk_granulated = {"type": "FeatureCollection", "features": []}

# Load all geojson files
path_geojson = Path("../data/geojson")
for f_geojson in path_geojson.glob("*.geojson"):
    # GeoJSON is UTF-8 by specification, so do not rely on the platform's
    # default text encoding when reading it
    with open(f_geojson, encoding="utf-8") as f:
        geojson_data = json.load(f)
    # The file is fully parsed at this point, so it no longer needs to be open
    for feature in geojson_data["features"]:
        # Add feature id using properties.name, which is the outward postcode
        feature["id"] = feature["properties"]["name"]
        # Add feature to parent geojson collection
        geojson_uk_granulated["features"].append(feature)

Plot population by outward postcode area. Please note that the open-source geographical data used in this demonstration comes from Wikipedia and does not cover every region of England, which leaves some areas of the map blank (white).

In [None]:
# Sum up all numeric columns within the same postcode outward area.
# numeric_only=True keeps the remaining string postcode columns out of the sum.
df_temp = df1.groupby("postcode_out").sum(numeric_only=True).reset_index()

# Get max population number among areas (upper bound of the colour scale)
p_max = df_temp["Total"].max()

# Plot figure from the aggregate computed above; feature ids in the geojson
# are matched against column [postcode_out]
fig = px.choropleth_mapbox(df_temp,
                           geojson=geojson_uk_granulated,
                           locations='postcode_out',
                           color='Total',
                           color_continuous_scale="jet",
                           range_color=(0, p_max),
                           mapbox_style="carto-positron",
                           zoom=4.5, center={"lat": 52.5, "lon": -1.6},
                           opacity=0.5
                           )
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

# Remove variables to save memory
del df_temp, p_max, fig

<a id='section1_4'></a>
#### 1.4 Population by postcode parent area using choropleth map
In the above plot, the areas are probably too granular to show an obvious trend. Let's merge them into their parent regions.

In [None]:
# Create parent geojson collection object
geojson_uk_regional = {"type": "FeatureCollection", "features": []}

# Load all geojson files
path_geojson = Path("../data/geojson")
for f_geojson in path_geojson.glob("*.geojson"):
    # GeoJSON is UTF-8 by specification, so do not rely on the platform's
    # default text encoding when reading it
    with open(f_geojson, encoding="utf-8") as f:
        geojson_data = json.load(f)

    # Merge granulated geometry areas into parent region.
    # shapely.geometry.shape() replaces asShape(), which was deprecated in
    # Shapely 1.8 and removed in Shapely 2.0.
    merged = unary_union(
        [shapely.geometry.shape(feature["geometry"]) for feature in geojson_data["features"]]
    )

    # Create new geojson feature from merged geometry; the file name (without
    # extension) is used as both the feature name and its id
    geojson_merged = geojson.Feature(geometry=merged,
                                     properties={"name": f_geojson.stem},
                                     id=f_geojson.stem)
    # Add feature to parent geojson collection
    geojson_uk_regional["features"].append(geojson_merged)

In [None]:
# Sum up all numeric columns within the same postcode region.
# numeric_only=True keeps the remaining string postcode columns out of the sum.
df_temp = df1.groupby("postcode_region").sum(numeric_only=True).reset_index()

# Get max population number among regions (upper bound of the colour scale)
p_max = df_temp["Total"].max()

# Plot figure; feature ids in the geojson are matched against column
# [postcode_region]
fig = px.choropleth_mapbox(df_temp,
                           geojson=geojson_uk_regional,
                           locations='postcode_region',
                           color='Total',
                           color_continuous_scale="jet",
                           range_color=(0, p_max),
                           mapbox_style="carto-positron",
                           zoom=4.5, center={"lat": 52.5, "lon": -1.6},
                           opacity=0.5
                           )
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

# Remove variables to save memory
del df_temp, p_max, fig

<a id='section1_5'></a>
#### 1.5 Male/Female ratio by postcode parent area using choropleth map

In [None]:
# Sum up all numeric columns within the same postcode region.
# numeric_only=True keeps the remaining string postcode columns out of the sum.
df_temp = df1.groupby("postcode_region").sum(numeric_only=True).reset_index()

# Calculate M/F ratio and save into column [mf_ratio]
df_temp["mf_ratio"] = df_temp["Males"] / df_temp["Females"]

# Get range for M/F ratio so the colour scale spans exactly the observed values
r_min, r_max = df_temp["mf_ratio"].min(), df_temp["mf_ratio"].max()

# Plot figure; feature ids in the geojson are matched against column
# [postcode_region]
fig = px.choropleth_mapbox(df_temp,
                           geojson=geojson_uk_regional,
                           locations='postcode_region',
                           color='mf_ratio',
                           color_continuous_scale="jet",
                           range_color=(r_min, r_max),
                           mapbox_style="carto-positron",
                           zoom=4.5, center={"lat": 52.5, "lon": -1.6},
                           opacity=0.5
                           )
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

# Remove variables to save memory
del df_temp, r_min, r_max, fig

In [None]:
# Remove dataframe used for section 1; the following section loads its own
# data into df2
del df1

<a id='section2'></a>
### 2. Age structure

In [None]:
# Read sheet KS102EW_Numbers, skipping the first 10 rows and the last row
df2 = pd.read_excel(
    "../data/census2011/age_structure.xls",
    sheet_name="KS102EW_Numbers",
    skiprows=10,
    skipfooter=1,
)

# Rename the 2nd, 3rd and 4th columns to [nation], [region] and [area]
df2 = df2.rename(
    columns={
        df2.columns[position]: label
        for position, label in enumerate(("nation", "region", "area"), start=1)
    }
)

# Drop rows that have no value in [Area code], then renumber the rows from 0
df2 = df2.dropna(subset=["Area code"])
df2 = df2.reset_index(drop=True)

# Display the resulting table
df2