# Analyse cluster population distribution in Sub-Saharan Africa

In [None]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Collect every per-country input CSV found anywhere below ./data (relative
# to the current working directory), sorted by path so that the processing
# order is the same on every run.
files = sorted(Path().glob("data/**/*-1-country-inputs.csv"))
print("Num files", len(files))

## Per-country and aggregate population distribution
Using fixed bins, reporting both the cluster count and the population count in each bin

In [None]:
# Build four tables indexed by population bin, one column per input file
# (the column name is the text before the first "-" in the file name):
#   df_clust_count - number of clusters falling in each bin
#   df_clust_pc    - clust_count divided by its column total
#   df_pop_count   - sum of "Pop" over the clusters in each bin
#   df_pop_pc      - pop_count divided by its column total
#
# NOTE(review): `bins` is not assigned in this cell or in any cell above it,
# so it must already exist when this cell runs. The only assignment visible
# in this notebook is in the histogram cell further down, and that one holds
# log10 exponents (0, 0.5, ..., 8) - presumably not the edges intended for
# raw "Pop" values. Confirm the intended bin edges.
for i, f in enumerate(files):
    co = f.stem.split("-")[0]
    print(co)
    pop = pd.read_csv(f, usecols=["Pop"])

    # Label each cluster with the bin interval its population falls into.
    pop = pop.assign(Bin=pd.cut(pop.Pop, bins=bins))
    clust_count = pop.Bin.value_counts().sort_index().rename(co)
    # observed=False is spelled out so that bins containing no clusters stay
    # in the result as rows; relying on the default is deprecated in pandas.
    pop_count = pop.groupby("Bin", observed=False).sum().Pop.rename(co)

    clust_pc = clust_count / clust_count.sum()
    pop_pc = pop_count / pop_count.sum()

    if i == 0:
        # The first file creates the tables ...
        df_clust_count = pd.DataFrame(clust_count)
        df_clust_pc = pd.DataFrame(clust_pc)
        df_pop_count = pd.DataFrame(pop_count)
        df_pop_pc = pd.DataFrame(pop_pc)
    else:
        # ... and every later file adds one more column to each of them.
        df_clust_count[co] = clust_count
        df_clust_pc[co] = clust_pc
        df_pop_count[co] = pop_count
        df_pop_pc[co] = pop_pc

# Write the finished tables once, after the loop, instead of rewriting all
# four files on every iteration. The guard keeps the "no input files" case
# a no-op, because the tables only exist once a file has been processed.
if files:
    df_clust_count.to_csv("df_clust_count.csv")
    df_clust_pc.to_csv("df_clust_pc.csv")
    df_pop_count.to_csv("df_pop_count.csv")
    df_pop_pc.to_csv("df_pop_pc.csv")

## And a chart for the whole region showing much the same thing

In [None]:
# Stack the "Pop" column of every input file into one long DataFrame `df`,
# with a "country" column holding the text before the first "-" in the file
# name. `df` stays None when there are no input files.
frames = []
for f in files:
    co = f.stem.split("-")[0]
    print(co, " ", end="", flush=True)
    pop = pd.read_csv(f, usecols=["Pop"])
    pop["country"] = co
    frames.append(pop)

# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied all accumulated rows on every iteration. A single concat keeps
# each frame's original row index, exactly as append did.
df = pd.concat(frames) if frames else None

In [None]:
# Histogram of log10("Pop") over all rows of `df`, with a log-scaled count
# axis and x tick labels written as powers of ten.
sns.set()
fig, ax = plt.subplots(figsize=(20, 10))

# Bin edges in log10 units: 0, 0.5, 1.0, ..., 8.0.
bins = [0.5 * step for step in range(17)]
tick_text = [f"$10^{i:.0f}$" for i in bins]
plt.xticks(bins, tick_text)

log_pop = np.log10(df.Pop)
ax.hist(log_pop, bins=bins, log=True, color="#99d8c9")

# Every second tick sits on a half-integer exponent; hide those labels so
# that only the whole powers of ten are shown.
for tick_label in ax.xaxis.get_ticklabels()[1::2]:
    tick_label.set_visible(False)

ax.set_title("Cluster size distribution for Sub-Saharan Africa", fontsize=30)
ax.set_xlabel("Cluster size (people)", fontsize=20)
ax.set_ylabel("Count", fontsize=20)
ax.set_xlim(0, 7)
ax.set_ylim(1, 10**7)
ax.tick_params(axis="both", which="both", labelsize=14)
plt.show()