In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

In [None]:
# Add the parent directory (resolved to an absolute path) to the import search
# path before the project imports below run — presumably so that the `ners`
# package resolves when the notebook is started from its own folder.
sys.path.append(os.path.abspath(".."))

from ners.core.config import setup_config
from ners.core.utils.data_loader import DataLoader
# NOTE(review): PipelineConfig is not referenced by any cell visible in this notebook.
from ners.core.config.pipeline_config import PipelineConfig

In [None]:
# Build the project configuration and a DataLoader bound to it.
# Both objects are used by the data-loading cell that follows.
config = setup_config()
loader = DataLoader(config)

In [None]:
# Province shapes, read from the OSM shapefile.
gdf = gpd.read_file("../../assets/osm/provinces.shp")
# Centroids are computed on a copy reprojected to EPSG:32732 and then converted
# back to the shapefile's own CRS, so the stored points share the CRS of `gdf`.
# NOTE(review): EPSG:32732 is presumably chosen as a metric (projected) CRS for
# the centroid computation — confirm it suits the study area.
gdf_proj = gdf.to_crs(epsg=32732)
gdf["centroid"] = gdf_proj.geometry.centroid.to_crs(gdf.crs)

# Name records, loaded through the project DataLoader from the configured data directory.
df = loader.load_csv_complete(config.paths.data_dir / "names_featured.csv")

## Breakdown by Province

In [None]:
import unicodedata


def clean_province(s):
    """Normalise a Series of province names so they can be compared as plain strings.

    Every value is upper-cased and stripped of surrounding whitespace, then
    decomposed with Unicode NFKD and reduced to its ASCII characters, which
    removes accents (for example "Kasaï" becomes "KASAI"). Values that are not
    strings after the upper/strip step, such as missing values, are returned
    unchanged.

    Parameters
    ----------
    s : pandas.Series
        Province names.

    Returns
    -------
    pandas.Series
        The cleaned names, carrying the same index as ``s``.
    """

    def _to_ascii(value):
        # Missing entries arrive here as non-strings; hand them back untouched.
        if not isinstance(value, str):
            return value
        decomposed = unicodedata.normalize("NFKD", value)
        return decomposed.encode("ascii", errors="ignore").decode("utf-8")

    upper_trimmed = s.str.upper().str.strip()
    return upper_trimmed.apply(_to_ascii)


# Normalise the province labels on both sides so the join below compares equal strings.
df["province"] = clean_province(df["province"])
gdf["province"] = clean_province(gdf["name"])

# Number of records per cleaned province, as a two-column frame (province, count).
counts = df["province"].value_counts().rename_axis("province").reset_index(name="count")

# Attach the counts to the shapes; the left join keeps provinces without records
# (their count is NaN). A "count" column left over from a previous run of this
# cell is dropped first: merging a second time would otherwise produce
# count_x / count_y and leave no "count" column for the plots that follow.
gdf = gdf.drop(columns="count", errors="ignore").merge(
    counts, on="province", how="left"
)

In [None]:
# Preview the first 12 rows of the GeoDataFrame after the per-province counts were merged in.
gdf.head(12)

In [None]:
# Choropleth of the number of records per province, labelled with the province names.
fig, ax = plt.subplots(figsize=(10, 10))
gdf.plot(column="count", cmap="Blues", edgecolor="black", legend=True, ax=ax)

# Only provinces that received a count from the merge get a label.
labels = gdf.loc[gdf["count"].notna(), ["province", "centroid"]]
for province_name, point in zip(labels["province"], labels["centroid"]):
    ax.annotate(
        text=province_name,
        xy=(point.x, point.y),
        ha="center",
        va="center",
        fontsize=8,
        color="black",
    )

ax.axis("off")
plt.show()

## Distribution

In [None]:
# Share of records per province as a pie chart.
provinces = df["province"].value_counts()

fig, ax = plt.subplots(figsize=(7, 7))
ax.pie(provinces, labels=provinces.index, autopct="%1.1f%%", startangle=140)
ax.axis("equal")  # keep the pie circular
plt.show()

In [None]:
# Bar chart of the number of records per province (uses `provinces` from the previous cell).
fig, ax = plt.subplots(figsize=(10, 5))
provinces.plot.bar(ax=ax, rot=45)
ax.set_xlabel("Province")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

In [None]:
# Record counts per (province, sex), largest groups first.
# groupby already yields one row per (province, sex) pair, so no de-duplication is needed.
data = (
    df.groupby(["province", "sex"], observed=False)
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

# Grouped bar chart: one cluster per province, one bar per sex.
plt.figure(figsize=(12, 6))
sns.barplot(data=data, x="province", y="count", hue="sex")

plt.xlabel("Region")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.legend(title="Sex")
plt.tight_layout()
plt.show()

In [None]:
# Overall share of records per sex as a pie chart.
sex = df["sex"].value_counts()

fig, ax = plt.subplots(figsize=(7, 7))
ax.pie(sex, labels=sex.index, autopct="%1.1f%%", startangle=140)
ax.axis("equal")  # keep the pie circular
plt.show()

In [None]:
# One pie chart per province showing its split by sex, laid out on a three-column grid.
df_provinces_sex_counts = (
    df.groupby(["province", "sex"], observed=False).size().reset_index(name="count")
)
provinces = df_provinces_sex_counts["province"].unique()
cols = 3
rows = -(-len(provinces) // cols)  # ceiling division: enough rows to hold every province

fig = plt.figure(figsize=(cols * 5, rows * 5))
for position, province in enumerate(provinces, start=1):
    in_province = df_provinces_sex_counts["province"] == province
    data = df_provinces_sex_counts.loc[in_province]
    ax = fig.add_subplot(rows, cols, position)
    ax.pie(data["count"], labels=data["sex"], autopct="%1.1f%%")
    ax.set_title(province)
    ax.axis("equal")

plt.tight_layout()
plt.show()

## Names

In [None]:
# Split names into surname, middle name and first name.
# Only rows whose identified_category is "simple" are kept; afterwards rows whose
# first name is missing or blank are dropped.
df_simple = df.loc[df["identified_category"] == "simple"].copy()

# The first two whitespace-separated tokens of identified_name become surname and middle name.
name_tokens = df_simple["identified_name"].str.split()
df_simple["surname"] = name_tokens.str[0]
df_simple["middle_name"] = name_tokens.str[1]
# NOTE(review): the first name is read from the "identified_surname" column — confirm this mapping is intended.
df_simple["first_name"] = df_simple["identified_surname"]

has_first_name = df_simple["first_name"].notna() & df_simple["first_name"].str.strip().ne("")
df_simple = df_simple[has_first_name]

df_simple[["name", "surname", "middle_name", "first_name"]].head()

In [None]:
# Horizontal count plot restricted to the ten most frequent first names.
top_ten_first_names = df_simple["first_name"].value_counts().head(10).index

sns.countplot(y=df_simple["first_name"], order=top_ten_first_names)
plt.title("Top 10 first names (simple names only)")
plt.xlabel("Count")
plt.ylabel("First name")
plt.tight_layout()
plt.show()

In [None]:
# Frequency table of first names, most frequent first; show the top ten rows.
firstnames = df_simple["first_name"].value_counts()
df_firstnames = firstnames.rename_axis("First name").reset_index(name="quantity")
df_firstnames.head(10)

In [None]:
# Most common first name by province, written on top of the record-count choropleth.
# The join uses the cleaned "province" column that exists on both df_simple and gdf
# (the merged GeoDataFrame); no other province key is defined in this notebook.
top_names = (
    df_simple.groupby(["province", "first_name"]).size().reset_index(name="count")
)
# Sort so the most frequent name comes first within each province, then keep that row.
top_names = top_names.sort_values(["province", "count"], ascending=[True, False])
top_names = top_names.drop_duplicates(subset="province", keep="first")

# Only the name is merged in; the "count" plotted below is the per-province record
# count already present on gdf.
gdf_named = gdf.merge(
    top_names[["province", "first_name"]],
    on="province",
    how="left",
)

fig, ax = plt.subplots(figsize=(12, 12))
gdf_named.plot(column="count", cmap="Blues", edgecolor="black", legend=True, ax=ax)

for _, row in gdf_named.iterrows():
    # Provinces without any simple name have no label.
    if pd.notna(row["first_name"]):
        centroid = row["geometry"].centroid
        ax.text(
            centroid.x,
            centroid.y,
            f"{row['first_name']}",
            horizontalalignment="center",
            fontsize=8,
            color="black",
        )

ax.set_title("Most common first name by province")
ax.axis("off")
plt.show()

In [None]:
# Total number of distinct first names. nunique() counts each name once, so this
# total is comparable with the three category sizes printed below (counting
# non-null rows would report the number of records instead).
total_firstnames = df_simple["first_name"].nunique()
print(f">> Total number of first names registered : {total_firstnames}")

# Cross-tabulate first names against sex: one row per first name, one column per
# sex value, each cell holding the number of records.
# observed=False matches the other groupby calls in this notebook.
firstnames_by_sex = (
    df_simple.groupby(["first_name", "sex"], observed=False)
    .size()
    .unstack(fill_value=0)
)

# A name is male-only / female-only when it occurs for one sex and never for the
# other, and neutral when it occurs for both.
male_firstnames = firstnames_by_sex[
    (firstnames_by_sex["m"] > 0) & (firstnames_by_sex["f"] == 0)
]
print(f">> Number of all-male first names : {len(male_firstnames)}")
female_firstnames = firstnames_by_sex[
    (firstnames_by_sex["f"] > 0) & (firstnames_by_sex["m"] == 0)
]
print(f">> Number of all-female first names : {len(female_firstnames)}")
neutral_firstnames = firstnames_by_sex[
    (firstnames_by_sex["m"] > 0) & (firstnames_by_sex["f"] > 0)
]
print(f">> Number of all-neutral first names : {len(neutral_firstnames)}")

In [None]:
# Sizes of the three first-name categories; `labels` and `values` are reused by the bar chart further down.
labels = ["Male", "Female", "Neutral"]
values = [
    len(category)
    for category in (male_firstnames, female_firstnames, neutral_firstnames)
]

# Pie chart of the category shares.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    values,
    labels=labels,
    autopct="%1.1f%%",
    startangle=140,
    colors=["skyblue", "lightgreen", "lightgray"],
)
ax.set_title("Distribution of first names by sex")
ax.axis("equal")
plt.show()

In [None]:
# Assign each record the sex category of its first name.
# male_firstnames / female_firstnames / neutral_firstnames are DataFrames indexed by
# first name, so membership has to be looked up in their index: `x in dataframe`
# tests the column labels and would leave every record uncategorised.
# The three indexes are mutually exclusive, so the combined lookup has unique keys.
first_name_category = pd.concat(
    [
        pd.Series("Male", index=male_firstnames.index),
        pd.Series("Female", index=female_firstnames.index),
        pd.Series("Neutral", index=neutral_firstnames.index),
    ]
)
df_simple["first_name_sex_category"] = df_simple["first_name"].map(first_name_category)

# Records per (province, category); the dominant category is the one with the most records.
df_cat = df_simple.dropna(subset=["first_name_sex_category"])
category_counts = (
    df_cat.groupby(["province", "first_name_sex_category"])
    .size()
    .unstack(fill_value=0)
)
category_counts["dominant_category"] = category_counts.idxmax(axis=1)

# Join on the cleaned "province" key shared by df_simple and gdf.
map_df = gdf.set_index("province").join(category_counts["dominant_category"])

color_map = {"Male": "skyblue", "Female": "lightgreen", "Neutral": "lightgray"}
# Provinces without a dominant category are drawn in white rather than passing NaN as a colour.
map_df["color"] = map_df["dominant_category"].map(color_map).fillna("white")

fig, ax = plt.subplots(1, 1, figsize=(10, 8))
map_df.plot(color=map_df["color"], edgecolor="black", ax=ax)
for _, row in map_df.iterrows():
    # Skip provinces without data so no "nan" label is drawn.
    if pd.notna(row["dominant_category"]):
        ax.annotate(
            text=row["dominant_category"],
            xy=(row.geometry.centroid.x, row.geometry.centroid.y),
            horizontalalignment="center",
            fontsize=8,
            color="black",
        )

ax.set_title("Dominant first name category by province")
ax.axis("off")
plt.show()

In [None]:
# Bar chart of the number of distinct first names in each sex category
# (`labels` and `values` come from the pie-chart cell above the category map).
# The colours follow the Male / Female / Neutral palette used by that pie chart
# and by the category map, so a category keeps the same colour in every figure.
plt.figure(figsize=(6, 4))
plt.bar(labels, values, color=["skyblue", "lightgreen", "lightgray"])
plt.title("Distribution of first names by sex")
plt.ylabel("Number of first names")
plt.show()

In [None]:
# How many provinces appear among the simple names.
distinct_regions = df_simple["province"].nunique()
print(f">> Number of distinct regions: {distinct_regions}")

# Number of distinct first names in each province, largest first.
unique_per_province = df_simple.groupby("province")["first_name"].nunique()
firstnames_by_region = unique_per_province.sort_values(ascending=False)

print("Number of unique first names per region")
df_firstnames_region = firstnames_by_region.rename_axis("Region").reset_index(
    name="Unique first names"
)
df_firstnames_region.head(12)

In [None]:
# Number of distinct first names per province, drawn as a choropleth.
unique_first_names = (
    df_simple.groupby("province")["first_name"]
    .nunique()
    .reset_index(name="unique_count")
)

# df_simple["province"] is already normalised by clean_province, and gdf carries the
# same cleaned "province" key, so the frames can be joined on it directly.
gdf_diversity = gdf.merge(
    unique_first_names[["province", "unique_count"]],
    on="province",
    how="left",
)

fig, ax = plt.subplots(figsize=(10, 10))
gdf_diversity.plot(
    column="unique_count", cmap="viridis", edgecolor="black", legend=True, ax=ax
)

# Label every province with its name at the centroid of its shape.
for _, row in gdf_diversity.iterrows():
    centroid = row["geometry"].centroid
    ax.text(
        centroid.x,
        centroid.y,
        row["province"],
        horizontalalignment="center",
        fontsize=8,
        color="black",
    )

ax.set_title("First name diversity by province")
ax.axis("off")
plt.show()

In [None]:
# Surnames per region (aggregated): number of records for every (province, surname) pair.
# Computed here so the cell does not depend on a variable that no other cell defines.
surnames_by_region = (
    df_simple.groupby(["province", "surname"]).size().reset_index(name="count")
)

# After sorting by count, the first row kept for each province is its most common surname.
top_surnames_region = (
    surnames_by_region.sort_values("count", ascending=False)
    .drop_duplicates("province")
    .head(12)
)

plt.figure(figsize=(10, 6))
sns.barplot(
    y="province", x="count", data=top_surnames_region, hue="surname", dodge=False
)
plt.title("Most Common Surname per Region")
plt.xlabel("Count")
plt.ylabel("Region")
# Legend is placed outside the axes, to the right of the plot.
plt.legend(title="Surname", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

In [None]:
# Map of the most common surname by province.
top_surnames = (
    df_simple.groupby(["province", "surname"]).size().reset_index(name="count")
)
# Sort so the most frequent surname comes first within each province, then keep that row.
top_surnames = top_surnames.sort_values(["province", "count"], ascending=[True, False])
top_surnames = top_surnames.drop_duplicates(subset="province", keep="first")

# df_simple["province"] is already normalised and gdf (the merged GeoDataFrame)
# carries the same cleaned "province" key, so the join uses it directly.
# Only the surname is merged in; the "count" plotted below is the per-province
# record count already present on gdf.
gdf_named = gdf.merge(top_surnames[["province", "surname"]], on="province", how="left")

fig, ax = plt.subplots(figsize=(12, 12))
gdf_named.plot(column="count", cmap="PuBu", edgecolor="black", legend=True, ax=ax)

for _, row in gdf_named.iterrows():
    # Provinces without any simple name have no label.
    if pd.notna(row["surname"]):
        centroid = row["geometry"].centroid
        ax.text(
            centroid.x,
            centroid.y,
            row["surname"],
            horizontalalignment="center",
            fontsize=8,
            color="black",
        )

ax.set_title("Most common surname by province")
ax.axis("off")
plt.show()

In [None]:
# Most popular middle name per region.
# Number of records for every (province, middle_name) pair, computed here so the
# cell does not depend on a variable that no other cell defines.
middlenames_by_region = (
    df_simple.groupby(["province", "middle_name"]).size().reset_index(name="count")
)

# After sorting by count, the first row kept for each province is its most common middle name.
top_middlename_region = (
    middlenames_by_region.sort_values("count", ascending=False)
    .drop_duplicates("province")
    .head(12)
)

plt.figure(figsize=(10, 6))
sns.barplot(
    y="province", x="count", data=top_middlename_region, hue="middle_name", dodge=False
)
plt.title("Most Common Middle Name per Region")
plt.xlabel("Count")
plt.ylabel("Region")
# Legend is placed outside the axes, to the right of the plot.
plt.legend(title="Middle Name", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

In [None]:
# Map of the most common middle name by province.
# The result is kept in its own variable so the surname table built by the
# surname map cell is not overwritten with middle-name data.
top_middle_names = (
    df_simple.groupby(["province", "middle_name"]).size().reset_index(name="count")
)
# Sort so the most frequent middle name comes first within each province, then keep that row.
top_middle_names = top_middle_names.sort_values(
    ["province", "count"], ascending=[True, False]
)
top_middle_names = top_middle_names.drop_duplicates(subset="province", keep="first")

# df_simple["province"] is already normalised and gdf (the merged GeoDataFrame)
# carries the same cleaned "province" key, so the join uses it directly.
# Only the middle name is merged in; the "count" plotted below is the
# per-province record count already present on gdf.
gdf_named = gdf.merge(
    top_middle_names[["province", "middle_name"]], on="province", how="left"
)

fig, ax = plt.subplots(figsize=(12, 12))
gdf_named.plot(column="count", cmap="PuBu", edgecolor="black", legend=True, ax=ax)

for _, row in gdf_named.iterrows():
    # Provinces without any middle name on record have no label.
    if pd.notna(row["middle_name"]):
        centroid = row["geometry"].centroid
        ax.text(
            centroid.x,
            centroid.y,
            row["middle_name"],
            horizontalalignment="center",
            fontsize=8,
            color="black",
        )

ax.set_title("Most common middle name by province")
ax.axis("off")
plt.show()