# Setup

In [2]:
import os
import sys

# Extend the import path *before* importing the project package: the original
# cell appended ".." only after the import had already executed, so the path
# tweak could never help that import on a fresh kernel.
# NOTE(review): presumably `drc_names_corpus` lives one directory above this
# notebook -- confirm, or drop the path tweak if the package is installed.
sys.path.append(os.path.abspath(".."))

from drc_names_corpus.core import get_report_path  # noqa: E402

In [3]:
import matplotlib

# Pick the Qt5Agg backend here, ahead of the pyplot import in a later cell.
matplotlib.use("Qt5Agg")

# Global rc settings: serif fonts, no TeX rendering of on-screen text, and
# pdflatex as the TeX system used by the PGF exporter.
matplotlib.rcParams["pgf.texsystem"] = "pdflatex"
matplotlib.rcParams["font.family"] = "serif"
matplotlib.rcParams["text.usetex"] = False
matplotlib.rcParams["pgf.rcfonts"] = False

In [4]:
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [5]:
# `scienceplots` is never referenced by name below; it appears to be imported
# for its side effect of making the "science"/"ieee" styles available
# (TODO confirm against the installed scienceplots version).
import scienceplots  # noqa: F401

_STYLE_SHEETS = ["science", "ieee", "no-latex", "grid"]
plt.style.use(_STYLE_SHEETS)

In [6]:
# Figure geometry shared by every plot below (passed to `figsize`, i.e. inches).
# NOTE(review): 3.31314 looks like a LaTeX column width -- confirm its source.
textwidth = 3.31314
aspect_ratio = 6 / 8  # height / width
scale = 1.0  # fraction of `textwidth` each figure should occupy
width = scale * textwidth
height = aspect_ratio * width

# Name Analysis - Reports

## Token count distribution

In [7]:
# Load the token-count report and make sure the bucket column is an integer so
# that the bars are ordered numerically rather than lexicographically.
df = pl.read_csv(
    get_report_path("name_analysis", "token_count_distribution.csv"),
    null_values=[""],
).with_columns(
    pl.col("token_count").cast(pl.Int64)
)

df = df.sort("token_count")
pdf = df.to_pandas()

# Explicit figure/axes handles so the exact figure drawn here is the one saved.
fig, ax = plt.subplots(figsize=(width, height))
sns.barplot(
    data=pdf,
    x="token_count",
    y="count",
    color="#4C72B0",
    ax=ax,
)

ax.set_yscale("log")
ax.set_xlabel("Token count per name")
ax.set_ylabel("Frequency")

# Save BEFORE showing: once `plt.show()` returns the figure may already have
# been closed, and a later `plt.savefig` would write an empty canvas.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/token_count_distribution.pgf", backend="pgf")
plt.show()

## Name length distribution

In [8]:
# Load the name-length report; cast the length column to an integer so the
# sort below is numeric.
df = pl.read_csv(
    get_report_path("name_analysis", "name_length_distribution.csv"),
    null_values=[""],
).with_columns(
    pl.col("char_len").cast(pl.Int64)
)

df = df.sort("char_len")
pdf = df.to_pandas()

# Explicit figure/axes handles so the exact figure drawn here is the one saved.
fig, ax = plt.subplots(figsize=(width, height))
sns.barplot(
    data=pdf,
    x="char_len",
    y="count",
    color="#4C72B0",
    ax=ax,
)

ax.set_yscale("log")
ax.set_xlabel("Number of Characters per Name")
ax.set_ylabel("Count")

# Bars sit at positions 0..len(pdf)-1; label only every fifth bar with the
# `char_len` value found at that row to keep the axis readable.
ticks = np.arange(0, len(pdf), 5)
ax.set_xticks(ticks)
ax.set_xticklabels(pdf["char_len"].iloc[ticks], ha="right")

# Save BEFORE showing: once `plt.show()` returns the figure may already have
# been closed, and a later `plt.savefig` would write an empty canvas.
# `backend="pgf"` is passed explicitly, matching the other export cells.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/name_length_distribution.pgf", backend="pgf")
plt.show()

## Name diversity by province

In [9]:
# Load the per-province diversity report, keeping the province label and the
# two metric columns as floats.
df = pl.read_csv(
    get_report_path("name_analysis", "diversity_by_province.csv")
).select(
    pl.col("province"),
    pl.col("shannon").cast(pl.Float64),
    pl.col("effective_names").cast(pl.Float64),
)
pdf = df.to_pandas()

# Explicit figure/axes handles so the exact figure drawn here is the one saved.
fig, ax = plt.subplots(figsize=(width, height))
sns.barplot(
    # Rank provinces from highest to lowest `effective_names`.
    data=pdf.sort_values("effective_names", ascending=False),
    x="province",
    y="effective_names",
    ax=ax,
)

ax.set_yscale("log")
# Rotate the tick labels so neighbouring province names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_ylabel("Count")
ax.set_xlabel("Province")

# Save BEFORE showing: once `plt.show()` returns the figure may already have
# been closed, and a later `plt.savefig` would write an empty canvas.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/diversity_by_province.pgf", backend="pgf")
plt.show()

## Name diversity by year

In [None]:
# Load the per-year diversity report, keeping the year and the two metric
# columns as floats.
df = pl.read_csv(
    get_report_path("name_analysis", "diversity_by_year.csv")
).select(
    pl.col("year"),
    pl.col("shannon").cast(pl.Float64),
    pl.col("effective_names").cast(pl.Float64),
)
pdf = df.to_pandas()

# Explicit figure/axes handles so the exact figure drawn here is the one saved.
fig, ax = plt.subplots(figsize=(width, height))
sns.barplot(
    # Keep the time axis chronological; sorting by value (as the province plot
    # does) would scramble the years whenever they are not numeric.
    data=pdf.sort_values("year"),
    x="year",
    y="effective_names",
    ax=ax,
)

ax.set_yscale("log")
# Rotate the tick labels so neighbouring years do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_ylabel("Count")
# The x axis shows years; the original label "Province" was a copy-paste slip.
ax.set_xlabel("Year")

# Save BEFORE showing: once `plt.show()` returns the figure may already have
# been closed, and a later `plt.savefig` would write an empty canvas.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/diversity_by_year.pgf", backend="pgf")
plt.show()

: 