In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import re
import numpy as np


# Directory holding the tagged CSV inputs. Resolved from the current user's
# home directory (as the word-count cell of this notebook also does) instead
# of one account's absolute path, so the notebook runs for any user who keeps
# the files on their Desktop.
base = Path.home() / "Desktop"

# Category name -> CSV to read. Each CSV is read with pd.read_csv and must
# provide a "parsed_years" column (consumed by count_mentions).
FILES = {
    "sports": base / "filename_dates_parsed_history_of_sports_tagged.csv",
    "ideologies": base / "filename_dates_parsed_history_of_ideologies_tagged.csv",
    "objects": base / "filename_dates_parsed_historical_objects_tagged.csv",
}

# (start_year, end_year) windows, both ends inclusive. The list order is the
# left-to-right order of the groups on the chart's x axis, so it is kept as
# written rather than sorted chronologically.
TIME_WINDOWS = [
    (1917, 1920),
    (1939, 1945),
    (1989, 1992),
    (1861, 1865),
    (1789, 1799),
]

def parse_years(cell):
    """Extract every 3- or 4-digit year found in a cell's text.

    Parameters
    ----------
    cell : object
        Raw cell value. It is converted with ``str()`` before scanning, so
        numeric cells work as well as strings.

    Returns
    -------
    list[int]
        Years in order of appearance; an empty list when ``cell`` is missing
        (``pd.isna``) or contains no year-like token.

    Notes
    -----
    * A ``-`` is treated as a sign only when it does not directly follow a
      digit, so a range such as ``"1914-1918"`` yields ``[1914, 1918]``
      instead of ``[1914, -1918]``. A standalone ``-500`` stays negative.
    * Digit runs longer than four digits are skipped entirely rather than
      being cut into a bogus four-digit "year".
    """
    if pd.isna(cell):
        return []
    # (?<!\d) / (?!\d): the token must be a complete digit run, and a hyphen
    # glued to a preceding digit cannot start a match.
    year_tokens = re.findall(r"(?<!\d)-?\d{3,4}(?!\d)", str(cell))
    return [int(token) for token in year_tokens]

def count_mentions(df, start, end):
    """Count rows and year mentions that fall inside ``[start, end]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"parsed_years"`` column; each cell is decoded with
        ``parse_years``.
    start, end : int
        Bounds of the window, both inclusive.

    Returns
    -------
    tuple[int, int]
        ``(langs, counts)`` where ``langs`` is the number of rows having at
        least one year inside the window and ``counts`` is the total number
        of in-window years over all rows. (The name ``langs`` suggests one
        row per language -- TODO confirm against the CSV layout.)
    """
    langs = 0
    counts = 0
    # Walk the column itself: iterrows() would build a Series for every row
    # (and may coerce dtypes across columns) just to read this one field.
    for cell in df["parsed_years"]:
        hits = sum(1 for year in parse_years(cell) if start <= year <= end)
        if hits:
            langs += 1
            counts += hits
    return langs, counts

# --- Build one summary record per (category, time window) -------------------
summary = []
for cat, path in FILES.items():
    df = pd.read_csv(path)
    for window in TIME_WINDOWS:
        start, end = window
        langs, mentions = count_mentions(df, start, end)
        summary.append(
            dict(category=cat, start=start, end=end, langs=langs, mentions=mentions)
        )

summary_df = pd.DataFrame(summary)

# --- Grouped bar chart -------------------------------------------------------
# Bar height is the number of mentions; bar width is scaled by the number of
# rows ("langs") that contributed at least one mention.
fig, ax = plt.subplots(figsize=(14, 7))

colors = {"sports": "skyblue", "ideologies": "salmon", "objects": "lightgreen"}
categories = list(FILES.keys())
n_cat = len(categories)
bar_width_factor = 0.001    # width units per counted row
bar_spacing = 0.3           # horizontal distance between bars of one group
x = np.arange(len(TIME_WINDOWS))

# Gap between the top of a bar and its annotation: 1% of the tallest bar.
label_offset = max(summary_df["mentions"]) * 0.01

for i, cat in enumerate(categories):
    subset = summary_df[summary_df["category"] == cat].reset_index(drop=True)
    # Centre the bars of each group around the tick position.
    bar_positions = x + (i - (n_cat - 1) / 2) * bar_spacing
    bars = ax.bar(
        bar_positions,
        subset["mentions"],
        width=subset["langs"] * bar_width_factor,
        label=cat,
        color=colors[cat],
        alpha=0.7,
    )
    # Annotate every bar with the two numbers it encodes.
    for bar, langs, mentions in zip(bars, subset["langs"], subset["mentions"]):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + label_offset,
            f"{langs} langs\n{mentions} mentions",
            ha="center", va="bottom", fontsize=8, rotation=0
        )

# One tick per time window, labelled "start-end".
labels = [f"{start}-{end}" for (start, end) in TIME_WINDOWS]
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=45, ha="right")

ax.set_ylabel("Total Mentions (height)")
ax.set_title("Mentions across Time Windows by Category (Side-by-Side)")
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Word-count CSVs, one per category, on the current user's Desktop. The
# rendered charts are written back to the same folder.
desktop = Path.home() / "Desktop"
files = {
    "sports": desktop / "filename_wordcounts_sports.csv",
    "objects": desktop / "filename_wordcounts_objects.csv",
    "ideologies": desktop / "filename_wordcounts_ideologies.csv",
}

# Load each CSV and keep only the columns needed for plotting, tagged with
# the category they came from.
data = []
for category, path in files.items():
    df = pd.read_csv(path)
    # Strip only a trailing ".txt": a plain substring replace would also
    # remove ".txt" occurring in the middle of a filename.
    df["language"] = df["filename"].str.replace(r"\.txt$", "", regex=True)
    df["category"] = category
    data.append(df[["language", "english_word_count", "category"]])

all_df = pd.concat(data, ignore_index=True)

# Zero counts are mapped to NaN first so log10 yields NaN (no bar drawn)
# instead of -inf.
all_df["log_wordcount"] = np.log10(all_df["english_word_count"].replace(0, np.nan))

# One horizontal bar chart per category, saved as a PNG.
for category, group in all_df.groupby("category"):
    group_sorted = group.sort_values("log_wordcount", ascending=True)
    # Grow the figure with the number of rows so labels stay legible.
    fig_h = max(6, len(group_sorted) * 0.25)
    figure = plt.figure(figsize=(10, fig_h))

    plt.barh(
        group_sorted["language"],
        group_sorted["log_wordcount"],
        color="steelblue",
        edgecolor="black"
    )

    plt.xlabel("Log10(Word Count)", fontsize=12)
    plt.ylabel("Language", fontsize=12)
    plt.title(f"Word Counts per Language ({category.capitalize()})", fontsize=14)

    plt.tight_layout()

    out_path = desktop / f"barh_wordcount_{category}.png"
    plt.savefig(out_path, dpi=150)
    # Close this specific figure so figures do not accumulate across the loop.
    plt.close(figure)
    print(f"[OK] 已保存 {out_path}")