<a href="https://colab.research.google.com/github/ashikita/qir-toolbox/blob/main/wordcloud/make-wordcloud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""
Word-cloud background for the repository's 20th-anniversary site: "complete edition" (tuned for Jupyter/Colab).

This script does all of the following in one pass:
 1) Automatically installs/detects a Japanese font (Colab-aware) and selects it
 2) Morphological analysis / tokenization of mixed Japanese-English titles (input.txt)
 3) Colouring driven by a design strategy (palette / colormap / custom)
 4) A layout recommended for banner backgrounds (mostly horizontal text, tuned word count, controlled contrast)
 5) Post-processing for background assets: transparent PNG, edge fade, safe text band

-- Design strategy (tips for background use / implementation points) --
- A background looks best when it is restrained: lower saturation/contrast a little
  and limit the number of colours -> PALETTE / COLOR_MODE
- Keep it orderly with mostly horizontal text: set prefer_horizontal high (close to 1.0)
- Keep the foreground text you want people to read legible:
    - lower the opacity (ALPHA_ON_SAVE)
    - fade the edges (APPLY_EDGE_FADE)
    - lay down a safe text band (ADD_SAFE_TITLE_BAND)
- Choose the size for the use case:
    - hero: 2400x1200 / top banner: 2400x800 / thin strip: 1600x300

Usage:
- Put a UTF-8 input.txt (one title per line) in the same directory and run.
- On Colab, you are prompted to upload input.txt when it is missing.

Output:
- wordcloud.png               ... basic version (with or without a background colour)
- wordcloud_transparent.png   ... translucent version (TRANSPARENT_BG=True or ALPHA_ON_SAVE applied)
- wordcloud_faded.png         ... edge-faded version (blends better into the page)
- wordcloud_with_band.png     ... with a safe text band (meant to carry a large heading on top)

Dependencies:
- wordcloud, janome (optional/recommended), tinysegmenter (optional), pillow, numpy
- On Colab the Noto CJK fonts are installed automatically

"""

# ============================================================
# 設定ブロック（プロジェクトに合わせて調整してください）
# ============================================================

# Input / output
INPUT_FILE = "input.txt"
OUTPUT_PNG = "wordcloud.png"

# Canvas size (overwritten by main() when PRESET below names a known preset)
WC_WIDTH = 2400
WC_HEIGHT = 800

# Preset: None / "hero" / "banner" / "strip"
#  - "hero"   = 2400 x 1200
#  - "banner" = 2400 x 800
#  - "strip"  = 1600 x 300
PRESET = "banner"

# Background settings
TRANSPARENT_BG = False           # transparent background (True recommended: easy to layer as a background asset)
BACKGROUND_COLOR = "white"       # background colour used when not transparent
ALPHA_ON_SAVE = 200              # 0-255 (150-220 recommended for a light background asset); None disables it

# Layout / word count
MAX_WORDS = 200                  # fewer words look more refined for backgrounds (150-250 recommended)
RANDOM_STATE = 42
PREFER_HORIZONTAL = 0.95         # favour horizontal text (high values recommended for backgrounds)
MIN_TOKEN_LEN = 2
MIN_FREQUENCY = 2
NGRAM_N = 1                      # 1: unigrams, 2: bigrams, 3: trigrams

# Colouring mode: "palette" / "colormap" / "custom"
COLOR_MODE = "palette"

# Colormap name (used when COLOR_MODE="colormap")
COLORMAP_NAME = "Set3"

# Palette (suggested examples for background assets: gold / blue / gray)
# Gold tones (anniversary feel)
# PALETTE = ["#C9A86A", "#D3BC8D", "#E6D6B8", "#F4EEDD"]
# Blue tones (academic / corporate)
# PALETTE = ["#0D47A1", "#1565C0", "#1E88E5", "#64B5F6"]
# Gray (refined monotone)
# PALETTE = ["#BDBDBD", "#D4D4D4", "#E0E0E0", "#EEEEEE"]
PALETTE = ["#C9A86A", "#D3BC8D", "#E6D6B8", "#F4EEDD"]  # default: gold tones

# Edge fade (blend the image into its surroundings)
APPLY_EDGE_FADE = True
EDGE_FADE_STRENGTH = 0.6         # 0-1 (0.5-0.8 recommended)
EDGE_FADE_SHAPE = "radial"       # "radial" (circular) / reserved for future extension

# Safe text band (translucent band that a heading can be placed on)
ADD_SAFE_TITLE_BAND = True
BAND_HEIGHT_PX = 240
BAND_COLOR = "#FFFFFF"
BAND_ALPHA = 160
BAND_POSITION = "center"         # "top" / "center" / "bottom"

# Font ("auto" recommended: searches the environment for a suitable Japanese font)
FONT_PATH = "auto"
FONT_PREFERRED_WEIGHT = "Light"  # "Light" / "Regular" (a thin weight is recommended for backgrounds)

# Extra stopwords (remove project-specific generic words as noise).
# NOTE: with no entries this `{}` literal is an empty dict, not a set; it is
# wrapped in set() where STOPWORDS is built, so either form works there.
EXTRA_STOPWORDS = {
    # e.g. "研究", "結果", "影響", "study", "result"
}

# ============================================================
# 依存と環境（Colab自動インストール）
# ============================================================

import os
import re
import sys
import unicodedata
import subprocess
from collections import Counter
from itertools import tee

# Colab detection: the google.colab package is importable only inside Colab,
# so a successful import is used as the environment flag.
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

def _ensure(cmd_list):
    """Run a setup command on a best-effort basis.

    The command's output is discarded. Failures never propagate, because the
    script must keep working on local Jupyter installs where the command may
    be missing or not permitted.

    Args:
        cmd_list: Argument vector passed to ``subprocess.run``.

    Returns:
        True when the command ran and exited with status 0, False when it
        could not be started or exited with a non-zero status.
    """
    try:
        subprocess.run(
            cmd_list,
            check=True,
            # DEVNULL instead of PIPE: the output is never read, so there is
            # no reason to buffer it in memory.
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # Missing executable, permission problem, bad arguments or a non-zero
        # exit status: report the failure instead of raising.
        return False
    return True

# On Colab, install the Japanese font package and the Python libraries automatically.
# Every shell command redirects its output to /dev/null and ends with `|| true`,
# so a failing step is silent and does not stop the notebook.
if IN_COLAB:
    _ensure(["bash", "-lc", "apt-get -y update >/dev/null 2>&1 || true"])
    _ensure(["bash", "-lc", "apt-get -y install fonts-noto-cjk >/dev/null 2>&1 || true"])
    _ensure(["bash", "-lc", "pip -q install wordcloud janome tinysegmenter >/dev/null 2>&1 || true"])

# ライブラリ読み込み
import numpy as np
from PIL import Image, ImageDraw
import matplotlib as mpl
import matplotlib.font_manager as fm
from wordcloud import WordCloud

# Morphological analysis (Japanese): Janome is optional; _janome stays None
# when the package cannot be imported or the tokenizer cannot be constructed.
try:
    from janome.tokenizer import Tokenizer as JanomeTokenizer
    _janome = JanomeTokenizer(wakati=False)
except Exception:
    _janome = None

# TinySegmenter is the second optional tokenizer; _tinyseg is None when unavailable.
try:
    from tinysegmenter import TinySegmenter
    _tinyseg = TinySegmenter()
except Exception:
    _tinyseg = None

# Colab's simple file-upload helper; _colab_files is None outside Colab.
try:
    from google.colab import files  # type: ignore
    _colab_files = files
except Exception:
    _colab_files = None


# ============================================================
# ユーティリティ（正規化・トークン化・n-gram）
# ============================================================

# One character in U+3040-U+30FF (hiragana/katakana), U+3400-U+4DBF or U+4E00-U+9FFF (CJK ideographs).
_CJK_RE = re.compile(r"[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]")
# A token made only of ASCII digits.
_NUM_RE = re.compile(r"^[0-9]+$")
# A token made only of underscores and non-word characters (punctuation, symbols).
_SYM_RE = re.compile(r"^[_\W]+$", flags=re.UNICODE)
# An English word: an ASCII letter followed by at least one more letter or apostrophe
# (so single-letter words never match).
_EN_TOKEN_RE = re.compile(r"[A-Za-z][A-Za-z']+")

def normalize_text(text: str) -> str:
    """Return *text* NFKC-normalized with whitespace collapsed.

    Control characters and zero-width / bidirectional formatting characters
    are replaced by a space, runs of whitespace are squeezed to a single
    space, and leading/trailing whitespace is removed.
    """
    folded = unicodedata.normalize("NFKC", text)
    without_controls = re.sub(
        r"[\u0000-\u001F\u007F\u200B-\u200F\u202A-\u202E]", " ", folded
    )
    return re.sub(r"\s+", " ", without_controls).strip()

def contains_cjk(text: str) -> bool:
    """Return True when *text* has at least one character matched by ``_CJK_RE``."""
    return _CJK_RE.search(text) is not None

def english_tokens(text: str):
    """Return the lowercased English words found in *text* by ``_EN_TOKEN_RE``."""
    return _EN_TOKEN_RE.findall(text.lower())

def japanese_tokens(text: str):
    """Tokenize Japanese *text* with the best tokenizer that is available.

    Order of preference:
      1. Janome: keeps nouns, verbs and adjectives, using the base form when
         Janome provides one and the surface form otherwise.
      2. TinySegmenter: returns its segmentation unchanged.
      3. Regex: extracts runs of kanji/kana characters.
    """
    if _janome is not None:
        content_pos = ("名詞", "動詞", "形容詞")
        collected = []
        for morpheme in _janome.tokenize(text):
            # Only the top-level part of speech (first comma-separated field) matters.
            if morpheme.part_of_speech.split(",")[0] not in content_pos:
                continue
            if morpheme.base_form == "*":
                collected.append(morpheme.surface)
            else:
                collected.append(morpheme.base_form)
        return collected

    if _tinyseg is not None:
        return _tinyseg.tokenize(text)

    # Last resort: contiguous runs of kanji / hiragana / katakana.
    return re.findall(r"[一-龥々〆ヵヶぁ-んァ-ヴー]+", text)

def make_ngrams(tokens, n=2):
    """Join every run of *n* consecutive tokens with an underscore.

    For ``n <= 1`` the input is returned unchanged. When there are fewer than
    *n* tokens the result is an empty list.
    """
    if n <= 1:
        return tokens
    items = list(tokens)
    # A negative upper bound yields an empty range, hence an empty result.
    window_starts = range(len(items) - n + 1)
    return ["_".join(items[start:start + n]) for start in window_starts]

def load_lines(path: str):
    """Yield the non-empty, whitespace-stripped lines of a UTF-8 text file.

    The file is decoded as ``utf-8-sig`` so that a byte-order mark written by
    some editors is dropped instead of being glued to the first title
    (``str.strip`` does not remove U+FEFF). Undecodable bytes are replaced
    rather than raising.

    Args:
        path: Path of the text file to read.

    Yields:
        Each line with surrounding whitespace removed; blank lines are skipped.
    """
    with open(path, "r", encoding="utf-8-sig", errors="replace") as f:
        for raw in f:
            line = raw.strip()
            if line:
                yield line


# ============================================================
# ストップワード（英・日）
# ============================================================

# English stopwords: function words plus generic academic-title vocabulary
# ("study", "analysis", "results", ...). Entries are lowercase.
# A few words are listed twice, which is harmless in a set literal.
EN_STOP = {
    "a","an","the","and","or","but","if","then","else","when","while","for","of","in","on","at",
    "to","from","by","with","without","about","across","after","before","during","within","between",
    "is","am","are","was","were","be","been","being","do","does","did","doing","have","has","had",
    "having","can","could","may","might","must","should","would","will","shall",
    "this","that","these","those","it","its","as","not","no","yes","we","you","he","she","they","them",
    "their","there","here","such","than","via","per","etc","i","our","us","your","yours","his","her",
    "into","out","over","under","again","new","based","using","use","used","study","analysis","results",
    "result","method","methods","approach","approaches","paper","article","report","case","cases",
    "effect","effects","evidence","evaluation","overview","review","reviews","systematic","meta",
    "toward","towards","impact","impacts","improving","improvement","improve",
    "model","models","modelling","modeling","data","dataset","datasets","large","small","novel",
    "among","across","more","most","less","least"
}

# Japanese stopwords: function words, demonstratives, conjunctions, generic
# academic-title vocabulary, kanji numerals and date counters.
JA_STOP = {
    "こと","もの","ところ","ため","よう","られ","れる","ない","ある","いる","おり","なる","する","できる",
    "的","的な","的に","における","において","及び","および","ならびに",
    "これ","それ","あれ","どれ","ここ","そこ","あそこ","どこ","この","その","あの","どの",
    "そして","また","さらに","しかし","一方","など","等","等の","等を","等に","等で","等と",
    "例","場合","結果","研究","検討","報告","考察","課題","手法","方法","比較",
    "影響","実験","解析","分析","評価","事例","概要","総説","序論","序説","序","序文","序章",
    "新た","新しい","新規","一","二","三","四","五","六","七","八","九","十","第","年","月","日",
}

# Union of both language lists and the project-specific additions.
STOPWORDS = set(EN_STOP) | set(JA_STOP) | set(EXTRA_STOPWORDS)


# ============================================================
# フォント検出（日本語）："Light" を優先して背景向けの軽やかさを出す
# ============================================================

def find_japanese_font(preferred_weight: str = "Light") -> str | None:
    """Locate a font file that can render Japanese text.

    Search order:
      1. An explicit ``FONT_PATH`` (anything other than None, "" or "auto").
         It is returned when the file exists; otherwise None is returned and
         no automatic search takes place.
      2. Well-known Noto Sans CJK install paths, trying files whose name
         contains *preferred_weight* first.
      3. Fonts registered with matplotlib's font manager whose family name
         matches a known Japanese family, ordered by how well the file name
         matches the preferred weight.
      4. System font files whose file name contains a known family name.

    Args:
        preferred_weight: "Light" or "Regular". Regular-like weights are
            always accepted as a second choice.

    Returns:
        Path of the chosen TTC/OTF/TTF file, or None when nothing was found.
    """
    if isinstance(FONT_PATH, str) and FONT_PATH not in (None, "", "auto"):
        return FONT_PATH if os.path.exists(FONT_PATH) else None

    # Typical Colab / Debian locations of Noto Sans CJK.
    common_paths = [
        "/usr/share/fonts/opentype/noto/NotoSansCJKjp-Light.otf",
        "/usr/share/fonts/opentype/noto/NotoSansCJKjp-Regular.otf",
        "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
    ]
    # Honour the requested weight: files naming it are tried first. The sort
    # is stable, so the remaining files keep their listed order.
    wanted = str(preferred_weight).lower()
    common_paths.sort(key=lambda p: wanted not in os.path.basename(p).lower())
    for p in common_paths:
        if os.path.exists(p):
            return p

    # Family names to look for through matplotlib's font manager.
    candidates = [
        "Noto Sans CJK JP", "Noto Sans JP", "Source Han Sans", "Source Han Serif",
        "Noto Serif CJK JP", "IPAGothic", "IPAexGothic", "IPAMincho", "IPAexMincho",
        "Yu Gothic", "YuGothic", "Meiryo", "MS Gothic", "MS Mincho",
        "Hiragino Sans", "Hiragino Kaku Gothic ProN", "Hiragino Mincho ProN",
        "ヒラギノ角ゴ ProN W3"
    ]
    weight_keys = {
        "Light": ["light", "demilight", "ultralight", "extra light"],
        "Regular": ["regular", "book", "normal"]
    }
    weight_list = weight_keys.get(preferred_weight, []) + weight_keys["Regular"]

    # Narrow the registered fonts down to the candidate families.
    fonts = getattr(fm, "fontManager", None)
    ttflist = fonts.ttflist if fonts else []
    family_keys = [c.lower() for c in candidates]
    pool = [
        f for f in ttflist
        if any(key in getattr(f, "name", "").lower() for key in family_keys)
    ]

    def score(fi):
        # Lower is better: position of the first weight keyword found in the
        # file name; 999 when the file name names no known weight.
        fname = os.path.basename(getattr(fi, "fname", "")).lower()
        for i, key in enumerate(weight_list):
            if key in fname:
                return i
        return 999

    pool.sort(key=score)
    for fi in pool:
        path = getattr(fi, "fname", "")
        if path and os.path.exists(path):
            return path

    # File-name based fallback over every system font file.
    sys_fonts = fm.findSystemFonts() if hasattr(fm, "findSystemFonts") else []
    for c in candidates:
        needle = c.replace(" ", "").lower()
        for p in sys_fonts:
            if needle not in os.path.basename(p).replace(" ", "").lower():
                continue
            # Keep searching when a listed file has disappeared instead of
            # giving up on the first stale entry.
            if os.path.exists(p):
                return p
    return None


# ============================================================
# トークン化・頻度算出
# ============================================================

def preprocess_and_tokenize(line: str):
    """Normalize one title and return its cleaned English and Japanese tokens.

    English words come from ``english_tokens`` (lowercased). When the line
    contains CJK characters the Japanese tokenizer is run over the whole
    line as well. Tokens from that pass that are pure ASCII are discarded:
    ASCII words are already produced, lowercased, by the English pass, and
    keeping both copies would count an English word in a mixed line twice
    (e.g. "deep" and "Deep").

    Tokens are then dropped when they are empty, all digits, all
    symbols/underscores, or shorter than ``MIN_TOKEN_LEN``.

    Args:
        line: One raw input line (one title).

    Returns:
        List of tokens, English tokens first, then Japanese tokens.
    """
    text = normalize_text(line)
    en = english_tokens(text)
    if contains_cjk(text):
        ja = [tok for tok in japanese_tokens(text) if not tok.isascii()]
    else:
        ja = []

    clean = []
    for tok in en + ja:
        tok = tok.strip()
        if not tok:
            continue
        if _NUM_RE.fullmatch(tok) or _SYM_RE.fullmatch(tok):
            continue
        if len(tok) < MIN_TOKEN_LEN:
            continue
        clean.append(tok)
    return clean


# ============================================================
# 配色 color_func：palette / colormap / custom（日本語=青, 英語=橙 など）
# ============================================================

def make_palette_color_func(palette):
    """Build a WordCloud ``color_func`` that maps each word to a fixed palette entry.

    The index is derived from a CRC32 of the word's UTF-8 bytes. The built-in
    ``hash`` is salted per interpreter process for strings, so it would give
    the same word a different colour on every run.

    Args:
        palette: Non-empty sequence of colour strings.

    Returns:
        A callable with the signature WordCloud expects for ``color_func``.

    Raises:
        ValueError: If *palette* is empty.
    """
    import zlib  # local import: the file's import block does not provide it

    colors = tuple(palette)
    if not colors:
        raise ValueError("palette must contain at least one color")

    def _cf(word, font_size, position, orientation, random_state=None, **kwargs):
        # Stable assignment: the same word always gets the same colour.
        idx = zlib.crc32(word.encode("utf-8")) % len(colors)
        return colors[idx]
    return _cf

def make_colormap_color_func(cmap_name):
    """Build a WordCloud ``color_func`` that picks a fixed colormap colour per word.

    The colormap is looked up through the ``matplotlib.colormaps`` registry
    when it exists, because ``matplotlib.cm.get_cmap`` was removed in
    matplotlib 3.9. Older releases without the registry fall back to
    ``get_cmap``. Note that the registry raises KeyError for an unknown name,
    where ``get_cmap`` raised ValueError.

    The position along the colormap comes from a CRC32 of the word rather
    than the built-in ``hash``, which is salted per process for strings and
    would change a word's colour between runs.

    Args:
        cmap_name: Name of a registered colormap, or a colormap object
            (anything callable is used as-is).

    Returns:
        A callable with the signature WordCloud expects for ``color_func``;
        it returns an ``"rgb(r, g, b)"`` string.
    """
    import zlib  # local import: the file's import block does not provide it

    if callable(cmap_name):
        cmap = cmap_name
    else:
        registry = getattr(mpl, "colormaps", None)
        cmap = registry[cmap_name] if registry is not None else mpl.cm.get_cmap(cmap_name)

    def _cf(word, font_size, position, orientation, random_state=None, **kwargs):
        bucket = zlib.crc32(word.encode("utf-8")) % 10_000
        color = cmap(bucket / 10_000)
        # The colormap returns RGBA floats in 0..1; alpha is ignored.
        r_, g_, b_ = (int(255 * x) for x in color[:3])
        return f"rgb({r_}, {g_}, {b_})"
    return _cf

def make_custom_color_func(freq_counter):
    """Build a ``color_func`` that colours by script and shades by frequency.

    Words containing a CJK character use the blue palette and all others the
    orange palette. Each palette runs from dark to light, and the more
    frequent a word is, the darker the entry it receives.
    """
    blues = ["#1565C0", "#1E88E5", "#42A5F5", "#90CAF9"]
    oranges = ["#E65100", "#FB8C00", "#FFB74D", "#FFE0B2"]
    if freq_counter:
        top_count = max(freq_counter.values())
    else:
        top_count = 1

    def _cf(word, font_size, position, orientation, random_state=None, **kwargs):
        if _CJK_RE.search(word):
            shades = blues
        else:
            shades = oranges
        last = len(shades) - 1
        count = freq_counter.get(word, 1)
        # Relative frequency 1.0 -> index 0 (darkest); rarer words go lighter.
        level = int((1 - (count / top_count)) * last)
        if level > last:
            level = last
        if level < 0:
            level = 0
        return shades[level]
    return _cf


# ============================================================
# 後処理（透明化、エッジフェード、セーフテキスト帯）
# ============================================================

def ensure_rgba(img: Image.Image) -> Image.Image:
    """Return *img* in RGBA mode; an image that is already RGBA is returned as-is."""
    if img.mode == "RGBA":
        return img
    return img.convert("RGBA")

def apply_uniform_alpha(img: Image.Image, alpha: int) -> Image.Image:
    """Scale the alpha channel of the whole image by ``alpha / 255``.

    Lowers the opacity uniformly, which suits background assets. The colour
    channels are left untouched.
    """
    rgba = ensure_rgba(img)
    red, green, blue, opacity = rgba.split()
    factor = alpha / 255.0
    scaled = opacity.point(lambda v: v * factor)
    return Image.merge("RGBA", (red, green, blue, scaled))

def apply_edge_fade(img: Image.Image, strength: float = 0.6, shape: str = "radial") -> Image.Image:
    """Fade the image out toward its borders so it blends into the page.

    A radial mask is computed from the normalized distance to the image
    centre (0 at the centre, clipped to 1 at and beyond the edge midpoints).
    The mask equals ``1 - distance**2 * strength`` and is combined with the
    existing alpha channel by taking the smaller value per pixel.

    Args:
        img: Source image; converted to RGBA when necessary.
        strength: Fade amount, 0 to 1.
        shape: Accepted for future extension; the value is not read, and the
            mask is always radial.
    """
    rgba = ensure_rgba(img)
    width, height = rgba.size
    rows, cols = np.ogrid[:height, :width]
    center_x, center_y = width / 2.0, height / 2.0

    # Normalized elliptical distance: 0 at the centre, 1 at the edge midpoints.
    radius = np.sqrt(
        ((cols - center_x) / (width / 2))**2 + ((rows - center_y) / (height / 2))**2
    )
    radius = np.clip(radius, 0, 1)

    exponent = 2.0
    keep = np.clip(1.0 - (radius ** exponent) * strength, 0, 1)
    fade_alpha = (keep * 255).astype(np.uint8)

    red, green, blue, existing = rgba.split()
    # Never raise opacity: keep whichever alpha is lower.
    combined = np.minimum(np.array(existing), fade_alpha)
    faded_channel = Image.fromarray(combined, mode="L")
    return Image.merge("RGBA", (red, green, blue, faded_channel))

def add_safe_title_band(img: Image.Image, band_h: int = 240, color: str = "#FFFFFF",
                        alpha: int = 160, position: str = "center") -> Image.Image:
    """Composite a translucent horizontal band over the image for foreground text.

    Args:
        img: Source image; converted to RGBA when necessary.
        band_h: Band height in pixels.
        color: Band colour. Any colour string Pillow understands ("#RRGGBB",
            "#RGB", a colour name, "rgb(...)") is accepted, as is a bare
            "RRGGBB" without the leading "#". Any alpha component in the
            colour string is ignored in favour of *alpha*.
        alpha: Band opacity, 0 to 255.
        position: "top", "center" or "bottom"; any other value is treated as
            "center".

    Returns:
        A new RGBA image with the band drawn across the full width.

    Raises:
        ValueError: If *color* cannot be parsed.
    """
    from PIL import ImageColor  # local import: PIL is already a dependency of this file

    img = ensure_rgba(img)
    w, h = img.size
    centered = ((h - band_h) // 2, (h + band_h) // 2)
    spans = {
        "top": (0, band_h),
        "center": centered,
        "bottom": (h - band_h, h),
    }
    y0, y1 = spans.get(position, centered)

    try:
        rgb = ImageColor.getrgb(color)
    except ValueError:
        # Bare hex digits without '#' used to be accepted; keep supporting them.
        rgb = ImageColor.getrgb("#" + color)
    r, g, b = rgb[:3]

    # Draw on a transparent layer so the band is alpha-blended rather than
    # overwriting the pixels underneath.
    overlay = Image.new("RGBA", (w, h), (0, 0, 0, 0))
    draw = ImageDraw.Draw(overlay)
    draw.rectangle([0, y0, w, y1], fill=(r, g, b, alpha))
    return Image.alpha_composite(img, overlay)


# ============================================================
# メイン：頻度計算→WordCloud生成→後処理→保存
# ============================================================

def main():
    """Count token frequencies, render the word cloud and write all image variants.

    Steps:
      1. Apply the size preset (updates the module-level WC_WIDTH / WC_HEIGHT).
      2. Make sure the input file exists, prompting for an upload on Colab.
      3. Tokenize every line, remove stopwords, optionally build n-grams and
         count frequencies; drop tokens rarer than MIN_FREQUENCY.
      4. Pick a font and a colouring function, then render the word cloud.
      5. Save the base image, the uniformly translucent version and, when
         enabled, the edge-faded and safe-band versions.
      6. Print the 20 most frequent tokens.

    Raises:
        FileNotFoundError: If the input file is still missing after step 2.
        ValueError: If no token survives filtering.
    """
    # Apply the preset.
    global WC_WIDTH, WC_HEIGHT
    preset_sizes = {
        "hero": (2400, 1200),
        "banner": (2400, 800),
        "strip": (1600, 300),
    }
    if PRESET in preset_sizes:
        WC_WIDTH, WC_HEIGHT = preset_sizes[PRESET]

    # When the input file is missing on Colab, ask for an upload.
    if not os.path.exists(INPUT_FILE) and IN_COLAB and _colab_files is not None:
        print(f"'{INPUT_FILE}' が見つかりません。ダイアログからアップロードしてください。")
        uploaded = _colab_files.upload()
        if INPUT_FILE not in uploaded and uploaded:
            # Use the first uploaded file under the expected name.
            alt = next(iter(uploaded.keys()))
            os.rename(alt, INPUT_FILE)
            print(f"アップロードされた '{alt}' を '{INPUT_FILE}' として使用します。")

    if not os.path.exists(INPUT_FILE):
        raise FileNotFoundError(f"{INPUT_FILE} が見つかりません。パスを確認してください。")

    # Frequency count.
    freq = Counter()
    use_ngrams = bool(NGRAM_N and NGRAM_N > 1)
    for line in load_lines(INPUT_FILE):
        # Stopwords must go before n-grams are built: a joined n-gram such as
        # "of_the" never equals a single stopword, so filtering only
        # afterwards would leave every stopword in place.
        tokens = [t for t in preprocess_and_tokenize(line) if t not in STOPWORDS]
        if use_ngrams:
            # Filter again so that whole n-grams listed as stopwords are
            # removed too.
            tokens = [t for t in make_ngrams(tokens, n=NGRAM_N) if t not in STOPWORDS]
        freq.update(tokens)

    if MIN_FREQUENCY > 1:
        freq = Counter({k: v for k, v in freq.items() if v >= MIN_FREQUENCY})

    if not freq:
        raise ValueError("有効なトークンが得られませんでした。STOPWORDS/MIN_TOKEN_LEN/MIN_FREQUENCY を見直してください。")

    # Font selection.
    font_path = find_japanese_font(preferred_weight=FONT_PREFERRED_WEIGHT)
    if font_path is None:
        print("警告: 日本語フォントを自動検出できませんでした。FONT_PATH に日本語フォントのパスを設定してください。")

    # Colouring function selection.
    if COLOR_MODE == "palette":
        color_func = make_palette_color_func(PALETTE)
    elif COLOR_MODE == "colormap":
        color_func = make_colormap_color_func(COLORMAP_NAME)
    elif COLOR_MODE == "custom":
        color_func = make_custom_color_func(freq)
    else:
        color_func = None

    # WordCloud rendering options, including the background mode.
    wc_kwargs = dict(
        width=WC_WIDTH,
        height=WC_HEIGHT,
        max_words=MAX_WORDS,
        font_path=font_path,
        prefer_horizontal=PREFER_HORIZONTAL,
        random_state=RANDOM_STATE,
        collocations=False
    )
    if TRANSPARENT_BG:
        wc_kwargs.update(dict(mode="RGBA", background_color=None))
    else:
        wc_kwargs.update(dict(background_color=BACKGROUND_COLOR))

    wc = WordCloud(**wc_kwargs).generate_from_frequencies(freq)

    # Recolour (changes only the colours; the layout stays fixed).
    if color_func is not None:
        wc.recolor(color_func=color_func, random_state=RANDOM_STATE)

    # Save: basic version.
    wc.to_file(OUTPUT_PNG)
    print(f"[OK] 保存: {OUTPUT_PNG}")

    # Reload for post-processing. convert() returns an independent copy, so
    # the file handle can be closed as soon as the block ends.
    with Image.open(OUTPUT_PNG) as saved:
        base = saved.convert("RGBA")

    # Translucent version (RGBA output even when TRANSPARENT_BG is False,
    # as long as ALPHA_ON_SAVE is set).
    transparent_path = "wordcloud_transparent.png"
    transparent = base.copy()
    if ALPHA_ON_SAVE is not None:
        transparent = apply_uniform_alpha(transparent, ALPHA_ON_SAVE)
    transparent.save(transparent_path)
    print(f"[OK] 保存: {transparent_path}")

    # Edge-faded version.
    if APPLY_EDGE_FADE:
        faded_path = "wordcloud_faded.png"
        faded = apply_edge_fade(transparent, strength=EDGE_FADE_STRENGTH, shape=EDGE_FADE_SHAPE)
        faded.save(faded_path)
        print(f"[OK] 保存: {faded_path}")
        working = faded
    else:
        working = transparent

    # Version with a safe text band (best when a heading is overlaid on top).
    if ADD_SAFE_TITLE_BAND:
        band_path = "wordcloud_with_band.png"
        banded = add_safe_title_band(
            working,
            band_h=BAND_HEIGHT_PX,
            color=BAND_COLOR,
            alpha=BAND_ALPHA,
            position=BAND_POSITION
        )
        banded.save(band_path)
        print(f"[OK] 保存: {band_path}")

    # Show the most frequent tokens.
    print("\n上位20語:")
    for w, c in freq.most_common(20):
        print(f"{w}\t{c}")


# Run the pipeline only when this file is executed as the main module
# (not when it is imported).
if __name__ == "__main__":
    main()