<a href="https://colab.research.google.com/github/ashikita/qir-toolbox/blob/main/make-wordcloud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ================================
# Cell 2: word cloud generation (Colab-optimized version)
# ================================

# ===== Settings block (change as needed) =====
INPUT_FILE = "input.txt"             # UTF-8 text file with one title per line
OUTPUT_PNG = "wordcloud.png"         # Output image (PNG)
WC_WIDTH = 2000                      # Output image width (px)
WC_HEIGHT = 1400                     # Output image height (px)
BACKGROUND_COLOR = "white"           # Background color (white / black, etc.)
MAX_WORDS = 500                      # Maximum number of words to draw
RANDOM_STATE = 42                    # Random seed for the layout
MIN_TOKEN_LEN = 2                    # Minimum token length (English and Japanese alike)
MIN_FREQUENCY = 2                    # Drop words that occur fewer times than this
NGRAM_N = 1                          # 1: unigram, 2: bigram, 3: trigram
# NOTE: with no entries the braces below form an empty dict, not a set; the
# value is passed through set() when STOPWORDS is built, so both forms work.
EXTRA_STOPWORDS = {
    # List additional words to exclude here (English and Japanese may be mixed)
    # e.g. "study", "result", "結果", "研究", "日本"
}
# "auto": auto-detect (recommended). To pin a font, give its path as a string.
FONT_PATH = "auto"
# ===============================================

import os
import re
import sys
import unicodedata
from collections import Counter
from itertools import tee

# Upload helper, available only on Colab. IN_COLAB records whether the import
# succeeded; `files` is None otherwise.
try:
    from google.colab import files  # type: ignore
    IN_COLAB = True
except Exception:
    files = None
    IN_COLAB = False

# Required libraries
from wordcloud import WordCloud
import matplotlib.font_manager as fm

# Janome (recommended: high-accuracy Japanese tokenization).
# _janome is None when the package cannot be imported or initialised.
try:
    from janome.tokenizer import Tokenizer as JanomeTokenizer
    _janome = JanomeTokenizer(wakati=False)
except Exception:
    _janome = None

# TinySegmenter (fallback tokenizer); _tinyseg is None when unavailable.
try:
    from tinysegmenter import TinySegmenter
    _tinyseg = TinySegmenter()
except Exception:
    _tinyseg = None

# Used for the image preview (note: imported unconditionally)
from IPython.display import display
from PIL import Image


# ========== Utilities ==========
# One character from U+3040-U+30FF (hiragana/katakana), U+3400-U+4DBF or
# U+4E00-U+9FFF (CJK ideographs).
_CJK_RE = re.compile(r"[\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF]")
# A string made only of ASCII digits.
_NUM_RE = re.compile(r"^[0-9]+$")
# A string made only of underscores and non-word characters.
_SYM_RE = re.compile(r"^[_\W]+$", flags=re.UNICODE)
# An ASCII letter followed by one or more ASCII letters or apostrophes, so a
# match is always at least two characters long.
_EN_TOKEN_RE = re.compile(r"[A-Za-z][A-Za-z']+")

def normalize_text(text: str) -> str:
    """Return *text* NFKC-normalized with invisible characters collapsed.

    C0 control characters, DEL, U+200B-U+200F and U+202A-U+202E are each
    replaced by a space; runs of whitespace are then squeezed to a single
    space and leading/trailing whitespace is trimmed.
    """
    normalized = unicodedata.normalize("NFKC", text)
    without_controls = re.sub(
        r"[\u0000-\u001F\u007F\u200B-\u200F\u202A-\u202E]", " ", normalized
    )
    return re.sub(r"\s+", " ", without_controls).strip()

def contains_cjk(text: str) -> bool:
    """Report whether *text* has at least one character matched by ``_CJK_RE``."""
    return _CJK_RE.search(text) is not None

def english_tokens(text: str):
    """Return every ``_EN_TOKEN_RE`` match found in the lower-cased *text*."""
    return _EN_TOKEN_RE.findall(text.lower())

def japanese_tokens(text: str):
    """Split *text* into Japanese tokens with the best tokenizer available.

    Preference order:
      1. Janome: keeps only nouns, verbs and adjectives, using the base form
         when Janome provides one and the surface form otherwise.
      2. TinySegmenter: returns its segmentation unchanged.
      3. Regex: runs of kanji / hiragana / katakana characters.
    """
    if _janome is None:
        if _tinyseg is None:
            # Last resort: contiguous kanji/kana runs.
            return re.findall(r"[一-龥々〆ヵヶぁ-んァ-ヴー]+", text)
        return _tinyseg.tokenize(text)

    kept_pos = ("名詞", "動詞", "形容詞")
    return [
        morpheme.surface if morpheme.base_form == "*" else morpheme.base_form
        for morpheme in _janome.tokenize(text)
        # Only the leading (top-level) part-of-speech tag is compared.
        if morpheme.part_of_speech.split(",")[0] in kept_pos
    ]

def make_ngrams(tokens, n=2):
    """Join each run of *n* consecutive tokens into one ``'tok1_tok2'`` string.

    When ``n <= 1`` the input is returned unchanged. Otherwise a list of
    underscore-joined n-grams is returned; it is empty when fewer than *n*
    tokens are supplied.
    """
    if n <= 1:
        return tokens
    sequence = list(tokens)
    window_count = len(sequence) - n + 1
    return ["_".join(sequence[start:start + n]) for start in range(window_count)]

def load_lines(path: str):
    """Yield the non-empty, whitespace-stripped lines of a UTF-8 text file.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark is
    dropped instead of staying attached to the first line; files without a
    BOM decode exactly as plain UTF-8. Undecodable bytes are replaced rather
    than raising.

    Args:
        path: Path of the text file to read.

    Yields:
        Each line with surrounding whitespace removed, skipping blank lines.
    """
    with open(path, "r", encoding="utf-8-sig", errors="replace") as f:
        for line in f:
            line = line.strip()
            if line:
                yield line


# ========== Stop words (English and Japanese) ==========
# English function words plus generic academic-title vocabulary.
# ("across" is listed twice below; harmless in a set literal.)
EN_STOP = {
    "a","an","the","and","or","but","if","then","else","when","while","for","of","in","on","at",
    "to","from","by","with","without","about","across","after","before","during","within","between",
    "is","am","are","was","were","be","been","being","do","does","did","doing","have","has","had",
    "having","can","could","may","might","must","should","would","will","shall",
    "this","that","these","those","it","its","as","not","no","yes","we","you","he","she","they","them",
    "their","there","here","such","than","via","per","etc","i","our","us","your","yours","his","her",
    "into","out","over","under","again","new","based","using","use","used","study","analysis","results",
    "result","method","methods","approach","approaches","paper","article","report","case","cases",
    "effect","effects","evidence","evaluation","overview","review","reviews","systematic","meta",
    "toward","towards","impact","impacts","improving","improvement","improve",
    "model","models","modelling","modeling","data","dataset","datasets","large","small","novel",
    "among","across","more","most","less","least"
}

# Japanese stop words.
JA_STOP = {
    "こと","もの","ところ","ため","よう","られ","れる","ない","ある","いる","おり","なる","する","できる",
    "的","的な","的に","における","において","及び","および","ならびに",
    "これ","それ","あれ","どれ","ここ","そこ","あそこ","どこ","この","その","あの","どの",
    "そして","また","さらに","しかし","一方","など","等","等の","等を","等に","等で","等と",
    "例","場合","結果","研究","検討","報告","考察","課題","手法","方法","比較",
    "影響","実験","解析","分析","評価","事例","概要","総説","序論","序説","序","序文","序章",
    "新た","新しい","新規","一","二","三","四","五","六","七","八","九","十","第","年","月","日",
}

# Union of both built-in lists and the user-supplied EXTRA_STOPWORDS.
STOPWORDS = set(EN_STOP) | set(JA_STOP) | set(EXTRA_STOPWORDS)


# ========== フォント自動検出（Colab向け） ==========
def find_noto_cjk_font() -> str | None:
    """Locate a Noto Sans CJK font file (TTC/OTF) to render Japanese text.

    Lookup order:
      1. An explicit ``FONT_PATH`` (anything other than "" or "auto"); it is
         returned only if the file exists, otherwise ``None`` is returned
         without searching further.
      2. Well-known install locations of the fonts-noto-cjk package.
      3. Fonts registered with matplotlib's font manager whose file name
         contains a Noto Sans CJK name (spaces ignored).

    Returns:
        The font file path, or ``None`` when nothing suitable is found.
    """
    # 1. User-pinned font path.
    if isinstance(FONT_PATH, str) and FONT_PATH not in (None, "", "auto"):
        if os.path.exists(FONT_PATH):
            return FONT_PATH
        return None

    # 2. Common locations (fonts-noto-cjk package).
    known_locations = (
        "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/opentype/noto/NotoSansCJKjp-Regular.otf",
        "/usr/share/fonts/opentype/noto/NotoSansCJKjp-Regular.ttc",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
    )
    first_hit = next((loc for loc in known_locations if os.path.exists(loc)), None)
    if first_hit is not None:
        return first_hit

    # 3. Search matplotlib's font list by file name.
    names = ("NotoSansCJK", "Noto Sans CJK", "Noto Sans CJK JP", "NotoSansCJKjp")
    for entry in fm.fontManager.ttflist:
        font_file = getattr(entry, "fname", "")
        if not font_file:
            continue
        compact_name = os.path.basename(font_file).replace(" ", "")
        name_matches = any(n.replace(" ", "") in compact_name for n in names)
        if name_matches and os.path.exists(font_file):
            return font_file

    return None


# ========== 前処理 & トークン化 ==========
def preprocess_and_tokenize(line: str):
    """Extract cleaned English and Japanese tokens from one title line.

    The line is normalized, English tokens are taken with the English regex,
    and, only when the line contains CJK characters, Japanese tokens are
    taken with ``japanese_tokens``. Tokens that are empty, digits only,
    symbols only, or shorter than ``MIN_TOKEN_LEN`` are discarded.

    Args:
        line: One raw input line (one title).

    Returns:
        A list of tokens: English tokens (lower-cased) followed by Japanese
        tokens.
    """
    text = normalize_text(line)

    # English tokens (already lower-cased).
    en = english_tokens(text)

    # Japanese tokens. The Japanese tokenizers run over the whole line and so
    # also return Latin-script words (TinySegmenter returns every segment;
    # Janome generally tags alphabetic words as nouns). Those words are
    # already present in `en`, so keeping them would count each one twice,
    # once lower-cased and once in its original case. Keep only tokens that
    # actually contain a CJK character.
    ja = []
    if contains_cjk(text):
        ja = [tok for tok in japanese_tokens(text) if contains_cjk(tok)]

    # Drop empty, numeric-only, symbol-only and too-short tokens.
    clean = []
    for tok in en + ja:
        tok = tok.strip()
        if not tok:
            continue
        if _NUM_RE.fullmatch(tok) or _SYM_RE.fullmatch(tok):
            continue
        if len(tok) < MIN_TOKEN_LEN:
            continue
        clean.append(tok)

    return clean


# ========== メイン処理 ==========
def main():
    """Build a word-cloud PNG from the titles in ``INPUT_FILE``.

    Steps:
      1. On Colab, offer an upload dialog when ``INPUT_FILE`` is missing.
      2. Tokenize every line, remove stop words, optionally build n-grams,
         and count frequencies.
      3. Drop entries rarer than ``MIN_FREQUENCY``.
      4. Render the cloud, save it to ``OUTPUT_PNG``, print the top 20
         entries and try to show a preview.

    Raises:
        FileNotFoundError: ``INPUT_FILE`` does not exist (including after a
            cancelled or empty upload).
        ValueError: No token survived the filters.
    """
    # Open the Colab upload dialog when the input file is missing.
    if not os.path.exists(INPUT_FILE) and IN_COLAB and files is not None:
        print(f"'{INPUT_FILE}' が見つかりません。ダイアログからアップロードしてください。")
        uploaded = files.upload()
        # If the upload used a different name, adopt the first uploaded file
        # as INPUT_FILE. An empty result (e.g. the dialog was dismissed) is
        # skipped here so the FileNotFoundError below reports the problem
        # instead of a bare StopIteration.
        if uploaded and INPUT_FILE not in uploaded:
            alt = next(iter(uploaded))
            os.rename(alt, INPUT_FILE)
            print(f"アップロードされた '{alt}' を '{INPUT_FILE}' として使用します。")

    if not os.path.exists(INPUT_FILE):
        raise FileNotFoundError(f"{INPUT_FILE} が見つかりません。パスを確認してください。")

    freq = Counter()
    use_ngrams = bool(NGRAM_N and NGRAM_N > 1)

    # Read and tokenize.
    for line in load_lines(INPUT_FILE):
        # Stop words are single tokens, so they must be removed before the
        # tokens are joined into n-grams; a joined "a_b" string would never
        # match a STOPWORDS entry.
        tokens = [t for t in preprocess_and_tokenize(line) if t not in STOPWORDS]
        if use_ngrams:
            # Filter again so n-gram-shaped entries in EXTRA_STOPWORDS
            # (e.g. "foo_bar") are still honoured.
            tokens = [
                t for t in make_ngrams(tokens, n=NGRAM_N) if t not in STOPWORDS
            ]
        freq.update(tokens)

    # Minimum-frequency filter.
    if MIN_FREQUENCY > 1:
        freq = Counter({k: v for k, v in freq.items() if v >= MIN_FREQUENCY})

    if not freq:
        raise ValueError("有効なトークンが得られませんでした。STOPWORDS, MIN_TOKEN_LEN, MIN_FREQUENCY を見直してください。")

    # Choose the font; without a CJK font Japanese text may not render.
    font_path = find_noto_cjk_font()
    if font_path is None:
        print(
            "警告: 日本語フォントを自動検出できませんでした。"
            " 'FONT_PATH' に日本語フォント(TTC/TTF/OTF)のパスを指定してください。"
        )

    # Generate the word cloud from the frequency table.
    wc = WordCloud(
        width=WC_WIDTH,
        height=WC_HEIGHT,
        background_color=BACKGROUND_COLOR,
        max_words=MAX_WORDS,
        font_path=font_path,        # important for Japanese rendering
        prefer_horizontal=0.9,
        random_state=RANDOM_STATE,
        collocations=False,         # n-grams are built by this script instead
        regexp=None
    ).generate_from_frequencies(freq)

    # Save the image (PNG).
    wc.to_file(OUTPUT_PNG)

    # Print a summary.
    top_items = freq.most_common(20)
    print(f"出力: {OUTPUT_PNG}")
    print(f"語彙数: {len(freq)} / 最大語数: {MAX_WORDS}")
    print("上位20語:")
    for w, c in top_items:
        print(f"{w}\t{c}")

    # Best-effort preview: a display failure must not fail the run (the PNG
    # is already saved), but it is reported instead of silently ignored.
    try:
        with Image.open(OUTPUT_PNG) as preview:
            display(preview)
    except Exception as exc:
        print(f"プレビューを表示できませんでした: {exc}")


# Run the pipeline when this module is executed as the main program.
if __name__ == "__main__":
    main()