### 한일 기후변화비교 연구
#### Step 2. 토픽 모델링

In [2]:
import os
import re
import gc
import json
import glob
import traceback
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import product

from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# ---------- PATHS ----------
# Base directory that DATA_DIR is built from.
BASE_DIR = "E:/Data_for_Practice/JapMedia/"
# Folder the input CSVs are globbed from and the result CSVs are written to.
DATA_DIR = os.path.join(BASE_DIR, "data/kor_data")
# Name of the CSV column that process_file reads as the document text.
TEXT_COL = "본문_en"

# ---------- CONFIG ----------
# Seed for NumPy's global RNG (seeded below) and for UMAP's random_state.
SEED = 42
# Model name handed to SentenceTransformer.
EMBEDDING_NAME = "BAAI/bge-large-en-v1.5"
# Documents with fewer whitespace-separated tokens than this after cleaning
# are dropped by process_file.
MIN_TOKENS_AFTER_CLEAN = 10
# Passed to BERTopic as nr_topics.
NR_TOPICS_FIXED = 5

# Passed to CountVectorizer as min_df / max_df.
MIN_DF_FIXED = 0.01
MAX_DF_FIXED = 0.90

# Default number of top words per topic used by topic_diversity.
TOPN_WORDS_FOR_DIVERSITY = 10
# Default weights of diversity and cohesion in composite_score.
ALPHA_DIVERSITY = 0.6
BETA_COHESION = 0.4

# NOTE(review): the ALLOWED_* lists are not referenced by any code visible in
# this cell (process_file uses one hard-coded parameter set); presumably they
# are left over from a hyperparameter search -- confirm before removing.
ALLOWED_NGRAM = [(1,1), (1,2)]
ALLOWED_N_NEIGHBORS = [10, 15]
ALLOWED_MIN_DIST    = [0.1, 0.2]
ALLOWED_MIN_CLUSTER = [15, 25, 35]

np.random.seed(SEED)

# ---------- Device ----------
def pick_device_for_st():
    """Return "cuda" when torch reports an available CUDA device, else "cpu".

    Any failure while importing or querying torch is treated as "no GPU",
    so the function always returns one of the two strings.
    """
    try:
        import torch
        has_cuda = torch.cuda.is_available()
    except Exception:
        # Best effort: a missing or broken torch install falls back to CPU.
        return "cpu"
    if has_cuda:
        return "cuda"
    return "cpu"


DEVICE = pick_device_for_st()
print(f"[INFO] Using device: {DEVICE}")

# ---------- Stopwords ----------
# Extra stop words that are merged with scikit-learn's ENGLISH_STOP_WORDS below.
# The merged set is passed to clean_text_lemma by process_file (where it is
# compared against lemmas) and handed to CountVectorizer in fit_evaluate.
NEWS_STOPWORDS_EXTRA = {
    "said","will","one","two","new","year","years","percent","also","could","would",
    "mr","ms","u","korea","south","seoul","korean","however","among","may","many",
    "made","make","like","since","according","including","told","say","says",
    "first","last","day","days","week","weeks","month","months","today","yesterday","tomorrow",
    "time","times","report","reports","reported","yonhap","reuters","ap",
    "people","company","companies","government","official","officials",
    "article","news","daily","kim","park","lee","cho","jang","chung","moon","yoon"
}
# Union of the built-in English list and the extra terms above.
all_stop_words = set(ENGLISH_STOP_WORDS).union(NEWS_STOPWORDS_EXTRA)

# ---------- Preprocessing ----------
import subprocess
import sys

import spacy

# Load the small English pipeline; NER and the parser are disabled.
try:
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Install it with the interpreter that is running this code (not whatever
    # "python" happens to be on PATH) and fail loudly if the download fails.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def clean_text_lemma(t: str, stopwords: set) -> str:
    """Normalise raw text and return the kept lemmas joined by single spaces.

    The input is lower-cased, then URLs, standalone digit runs and every
    character other than a-z, whitespace or hyphen are replaced by spaces.
    The result is run through the module-level spaCy pipeline ``nlp``.

    Args:
        t: Raw text; it is coerced with ``str()`` first.
        stopwords: Lemmas to discard.

    Returns:
        The lemmas of alphabetic tokens that are longer than two characters
        and not in ``stopwords``, separated by spaces.
    """
    text = str(t).lower()
    # Applied in order: URLs are stripped first, whitespace is collapsed last.
    substitutions = (
        (r"(https?://\S+|www\.\S+)", " "),
        (r"\b\d+\b", " "),
        (r"[^a-z\s\-]", " "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.strip()

    kept = []
    for token in nlp(text):
        lemma = token.lemma_
        if not token.is_alpha:
            continue
        if len(lemma) <= 2 or lemma in stopwords:
            continue
        kept.append(lemma)
    return " ".join(kept)

# ---------- Embedding ----------
embedding_model = SentenceTransformer(EMBEDDING_NAME, device=DEVICE)


def encode_texts(texts):
    """Encode ``texts`` with the module-level ``embedding_model``.

    The batch size is 48 when DEVICE is "cpu" and 128 otherwise. A progress
    bar is shown and NumPy output is requested (``convert_to_numpy=True``).
    """
    batch = 128 if DEVICE != "cpu" else 48
    vectors = embedding_model.encode(
        texts,
        batch_size=batch,
        show_progress_bar=True,
        convert_to_numpy=True,
        device=DEVICE,
    )
    return vectors

# ---------- Scoring ----------
def topic_diversity(topic_model, topn=TOPN_WORDS_FOR_DIVERSITY):
    """Return the share of unique words among the top words of all topics.

    Topic -1 is excluded. For every remaining topic the first ``topn`` words
    returned by ``topic_model.get_topic`` are collected; the result is the
    number of distinct words divided by ``number_of_topics * topn``.
    Returns 0.0 when there is no topic other than -1.
    """
    listed_ids = topic_model.get_topic_info()["Topic"].tolist()
    real_topics = [tid for tid in listed_ids if tid != -1]
    if not real_topics:
        return 0.0

    vocabulary = set()
    for tid in real_topics:
        representation = topic_model.get_topic(tid)
        if not representation:
            continue
        vocabulary.update(word for (word, _) in representation[:topn])
    return len(vocabulary) / (len(set(real_topics)) * topn)

def topic_cohesion(embeddings, labels):
    """Return the mean cosine similarity of documents to their topic centroid.

    Documents labelled -1 are ignored, and topics with a single document
    contribute nothing. Returns 0.0 when fewer than two documents carry a
    label other than -1 or when no topic has more than one document.
    """
    label_arr = np.array(labels)
    keep = label_arr != -1
    if keep.sum() < 2:
        return 0.0

    kept_emb = embeddings[keep]
    kept_lab = label_arr[keep]
    scores = []
    for topic_id in np.unique(kept_lab):
        members = kept_emb[kept_lab == topic_id]
        if len(members) <= 1:
            continue
        # keepdims gives the (1, dim) shape cosine_similarity expects.
        center = np.mean(members, axis=0, keepdims=True)
        scores.extend(cosine_similarity(members, center).ravel())

    if not scores:
        return 0.0
    return np.mean(scores)

def composite_score(model, emb, labels, alpha=ALPHA_DIVERSITY, beta=BETA_COHESION):
    """Combine topic diversity and cohesion into one weighted score.

    Returns:
        A tuple ``(alpha * diversity + beta * cohesion, diversity, cohesion)``.
    """
    diversity = topic_diversity(model)
    cohesion = topic_cohesion(emb, labels)
    combined = alpha * diversity + beta * cohesion
    return combined, diversity, cohesion

# ---------- Fit function ----------
def fit_evaluate(params, docs, emb):
    """Fit a BERTopic model with ``params`` and score the result.

    Args:
        params: Dict with the keys "n_gram_range", "n_neighbors", "min_dist"
            and "min_cluster_size".
        docs: Cleaned documents.
        emb: Embeddings passed to ``fit_transform`` together with ``docs``.

    Returns:
        ``(score, diversity, cohesion, n_topics, model)``; ``n_topics`` counts
        the topics other than -1. With fewer than five documents nothing is
        fitted and ``(-9999, 0, 0, 0, None)`` is returned.
    """
    if len(docs) < 5:
        return -9999, 0, 0, 0, None

    ngrams = params["n_gram_range"]
    vectorizer = CountVectorizer(
        stop_words=list(all_stop_words),
        ngram_range=ngrams,
        min_df=MIN_DF_FIXED,
        max_df=MAX_DF_FIXED,
    )
    reducer = UMAP(
        n_neighbors=params["n_neighbors"],
        min_dist=params["min_dist"],
        n_components=10,
        metric="cosine",
        random_state=SEED,
    )
    clusterer = HDBSCAN(
        min_cluster_size=params["min_cluster_size"],
        min_samples=5,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )
    weighting = ClassTfidfTransformer(reduce_frequent_words=True)

    model = BERTopic(
        n_gram_range=ngrams,
        nr_topics=NR_TOPICS_FIXED,
        embedding_model=embedding_model,
        vectorizer_model=vectorizer,
        ctfidf_model=weighting,
        umap_model=reducer,
        hdbscan_model=clusterer,
        calculate_probabilities=True,
        verbose=False,
    )
    topics, _ = model.fit_transform(docs, emb)
    score, td, coh = composite_score(model, emb, topics)

    topic_column = model.get_topic_info()["Topic"]
    n_topics = sum(1 for tid in topic_column if tid != -1)
    return score, td, coh, n_topics, model

# ---------- Per-file pipeline ----------
def process_file(file_path: str) -> None:
    """Run the topic-modelling pipeline on one CSV and write two result CSVs.

    Steps: read the CSV, drop rows whose TEXT_COL is missing, clean the text
    with ``clean_text_lemma``, drop documents shorter than
    MIN_TOKENS_AFTER_CLEAN tokens, embed the rest, fit one BERTopic model via
    ``fit_evaluate`` with a fixed parameter set, then write
    ``<stem>_topic_summary.csv`` and ``<stem>_doc_topics.csv`` into DATA_DIR.

    The file is skipped (with a printed message) when TEXT_COL is absent or
    fewer than five documents survive cleaning.

    Args:
        file_path: Path of the input CSV (read as utf-8-sig).
    """
    fname = os.path.basename(file_path)
    print(f"\n===== Processing {fname} =====")

    df = pd.read_csv(file_path, encoding="utf-8-sig")
    if TEXT_COL not in df.columns:
        print(f"[SKIP] No '{TEXT_COL}' column")
        return
    df = df[df[TEXT_COL].notna()].copy()
    df["clean_text"] = df[TEXT_COL].map(lambda x: clean_text_lemma(x, all_stop_words))
    # .copy() so the column assignments below do not write into a slice.
    df = df[df["clean_text"].str.split().map(len) >= MIN_TOKENS_AFTER_CLEAN].copy()
    docs = df["clean_text"].tolist()
    if len(docs) < 5:
        print(f"[SKIP] Too few docs ({len(docs)})")
        return

    emb = encode_texts(docs)

    # Best param (a single fixed combination is used).
    best_param = {"n_gram_range": (1,1), "n_neighbors": 10, "min_dist": 0.1, "min_cluster_size": 25}
    score, _, _, n_topics, model = fit_evaluate(best_param, docs, emb)
    print(f"[DONE] {fname} | Topics: {n_topics} | Score: {score:.4f}")

    # Reuse the assignments of the fit performed inside fit_evaluate instead
    # of fitting a second time: this halves the run time and guarantees that
    # the saved topics belong to the same fit as the score printed above.
    topics = model.topics_
    probs = model.probabilities_

    # Build each topic's label once rather than once per document.
    labels_by_topic = {
        tid: ", ".join(w for (w, _) in model.get_topic(tid)[:10])
        for tid in set(topics)
        if tid != -1
    }

    df["topic"] = topics
    df["topic_label"] = ["Outlier" if tid == -1 else labels_by_topic[tid] for tid in topics]
    if probs is None:
        df["topic_prob"] = np.nan
    else:
        # Highest probability in each document's row; NaN for an empty row.
        df["topic_prob"] = [
            float(np.nanmax(p)) if p is not None and np.size(p) > 0 else np.nan
            for p in probs
        ]

    # splitext removes only the extension, even if ".csv" occurs inside the name.
    stem = os.path.splitext(fname)[0]
    topic_info = model.get_topic_info()
    topic_info.to_csv(os.path.join(DATA_DIR, f"{stem}_topic_summary.csv"), index=False)
    df.to_csv(os.path.join(DATA_DIR, f"{stem}_doc_topics.csv"), index=False)

    # ----------- Disabled optional outputs (kept commented out) -----------
    # NOTE: stage_df is not defined anywhere in this function.
    # # (1) Save hyperparameter trials
    # stage_df.to_csv(os.path.join(DATA_DIR, f"{fname.replace('.csv','')}_hp_trials.csv"), index=False)
    #
    # # (2) Save visualisation HTML
    # fig_bar = model.visualize_barchart(top_n_topics=NR_TOPICS_FIXED)
    # fig_bar.write_html(os.path.join(DATA_DIR, f"{fname.replace('.csv','')}_viz_barchart.html"))
    # -----------------------------------------------------------------------

    # Drop the large objects before the next file is processed.
    del df, emb, model
    gc.collect()

# ---------- Run ----------
if __name__ == "__main__":
    # Process every matching CSV in DATA_DIR in sorted order; a failure in one
    # file is reported with its traceback and does not stop the remaining files.
    pattern = os.path.join(DATA_DIR, "*_gpt.csv")
    files = sorted(glob.glob(pattern))
    for csv_path in files:
        try:
            process_file(csv_path)
        except Exception as err:
            print(f"[ERROR] {csv_path}: {err}")
            traceback.print_exc()


[INFO] Using device: cuda

===== Processing 조선일보_2022년도 데이터_translated_gpt.csv =====


Batches: 100%|██████████| 3/3 [00:02<00:00,  1.49it/s]


[DONE] 조선일보_2022년도 데이터_translated_gpt.csv | Topics: 4 | Score: 0.9276

===== Processing 조선일보_2023년도 데이터_translated_gpt.csv =====


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.77it/s]


[DONE] 조선일보_2023년도 데이터_translated_gpt.csv | Topics: 4 | Score: 0.9148

===== Processing 조선일보_2024년도 데이터_translated_gpt.csv =====


Batches: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


[DONE] 조선일보_2024년도 데이터_translated_gpt.csv | Topics: 3 | Score: 0.9272

===== Processing 한겨레_2022년도 데이터_translated_gpt.csv =====


Batches: 100%|██████████| 6/6 [00:04<00:00,  1.43it/s]


[DONE] 한겨레_2022년도 데이터_translated_gpt.csv | Topics: 4 | Score: 0.9298

===== Processing 한겨레_2023년도 데이터_translated_gpt.csv =====


Batches: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]


[DONE] 한겨레_2023년도 데이터_translated_gpt.csv | Topics: 4 | Score: 0.8858

===== Processing 한겨레_2024년도 데이터_translated_gpt.csv =====


Batches: 100%|██████████| 5/5 [00:03<00:00,  1.49it/s]


[DONE] 한겨레_2024년도 데이터_translated_gpt.csv | Topics: 4 | Score: 0.9313


### 일본

In [5]:
import os
import re
import gc
import json
import glob
import traceback
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import product

from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# ---------- PATHS ----------
# Base directory that DATA_DIR is built from.
BASE_DIR = "E:/Data_for_Practice/JapMedia/"
# Folder the input CSVs are globbed from and the result CSVs are written to.
DATA_DIR = os.path.join(BASE_DIR, "data/jap_data/translated")
# Name of the CSV column that process_file reads as the document text.
TEXT_COL = "영문 번역"

# ---------- CONFIG ----------
# Seed for NumPy's global RNG (seeded below) and for UMAP's random_state.
SEED = 42
# Model name handed to SentenceTransformer.
EMBEDDING_NAME = "BAAI/bge-large-en-v1.5"
# Documents with fewer whitespace-separated tokens than this after cleaning
# are dropped by process_file.
MIN_TOKENS_AFTER_CLEAN = 10
# Passed to BERTopic as nr_topics.
NR_TOPICS_FIXED = 5

# Passed to CountVectorizer as min_df / max_df.
MIN_DF_FIXED = 0.01
MAX_DF_FIXED = 0.90

# Default number of top words per topic used by topic_diversity.
TOPN_WORDS_FOR_DIVERSITY = 10
# Default weights of diversity and cohesion in composite_score.
ALPHA_DIVERSITY = 0.6
BETA_COHESION = 0.4

# NOTE(review): the ALLOWED_* lists are not referenced by any code visible in
# this cell (process_file uses one hard-coded parameter set); presumably they
# are left over from a hyperparameter search -- confirm before removing.
ALLOWED_NGRAM = [(1,1), (1,2)]
ALLOWED_N_NEIGHBORS = [10, 15]
ALLOWED_MIN_DIST    = [0.1, 0.2]
ALLOWED_MIN_CLUSTER = [15, 25, 35]

np.random.seed(SEED)

# ---------- Device ----------
def pick_device_for_st():
    """Return "cuda" when torch reports an available CUDA device, else "cpu".

    Any failure while importing or querying torch is treated as "no GPU",
    so the function always returns one of the two strings.
    """
    try:
        import torch
        has_cuda = torch.cuda.is_available()
    except Exception:
        # Best effort: a missing or broken torch install falls back to CPU.
        return "cpu"
    if has_cuda:
        return "cuda"
    return "cpu"


DEVICE = pick_device_for_st()
print(f"[INFO] Using device: {DEVICE}")

# ---------- Stopwords ----------
# Extra stop words that are merged with scikit-learn's ENGLISH_STOP_WORDS below.
# The merged set is passed to clean_text_lemma by process_file (where it is
# compared against lemmas) and handed to CountVectorizer in fit_evaluate.
# NOTE(review): this list contains Korea-specific terms ("korea", "seoul",
# "yonhap", Korean surnames) although this cell reads from
# "data/jap_data/translated"; presumably Japan-specific counterparts were
# intended here -- confirm with the study design before changing it.
NEWS_STOPWORDS_EXTRA = {
    "said","will","one","two","new","year","years","percent","also","could","would",
    "mr","ms","u","korea","south","seoul","korean","however","among","may","many",
    "made","make","like","since","according","including","told","say","says",
    "first","last","day","days","week","weeks","month","months","today","yesterday","tomorrow",
    "time","times","report","reports","reported","yonhap","reuters","ap",
    "people","company","companies","government","official","officials",
    "article","news","daily","kim","park","lee","cho","jang","chung","moon","yoon"
}
# Union of the built-in English list and the extra terms above.
all_stop_words = set(ENGLISH_STOP_WORDS).union(NEWS_STOPWORDS_EXTRA)

# ---------- Preprocessing ----------
import subprocess
import sys

import spacy

# Load the small English pipeline; NER and the parser are disabled.
try:
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Install it with the interpreter that is running this code (not whatever
    # "python" happens to be on PATH) and fail loudly if the download fails.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def clean_text_lemma(t: str, stopwords: set) -> str:
    """Normalise raw text and return the kept lemmas joined by single spaces.

    The input is lower-cased, then URLs, standalone digit runs and every
    character other than a-z, whitespace or hyphen are replaced by spaces.
    The result is run through the module-level spaCy pipeline ``nlp``.

    Args:
        t: Raw text; it is coerced with ``str()`` first.
        stopwords: Lemmas to discard.

    Returns:
        The lemmas of alphabetic tokens that are longer than two characters
        and not in ``stopwords``, separated by spaces.
    """
    text = str(t).lower()
    # Applied in order: URLs are stripped first, whitespace is collapsed last.
    substitutions = (
        (r"(https?://\S+|www\.\S+)", " "),
        (r"\b\d+\b", " "),
        (r"[^a-z\s\-]", " "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.strip()

    kept = []
    for token in nlp(text):
        lemma = token.lemma_
        if not token.is_alpha:
            continue
        if len(lemma) <= 2 or lemma in stopwords:
            continue
        kept.append(lemma)
    return " ".join(kept)

# ---------- Embedding ----------
embedding_model = SentenceTransformer(EMBEDDING_NAME, device=DEVICE)


def encode_texts(texts):
    """Encode ``texts`` with the module-level ``embedding_model``.

    The batch size is 48 when DEVICE is "cpu" and 128 otherwise. A progress
    bar is shown and NumPy output is requested (``convert_to_numpy=True``).
    """
    batch = 128 if DEVICE != "cpu" else 48
    vectors = embedding_model.encode(
        texts,
        batch_size=batch,
        show_progress_bar=True,
        convert_to_numpy=True,
        device=DEVICE,
    )
    return vectors

# ---------- Scoring ----------
def topic_diversity(topic_model, topn=TOPN_WORDS_FOR_DIVERSITY):
    """Return the share of unique words among the top words of all topics.

    Topic -1 is excluded. For every remaining topic the first ``topn`` words
    returned by ``topic_model.get_topic`` are collected; the result is the
    number of distinct words divided by ``number_of_topics * topn``.
    Returns 0.0 when there is no topic other than -1.
    """
    listed_ids = topic_model.get_topic_info()["Topic"].tolist()
    real_topics = [tid for tid in listed_ids if tid != -1]
    if not real_topics:
        return 0.0

    vocabulary = set()
    for tid in real_topics:
        representation = topic_model.get_topic(tid)
        if not representation:
            continue
        vocabulary.update(word for (word, _) in representation[:topn])
    return len(vocabulary) / (len(set(real_topics)) * topn)

def topic_cohesion(embeddings, labels):
    """Return the mean cosine similarity of documents to their topic centroid.

    Documents labelled -1 are ignored, and topics with a single document
    contribute nothing. Returns 0.0 when fewer than two documents carry a
    label other than -1 or when no topic has more than one document.
    """
    label_arr = np.array(labels)
    keep = label_arr != -1
    if keep.sum() < 2:
        return 0.0

    kept_emb = embeddings[keep]
    kept_lab = label_arr[keep]
    scores = []
    for topic_id in np.unique(kept_lab):
        members = kept_emb[kept_lab == topic_id]
        if len(members) <= 1:
            continue
        # keepdims gives the (1, dim) shape cosine_similarity expects.
        center = np.mean(members, axis=0, keepdims=True)
        scores.extend(cosine_similarity(members, center).ravel())

    if not scores:
        return 0.0
    return np.mean(scores)

def composite_score(model, emb, labels, alpha=ALPHA_DIVERSITY, beta=BETA_COHESION):
    """Combine topic diversity and cohesion into one weighted score.

    Returns:
        A tuple ``(alpha * diversity + beta * cohesion, diversity, cohesion)``.
    """
    diversity = topic_diversity(model)
    cohesion = topic_cohesion(emb, labels)
    combined = alpha * diversity + beta * cohesion
    return combined, diversity, cohesion

# ---------- Fit function ----------
def fit_evaluate(params, docs, emb):
    """Fit a BERTopic model with ``params`` and score the result.

    Args:
        params: Dict with the keys "n_gram_range", "n_neighbors", "min_dist"
            and "min_cluster_size".
        docs: Cleaned documents.
        emb: Embeddings passed to ``fit_transform`` together with ``docs``.

    Returns:
        ``(score, diversity, cohesion, n_topics, model)``; ``n_topics`` counts
        the topics other than -1. With fewer than five documents nothing is
        fitted and ``(-9999, 0, 0, 0, None)`` is returned.
    """
    if len(docs) < 5:
        return -9999, 0, 0, 0, None

    ngrams = params["n_gram_range"]
    vectorizer = CountVectorizer(
        stop_words=list(all_stop_words),
        ngram_range=ngrams,
        min_df=MIN_DF_FIXED,
        max_df=MAX_DF_FIXED,
    )
    reducer = UMAP(
        n_neighbors=params["n_neighbors"],
        min_dist=params["min_dist"],
        n_components=10,
        metric="cosine",
        random_state=SEED,
    )
    clusterer = HDBSCAN(
        min_cluster_size=params["min_cluster_size"],
        min_samples=5,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )
    weighting = ClassTfidfTransformer(reduce_frequent_words=True)

    model = BERTopic(
        n_gram_range=ngrams,
        nr_topics=NR_TOPICS_FIXED,
        embedding_model=embedding_model,
        vectorizer_model=vectorizer,
        ctfidf_model=weighting,
        umap_model=reducer,
        hdbscan_model=clusterer,
        calculate_probabilities=True,
        verbose=False,
    )
    topics, _ = model.fit_transform(docs, emb)
    score, td, coh = composite_score(model, emb, topics)

    topic_column = model.get_topic_info()["Topic"]
    n_topics = sum(1 for tid in topic_column if tid != -1)
    return score, td, coh, n_topics, model

# ---------- Per-file pipeline ----------
def process_file(file_path: str) -> None:
    """Run the topic-modelling pipeline on one CSV and write two result CSVs.

    Steps: read the CSV, drop rows whose TEXT_COL is missing, clean the text
    with ``clean_text_lemma``, drop documents shorter than
    MIN_TOKENS_AFTER_CLEAN tokens, embed the rest, fit one BERTopic model via
    ``fit_evaluate`` with a fixed parameter set, then write
    ``<stem>_topic_summary.csv`` and ``<stem>_doc_topics.csv`` into DATA_DIR.

    The file is skipped (with a printed message) when TEXT_COL is absent or
    fewer than five documents survive cleaning.

    Args:
        file_path: Path of the input CSV (read as utf-8-sig).
    """
    fname = os.path.basename(file_path)
    print(f"\n===== Processing {fname} =====")

    df = pd.read_csv(file_path, encoding="utf-8-sig")
    if TEXT_COL not in df.columns:
        print(f"[SKIP] No '{TEXT_COL}' column")
        return
    df = df[df[TEXT_COL].notna()].copy()
    df["clean_text"] = df[TEXT_COL].map(lambda x: clean_text_lemma(x, all_stop_words))
    # .copy() so the column assignments below do not write into a slice.
    df = df[df["clean_text"].str.split().map(len) >= MIN_TOKENS_AFTER_CLEAN].copy()
    docs = df["clean_text"].tolist()
    if len(docs) < 5:
        print(f"[SKIP] Too few docs ({len(docs)})")
        return

    emb = encode_texts(docs)

    # Best param (a single fixed combination is used).
    best_param = {"n_gram_range": (1,1), "n_neighbors": 10, "min_dist": 0.1, "min_cluster_size": 25}
    score, _, _, n_topics, model = fit_evaluate(best_param, docs, emb)
    print(f"[DONE] {fname} | Topics: {n_topics} | Score: {score:.4f}")

    # Reuse the assignments of the fit performed inside fit_evaluate instead
    # of fitting a second time: this halves the run time and guarantees that
    # the saved topics belong to the same fit as the score printed above.
    topics = model.topics_
    probs = model.probabilities_

    # Build each topic's label once rather than once per document.
    labels_by_topic = {
        tid: ", ".join(w for (w, _) in model.get_topic(tid)[:10])
        for tid in set(topics)
        if tid != -1
    }

    df["topic"] = topics
    df["topic_label"] = ["Outlier" if tid == -1 else labels_by_topic[tid] for tid in topics]
    if probs is None:
        df["topic_prob"] = np.nan
    else:
        # Highest probability in each document's row; NaN for an empty row.
        df["topic_prob"] = [
            float(np.nanmax(p)) if p is not None and np.size(p) > 0 else np.nan
            for p in probs
        ]

    # splitext removes only the extension, even if ".csv" occurs inside the name.
    stem = os.path.splitext(fname)[0]
    topic_info = model.get_topic_info()
    topic_info.to_csv(os.path.join(DATA_DIR, f"{stem}_topic_summary.csv"), index=False)
    df.to_csv(os.path.join(DATA_DIR, f"{stem}_doc_topics.csv"), index=False)

    # ----------- Disabled optional outputs (kept commented out) -----------
    # NOTE: stage_df is not defined anywhere in this function.
    # # (1) Save hyperparameter trials
    # stage_df.to_csv(os.path.join(DATA_DIR, f"{fname.replace('.csv','')}_hp_trials.csv"), index=False)
    #
    # # (2) Save visualisation HTML
    # fig_bar = model.visualize_barchart(top_n_topics=NR_TOPICS_FIXED)
    # fig_bar.write_html(os.path.join(DATA_DIR, f"{fname.replace('.csv','')}_viz_barchart.html"))
    # -----------------------------------------------------------------------

    # Drop the large objects before the next file is processed.
    del df, emb, model
    gc.collect()

# ---------- Run ----------
if __name__ == "__main__":
    # Process every matching CSV in DATA_DIR in sorted order; a failure in one
    # file is reported with its traceback and does not stop the remaining files.
    pattern = os.path.join(DATA_DIR, "*데이터.csv")
    files = sorted(glob.glob(pattern))
    for csv_path in files:
        try:
            process_file(csv_path)
        except Exception as err:
            print(f"[ERROR] {csv_path}: {err}")
            traceback.print_exc()


[INFO] Using device: cuda

===== Processing (영문번역 추가)_아사히신문_2022년도 데이터.csv =====


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.53it/s]


[DONE] (영문번역 추가)_아사히신문_2022년도 데이터.csv | Topics: 2 | Score: 0.9301

===== Processing (영문번역 추가)_아사히신문_2023년도 데이터.csv =====


Batches: 100%|██████████| 5/5 [00:02<00:00,  1.68it/s]


[DONE] (영문번역 추가)_아사히신문_2023년도 데이터.csv | Topics: 3 | Score: 0.9307

===== Processing (영문번역 추가)_아사히신문_2024년도 데이터.csv =====


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.40it/s]


[DONE] (영문번역 추가)_아사히신문_2024년도 데이터.csv | Topics: 4 | Score: 0.9327

===== Processing (영문번역 추가)_요미우리신문_2022년도 데이터.csv =====


Batches: 100%|██████████| 5/5 [00:02<00:00,  1.81it/s]


[DONE] (영문번역 추가)_요미우리신문_2022년도 데이터.csv | Topics: 3 | Score: 0.9316

===== Processing (영문번역 추가)_요미우리신문_2023년도 데이터.csv =====


Batches: 100%|██████████| 5/5 [00:03<00:00,  1.65it/s]


[DONE] (영문번역 추가)_요미우리신문_2023년도 데이터.csv | Topics: 4 | Score: 0.9334

===== Processing (영문번역 추가)_요미우리신문_2024년도 데이터.csv =====


Batches: 100%|██████████| 5/5 [00:02<00:00,  1.82it/s]


[DONE] (영문번역 추가)_요미우리신문_2024년도 데이터.csv | Topics: 4 | Score: 0.9175
