### 한일 기후변화비교 연구
#### Step 2. 토픽 모델링

In [None]:
# Descriptive Statistics

import pandas as pd
import os
import re
import openpyxl

# Project root. The alternate (Windows) location is kept for reference:
# BASE_DIR = "E:/Data_for_Practice/JapMedia/"
BASE_DIR = '/Users/keungouikim/Library/CloudStorage/GoogleDrive-awekimm@gmail.com/내 드라이브/[YU]/[Research]/24_JWKIM/'
# Folder that is scanned below for per-newspaper, per-year Excel files.
DATA_DIR = os.path.join(BASE_DIR, "data/kor_data")

# File names are expected to start with "<media>_<20YY>": `media` is the
# shortest prefix that is followed by an underscore and a four-digit 20xx year.
pattern = re.compile(r'^(?P<media>.+?)_(?P<year>20\d{2})')

# One dict per Excel file: media name, year, file name and number of rows.
records = []

for fname in os.listdir(DATA_DIR):
    # Excel creates "~$<name>.xlsx" lock files while a workbook is open. They
    # carry an Excel extension and match the name pattern, but they are not
    # readable workbooks, so reading them would abort the whole scan.
    if fname.startswith("~$"):
        continue
    if not fname.lower().endswith((".xlsx", ".xls")):
        continue

    # Files whose name does not follow "<media>_<20YY>..." are ignored.
    m = pattern.search(fname)
    if m is None:
        continue

    media = m.group("media")
    year = int(m.group("year"))
    fpath = os.path.join(DATA_DIR, fname)

    # The row count of the first sheet is used as the article count
    # (assumes one article per row — TODO confirm against the data files).
    df = pd.read_excel(fpath)
    article_count = len(df)

    records.append({
        "media": media,
        "year": year,
        "file": fname,
        "article_count": article_count
    })

# Passing the column list keeps the frame well-formed (and sortable) even when
# no file matched; without it, sort_values raises KeyError on an empty frame.
counts_long = pd.DataFrame(
    records, columns=["media", "year", "file", "article_count"]
).sort_values(["media", "year"])

if counts_long.empty:
    # Nothing to tabulate: report it instead of failing inside pivot/printing.
    counts_wide = pd.DataFrame()
    print(f"[WARN] No Excel files matching '<media>_<year>' found in {DATA_DIR}")
else:
    # Media x year matrix of article counts; missing combinations become 0.
    counts_wide = counts_long.pivot_table(
        index="media",
        columns="year",
        values="article_count",
        aggfunc="sum",
        fill_value=0
    ).astype(int)

    print("=== Long format (파일별) ===")
    print(counts_long[["media", "year", "article_count", "file"]].to_string(index=False))

    print("\n=== Wide format (언론사 x 연도) ===")
    print(counts_wide.to_string())

=== Long format (파일별) ===
     media  year  article_count                             file
조선일보  2022            339 조선일보_2022년도 데이터.xlsx
조선일보  2023            402 조선일보_2023년도 데이터.xlsx
조선일보  2024            299 조선일보_2024년도 데이터.xlsx
   한겨레  2022            732    한겨레_2022년도 데이터.xlsx
   한겨레  2023            569    한겨레_2023년도 데이터.xlsx
   한겨레  2024            557    한겨레_2024년도 데이터.xlsx

=== Wide format (언론사 x 연도) ===
year        2022  2023  2024
media                       
조선일보   339   402   299
한겨레      732   569   557


In [1]:
import os
import re
import gc
import json
import glob
import traceback
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import product

from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# Input/output locations. Source CSVs and topic summaries live in DATA_DIR;
# per-document results with embeddings are written to EMB_DIR.
BASE_DIR = "E:/Data_for_Practice/JapMedia/"
DATA_DIR = os.path.join(BASE_DIR, "data/kor_data")
EMB_DIR = os.path.join(BASE_DIR, "data/embed_figure")
# CSV column used as the document text
# (presumably the English translation of the article body — TODO confirm).
TEXT_COL = "본문_en"

# Seed for NumPy's global RNG (below) and for UMAP's random_state.
SEED = 42
# Model name handed to SentenceTransformer.
EMBEDDING_NAME = "BAAI/bge-large-en-v1.5"
# Documents with fewer whitespace-separated tokens after cleaning are dropped.
MIN_TOKENS_AFTER_CLEAN = 10
# Passed to BERTopic as nr_topics.
NR_TOPICS_FIXED = 5

# Passed to CountVectorizer as min_df / max_df.
MIN_DF_FIXED = 0.01
MAX_DF_FIXED = 0.90

# Default number of top words per topic used by topic_diversity.
TOPN_WORDS_FOR_DIVERSITY = 10
# Default weights of diversity and cohesion in composite_score.
ALPHA_DIVERSITY = 0.6
BETA_COHESION = 0.4

np.random.seed(SEED)

# Device 
def pick_device_for_st():
    """Return the device string for SentenceTransformer: "cuda" or "cpu".

    Returns:
        "cuda" when PyTorch imports and reports an available CUDA device,
        otherwise "cpu" (including when PyTorch cannot be imported or the
        CUDA probe itself fails).
    """
    try:
        import torch
        return "cuda" if torch.cuda.is_available() else "cpu"
    except Exception:
        # Best-effort probe: any import/probe failure means "use the CPU".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return "cpu"

# Resolve the device once; the embedding model and encode_texts both read it.
DEVICE = pick_device_for_st()
print(f"[INFO] Using device: {DEVICE}")

# Stopwords 
# Extra stopwords added on top of scikit-learn's English list: reporting
# verbs, time words, wire-service names, generic news nouns, and Korea-related
# place names and surnames.
NEWS_STOPWORDS_EXTRA = {
    "said","will","one","two","new","year","years","percent","also","could","would",
    "mr","ms","u","korea","south","seoul","korean","however","among","may","many",
    "made","make","like","since","according","including","told","say","says",
    "first","last","day","days","week","weeks","month","months","today","yesterday","tomorrow",
    "time","times","report","reports","reported","yonhap","reuters","ap",
    "people","company","companies","government","official","officials",
    "article","news","daily","kim","park","lee","cho","jang","chung","moon","yoon"
}
# Combined set used both when cleaning text and by the CountVectorizer.
all_stop_words = set(ENGLISH_STOP_WORDS).union(NEWS_STOPWORDS_EXTRA)

# Preprocessing 
import subprocess
import sys

import spacy

# Only lemmas are needed downstream, so NER and the parser are disabled.
try:
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
except OSError:
    # spaCy raises OSError when the model package is not installed. Download it
    # with the interpreter that is running this code (a bare "python" on PATH
    # may belong to a different environment) and fail loudly if the download
    # itself fails, instead of retrying the load after a silent error.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def clean_text_lemma(t: str, stopwords: set) -> str:
    """Normalize a raw text and return its filtered lemmas joined by spaces.

    The text is lower-cased, then URLs, standalone numbers, every character
    other than a-z / whitespace / hyphen, and runs of whitespace are each
    replaced by a single space. The result is run through the module-level
    spaCy pipeline ``nlp``.

    Args:
        t: Raw text; it is converted with ``str()`` first.
        stopwords: Lemmas to drop.

    Returns:
        Space-separated lemmas that are longer than two characters, are not
        in ``stopwords`` and come from purely alphabetic tokens.

    NOTE(review): the stopword test is applied to the lemma only, so a listed
    surface form whose lemma is not listed is kept — confirm this is intended.
    """
    text = str(t).lower()
    for regex in (
        r"(https?://\S+|www\.\S+)",  # URLs
        r"\b\d+\b",                  # standalone numbers
        r"[^a-z\s\-]",               # anything but a-z, whitespace, hyphen
        r"\s{2,}",                   # runs of whitespace
    ):
        text = re.sub(regex, " ", text)
    text = text.strip()

    kept = []
    for token in nlp(text):
        lemma = token.lemma_
        if len(lemma) <= 2:
            continue
        if lemma in stopwords or not token.is_alpha:
            continue
        kept.append(lemma)
    return " ".join(kept)

# Embedding 
# Load the sentence-embedding model once; encode_texts and the BERTopic model
# built in fit_evaluate share this instance.
embedding_model = SentenceTransformer(EMBEDDING_NAME, device=DEVICE)
def encode_texts(texts):
    """Embed ``texts`` with the module-level ``embedding_model``.

    Uses a batch size of 48 on CPU and 128 otherwise, shows a progress bar,
    and asks the model for NumPy output on ``DEVICE``.

    Args:
        texts: The documents to embed.

    Returns:
        Whatever ``embedding_model.encode`` returns with
        ``convert_to_numpy=True``.
    """
    on_cpu = DEVICE == "cpu"
    encode_kwargs = {
        "batch_size": 48 if on_cpu else 128,
        "show_progress_bar": True,
        "convert_to_numpy": True,
        "device": DEVICE,
    }
    return embedding_model.encode(texts, **encode_kwargs)

# Scoring
def topic_diversity(topic_model, topn=TOPN_WORDS_FOR_DIVERSITY):
    """Return the share of unique words among the top words of all topics.

    Args:
        topic_model: Fitted model exposing ``get_topic_info()`` (with a
            "Topic" column) and ``get_topic(topic_id)`` returning
            ``(word, score)`` pairs.
        topn: Number of leading words taken from each topic.

    Returns:
        ``unique words / collected words`` over all non-outlier topics
        (topic id -1 is skipped), or 0.0 when no word was collected.
    """
    info = topic_model.get_topic_info()
    topic_ids = [t for t in info["Topic"].tolist() if t != -1]
    words = []
    for t in topic_ids:
        reps = topic_model.get_topic(t)
        if reps:
            # Skip empty placeholder strings, if any, so they are not counted
            # as a word.
            words.extend(w for (w, _) in reps[:topn] if w)
    # Divide by the number of words actually collected rather than
    # n_topics * topn: a topic with fewer than ``topn`` words (or none) would
    # otherwise deflate the score, and topn == 0 would divide by zero.
    return len(set(words)) / len(words) if words else 0.0

def topic_cohesion(embeddings, labels):
    """Return the mean cosine similarity of documents to their topic centroid.

    Documents labelled -1 are ignored. For every remaining topic with more
    than one document, each document's embedding is compared with the mean
    embedding of that topic; all similarities are pooled and averaged.

    Args:
        embeddings: Array indexed by a boolean mask, one row per label.
        labels: Topic id per document (-1 marks an outlier).

    Returns:
        The pooled mean similarity, or 0.0 when fewer than two documents are
        assigned to a topic or no topic has more than one document.
    """
    label_arr = np.array(labels)
    keep = label_arr != -1
    if keep.sum() < 2:
        return 0.0

    vectors = embeddings[keep]
    assigned = label_arr[keep]

    sims = []
    for topic_id in np.unique(assigned):
        members = vectors[assigned == topic_id]
        if len(members) <= 1:
            # A single document has no meaningful spread around its centroid.
            continue
        centroid = members.mean(axis=0, keepdims=True)
        sims.extend(cosine_similarity(members, centroid).ravel())
    return np.mean(sims) if sims else 0.0

def composite_score(model, emb, labels, alpha=ALPHA_DIVERSITY, beta=BETA_COHESION):
    """Combine topic diversity and topic cohesion into one weighted score.

    Args:
        model: Topic model passed to ``topic_diversity``.
        emb: Document embeddings passed to ``topic_cohesion``.
        labels: Topic id per document passed to ``topic_cohesion``.
        alpha: Weight of the diversity term.
        beta: Weight of the cohesion term.

    Returns:
        Tuple ``(alpha * diversity + beta * cohesion, diversity, cohesion)``.
    """
    diversity = topic_diversity(model)
    cohesion = topic_cohesion(emb, labels)
    weighted = alpha * diversity + beta * cohesion
    return weighted, diversity, cohesion

# Fit function 
def fit_evaluate(params, docs, emb):
    """Fit one BERTopic model on pre-computed embeddings and score it.

    Args:
        params: Dict with keys "n_gram_range", "n_neighbors", "min_dist"
            and "min_cluster_size".
        docs: Cleaned documents.
        emb: Embeddings for ``docs``, passed to ``fit_transform``.

    Returns:
        Tuple ``(score, diversity, cohesion, n_topics, model)`` where
        ``n_topics`` excludes the outlier topic -1. With fewer than five
        documents nothing is fitted and ``(-9999, 0, 0, 0, None)`` is returned.
    """
    if len(docs) < 5:
        return -9999, 0, 0, 0, None

    ngrams = params["n_gram_range"]

    # Dimensionality reduction, seeded for reproducibility.
    reducer = UMAP(
        n_neighbors=params["n_neighbors"],
        min_dist=params["min_dist"],
        n_components=10,
        metric="cosine",
        random_state=SEED,
    )

    # Density-based clustering on the reduced embeddings.
    clusterer = HDBSCAN(
        min_cluster_size=params["min_cluster_size"],
        min_samples=5,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )

    # Bag-of-words and c-TF-IDF used for the topic representations.
    vectorizer = CountVectorizer(
        stop_words=list(all_stop_words),
        ngram_range=ngrams,
        min_df=MIN_DF_FIXED,
        max_df=MAX_DF_FIXED,
    )
    weighting = ClassTfidfTransformer(reduce_frequent_words=True)

    model = BERTopic(
        n_gram_range=ngrams,
        nr_topics=NR_TOPICS_FIXED,
        embedding_model=embedding_model,
        vectorizer_model=vectorizer,
        ctfidf_model=weighting,
        umap_model=reducer,
        hdbscan_model=clusterer,
        calculate_probabilities=True,
        verbose=False,
    )

    topics, _ = model.fit_transform(docs, emb)
    score, td, coh = composite_score(model, emb, topics)

    topic_column = model.get_topic_info()["Topic"]
    n_topics = sum(1 for t in topic_column if t != -1)

    return score, td, coh, n_topics, model

# Per-file pipeline 
def process_file(file_path: str):
    """Run the topic-model pipeline on one translated CSV and save the results.

    Steps: read the CSV, parse newspaper and year from the file name, clean
    and lemmatize ``TEXT_COL``, drop documents shorter than
    ``MIN_TOKENS_AFTER_CLEAN`` tokens, embed the rest, fit BERTopic through
    ``fit_evaluate`` and write two files:

    * ``EMB_DIR/<stem>_doc_topics_with_emb.csv`` — one row per document with
      topic, label, probability, metadata and embedding columns;
    * ``DATA_DIR/<stem>_topic_summary.csv`` — the model's topic info table.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        None. A file without ``TEXT_COL`` or with fewer than five usable
        documents is skipped with a printed message.
    """
    fname = os.path.basename(file_path)
    print(f"\n===== Processing {fname} =====")

    df = pd.read_csv(file_path, encoding="utf-8-sig")
    if TEXT_COL not in df.columns:
        print(f"[SKIP] No '{TEXT_COL}' column")
        return

    # newspaper / year parsing from filename
    base_no_ext = os.path.splitext(fname)[0]

    m = re.search(r"^([^_]+)_(\d{4})년도", base_no_ext)
    if m:
        newspaper = m.group(1)
        year = int(m.group(2))
    else:
        newspaper = "UNKNOWN"
        year = None
        print(f"[WARN] Could not parse newspaper/year from filename: {fname}")

    # preprocessing
    df = df[df[TEXT_COL].notna()].copy()
    df["clean_text"] = df[TEXT_COL].map(lambda x: clean_text_lemma(x, all_stop_words))
    # .copy() so the column assignments below write to an independent frame
    # rather than to a filtered slice (avoids SettingWithCopyWarning).
    df = df[df["clean_text"].str.split().map(len) >= MIN_TOKENS_AFTER_CLEAN].copy()
    docs = df["clean_text"].tolist()
    if len(docs) < 5:
        print(f"[SKIP] Too few docs ({len(docs)})")
        return

    # embedding
    emb = encode_texts(docs)

    # topic modeling
    best_param = {"n_gram_range": (1,1), "n_neighbors": 10, "min_dist": 0.1, "min_cluster_size": 25}
    score, _, _, n_topics, model = fit_evaluate(best_param, docs, emb)
    print(f"[DONE] {fname} | Topics: {n_topics} | Score: {score:.4f}")

    # Reuse the assignments of the fit performed inside fit_evaluate instead
    # of fitting the same model a second time: it halves the work and keeps
    # the saved topics consistent with the score printed above.
    topics = model.topics_
    probs = model.probabilities_

    # Build each topic's label once instead of once per document.
    topic_labels = {-1: "Outlier"}
    for t in set(topics):
        if t != -1:
            reps = model.get_topic(t) or []
            topic_labels[t] = ", ".join(w for (w, _) in reps[:10])

    df["topic"] = topics
    df["topic_label"] = [topic_labels[t] for t in topics]
    if probs is None:
        df["topic_prob"] = np.nan
    else:
        # Highest topic probability per document; NaN when none is available.
        df["topic_prob"] = [
            float(np.nanmax(p)) if p is not None and len(p) > 0 else np.nan
            for p in probs
        ]

    df["newspaper"] = newspaper
    df["year"] = year

    # Append the embedding as emb_0 ... emb_{d-1} columns; both frames are
    # re-indexed from 0 so the column-wise concat aligns row by row.
    emb_dim = emb.shape[1]
    emb_cols = [f"emb_{i}" for i in range(emb_dim)]
    emb_df = pd.DataFrame(emb, columns=emb_cols).reset_index(drop=True)
    df = df.reset_index(drop=True)
    df_with_emb = pd.concat([df, emb_df], axis=1)

    # The output folder may not exist yet on a fresh checkout.
    os.makedirs(EMB_DIR, exist_ok=True)
    save_prefix = os.path.splitext(fname)[0]
    df_with_emb.to_csv(os.path.join(EMB_DIR, f"{save_prefix}_doc_topics_with_emb.csv"), index=False)
    model.get_topic_info().to_csv(os.path.join(DATA_DIR, f"{save_prefix}_topic_summary.csv"), index=False)

    # Release the large per-file objects before the next file is processed.
    del df, model, emb, df_with_emb
    gc.collect()

if __name__ == "__main__":
    # Process every "*_gpt.csv" file in DATA_DIR in sorted order. A failure in
    # one file is reported with its traceback and does not stop the others.
    files = glob.glob(os.path.join(DATA_DIR, "*_gpt.csv"))
    files.sort()
    for f in files:
        try:
            process_file(f)
        except Exception as err:
            print(f"[ERROR] {f}: {err}")
            traceback.print_exc()

  from .autonotebook import tqdm as notebook_tqdm


[INFO] Using device: cuda

===== Processing 조선일보_2022년도 데이터_translated_gpt.csv =====


Batches: 100%|██████████| 3/3 [00:02<00:00,  1.32it/s]


[DONE] 조선일보_2022년도 데이터_translated_gpt.csv | Topics: 4 | Score: 0.9276

===== Processing 조선일보_2023년도 데이터_translated_gpt.csv =====


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.74it/s]


[DONE] 조선일보_2023년도 데이터_translated_gpt.csv | Topics: 4 | Score: 0.9148

===== Processing 조선일보_2024년도 데이터_translated_gpt.csv =====


Batches: 100%|██████████| 3/3 [00:01<00:00,  1.75it/s]


[DONE] 조선일보_2024년도 데이터_translated_gpt.csv | Topics: 3 | Score: 0.9272

===== Processing 한겨레_2022년도 데이터_translated_gpt.csv =====


Batches: 100%|██████████| 6/6 [00:04<00:00,  1.45it/s]


[DONE] 한겨레_2022년도 데이터_translated_gpt.csv | Topics: 4 | Score: 0.9298

===== Processing 한겨레_2023년도 데이터_translated_gpt.csv =====


Batches: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]


[DONE] 한겨레_2023년도 데이터_translated_gpt.csv | Topics: 4 | Score: 0.8858

===== Processing 한겨레_2024년도 데이터_translated_gpt.csv =====


Batches: 100%|██████████| 5/5 [00:03<00:00,  1.48it/s]


[DONE] 한겨레_2024년도 데이터_translated_gpt.csv | Topics: 4 | Score: 0.9313


### 일본

In [2]:
import os
import re
import gc
import glob
import traceback
import numpy as np
import pandas as pd
from tqdm import tqdm

# Sentence-transformers / BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# ---------- PATHS ----------
# Source CSVs and topic summaries live in DATA_DIR; per-document results with
# embeddings are written to EMB_DIR.
BASE_DIR = "E:/Data_for_Practice/JapMedia/"
DATA_DIR = os.path.join(BASE_DIR, "data/jap_data/translated")
EMB_DIR = os.path.join(BASE_DIR, "data/embed_figure")
# CSV column used as the document text
# (presumably the English translation of each article — TODO confirm).
TEXT_COL = "영문 번역"

# ---------- CONFIG ----------
# Seed passed to UMAP as random_state.
SEED = 42
# Model name handed to SentenceTransformer.
EMBEDDING_NAME = "BAAI/bge-large-en-v1.5"
# Documents with fewer whitespace-separated tokens after cleaning are dropped.
MIN_TOKENS_AFTER_CLEAN = 10
# Passed to BERTopic as nr_topics.
NR_TOPICS_FIXED = 5

# Passed to CountVectorizer as min_df / max_df.
MIN_DF_FIXED = 0.01
MAX_DF_FIXED = 0.90

# ---------- Simple Stopwords ----------
# Extra stopwords added on top of scikit-learn's English list: reporting
# verbs, time words, wire-service names, generic news nouns, and Korea-related
# place names and surnames.
# NOTE(review): the list is Korea-specific (korea, seoul, Korean surnames)
# although this cell reads data/jap_data — confirm whether Japan-specific
# terms should be added or whether the shared list is intentional.
NEWS_STOPWORDS_EXTRA = {
    "said","will","one","two","new","year","years","percent","also","could","would",
    "mr","ms","u","korea","south","seoul","korean","however","among","may","many",
    "made","make","like","since","according","including","told","say","says",
    "first","last","day","days","week","weeks","month","months","today","yesterday","tomorrow",
    "time","times","report","reports","reported","yonhap","reuters","ap",
    "people","company","companies","government","official","officials",
    "article","news","daily","kim","park","lee","cho","jang","chung","moon","yoon"
}
# Combined set used both when cleaning text and by the CountVectorizer.
all_stop_words = set(ENGLISH_STOP_WORDS).union(NEWS_STOPWORDS_EXTRA)

# ---------- Device ----------
def pick_device_for_st():
    """Return the device string for SentenceTransformer: "cuda" or "cpu".

    Returns:
        "cuda" when PyTorch imports and reports an available CUDA device,
        otherwise "cpu" (including when PyTorch cannot be imported or the
        CUDA probe itself fails).
    """
    try:
        import torch
    except Exception:
        # PyTorch is missing or failed to load: fall back to the CPU.
        # ``except Exception`` (not a bare ``except:``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return "cpu"
    try:
        has_cuda = torch.cuda.is_available()
    except Exception:
        # Best-effort probe: a failing CUDA check also means "use the CPU".
        return "cpu"
    return "cuda" if has_cuda else "cpu"

# Resolve the device once; the embedding model and encode_texts both read it.
DEVICE = pick_device_for_st()
print(f"[INFO] Using device: {DEVICE}")

# ---------- Preprocessing (spacy lemmatizer) ----------
import subprocess
import sys

import spacy

# Only lemmas are needed downstream, so NER and the parser are disabled.
try:
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
except OSError:
    # spaCy raises OSError when the model package is not installed; a bare
    # ``except:`` would also have swallowed KeyboardInterrupt. Download the
    # model with the interpreter that is running this code (a bare "python"
    # on PATH may belong to a different environment) and fail loudly if the
    # download itself fails.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def clean_text_lemma(t: str, stopwords: set) -> str:
    """Normalize a raw text and return its filtered lemmas joined by spaces.

    The text is lower-cased, then URLs, standalone numbers, every character
    other than a-z / whitespace / hyphen, and runs of whitespace are each
    replaced by a single space. The result is run through the module-level
    spaCy pipeline ``nlp``.

    Args:
        t: Raw text; it is converted with ``str()`` first.
        stopwords: Lemmas to drop.

    Returns:
        Space-separated lemmas that are longer than two characters, are not
        in ``stopwords`` and come from purely alphabetic tokens.

    NOTE(review): the stopword test is applied to the lemma only, so a listed
    surface form whose lemma is not listed is kept — confirm this is intended.
    """
    substitutions = (
        r"(https?://\S+|www\.\S+)",  # URLs
        r"\b\d+\b",                  # standalone numbers
        r"[^a-z\s\-]",               # anything but a-z, whitespace, hyphen
        r"\s{2,}",                   # runs of whitespace
    )
    normalized = str(t).lower()
    for regex in substitutions:
        normalized = re.sub(regex, " ", normalized)
    normalized = normalized.strip()

    def keep(token):
        lemma = token.lemma_
        return len(lemma) > 2 and lemma not in stopwords and token.is_alpha

    return " ".join(token.lemma_ for token in nlp(normalized) if keep(token))

# ---------- Embedding Model ----------
# Load the sentence-embedding model once; encode_texts and the BERTopic model
# built in fit_evaluate share this instance.
embedding_model = SentenceTransformer(EMBEDDING_NAME, device=DEVICE)

def encode_texts(texts):
    """Embed ``texts`` with the module-level ``embedding_model``.

    Uses a batch size of 48 on CPU and 128 otherwise, shows a progress bar,
    and asks the model for NumPy output on ``DEVICE``.

    Args:
        texts: The documents to embed.

    Returns:
        Whatever ``embedding_model.encode`` returns with
        ``convert_to_numpy=True``.
    """
    if DEVICE == "cpu":
        batch = 48
    else:
        batch = 128
    options = dict(
        batch_size=batch,
        show_progress_bar=True,
        convert_to_numpy=True,
        device=DEVICE,
    )
    return embedding_model.encode(texts, **options)

# ---------- Topic Model Scoring ----------
def topic_diversity(topic_model, topn=10):
    """Return the share of unique words among the top words of all topics.

    Args:
        topic_model: Fitted model exposing ``get_topic_info()`` (with a
            "Topic" column) and ``get_topic(topic_id)`` returning
            ``(word, score)`` pairs.
        topn: Number of leading words taken from each topic.

    Returns:
        ``unique words / collected words`` over all non-outlier topics
        (topic id -1 is skipped), or 0.0 when no word was collected.
    """
    info = topic_model.get_topic_info()
    topic_ids = [t for t in info["Topic"] if t != -1]
    words = []
    for t in topic_ids:
        reps = topic_model.get_topic(t)
        if reps:
            # Skip empty placeholder strings, if any, so they are not counted
            # as a word.
            words.extend(w for (w, _) in reps[:topn] if w)
    # Divide by the number of words actually collected rather than
    # n_topics * topn: a topic with fewer than ``topn`` words (or none) would
    # otherwise deflate the score, and topn == 0 would divide by zero.
    return len(set(words)) / len(words) if words else 0.0

def topic_cohesion(embeddings, labels):
    """Return the mean cosine similarity of documents to their topic centroid.

    Documents labelled -1 are ignored. For every remaining topic with more
    than one document, each document's embedding is compared with the mean
    embedding of that topic; all similarities are pooled and averaged.

    Args:
        embeddings: Array indexed by a boolean mask, one row per label.
        labels: Topic id per document (-1 marks an outlier).

    Returns:
        The pooled mean similarity, or 0.0 when fewer than two documents are
        assigned to a topic or no topic has more than one document.
    """
    label_arr = np.array(labels)
    clustered = label_arr != -1
    if clustered.sum() < 2:
        return 0.0

    vectors, assigned = embeddings[clustered], label_arr[clustered]

    pooled = []
    for topic_id in np.unique(assigned):
        members = vectors[assigned == topic_id]
        if len(members) > 1:
            center = members.mean(axis=0, keepdims=True)
            pooled.extend(cosine_similarity(members, center).ravel())

    if not pooled:
        return 0.0
    return np.mean(pooled)

def composite_score(model, emb, labels):
    """Combine topic diversity and topic cohesion into one weighted score.

    Args:
        model: Topic model passed to ``topic_diversity``.
        emb: Document embeddings passed to ``topic_cohesion``.
        labels: Topic id per document passed to ``topic_cohesion``.

    Returns:
        Tuple ``(0.6 * diversity + 0.4 * cohesion, diversity, cohesion)``.
    """
    diversity = topic_diversity(model)
    cohesion = topic_cohesion(emb, labels)
    weighted = 0.6 * diversity + 0.4 * cohesion
    return weighted, diversity, cohesion

# ---------- Fit-Evaluate ----------
def fit_evaluate(params, docs, emb):
    """Fit one BERTopic model on pre-computed embeddings and score it.

    Args:
        params: Dict with keys "n_gram_range", "n_neighbors", "min_dist"
            and "min_cluster_size".
        docs: Cleaned documents.
        emb: Embeddings for ``docs``, passed to ``fit_transform``.

    Returns:
        Tuple ``(score, diversity, cohesion, n_topics, model)`` where
        ``n_topics`` excludes the outlier topic -1. With fewer than five
        documents nothing is fitted and ``(-9999, 0, 0, 0, None)`` is returned.
    """
    if len(docs) < 5:
        return -9999, 0, 0, 0, None

    ngrams = params["n_gram_range"]

    # Components of the pipeline, each configured from ``params`` and the
    # module-level constants.
    components = {
        "vectorizer_model": CountVectorizer(
            stop_words=list(all_stop_words),
            ngram_range=ngrams,
            min_df=MIN_DF_FIXED,
            max_df=MAX_DF_FIXED,
        ),
        "umap_model": UMAP(
            n_neighbors=params["n_neighbors"],
            min_dist=params["min_dist"],
            n_components=10,
            metric="cosine",
            random_state=SEED,
        ),
        "hdbscan_model": HDBSCAN(
            min_cluster_size=params["min_cluster_size"],
            min_samples=5,
            metric="euclidean",
            cluster_selection_method="eom",
            prediction_data=True,
        ),
        "ctfidf_model": ClassTfidfTransformer(reduce_frequent_words=True),
    }

    model = BERTopic(
        n_gram_range=ngrams,
        nr_topics=NR_TOPICS_FIXED,
        embedding_model=embedding_model,
        calculate_probabilities=True,
        verbose=False,
        **components,
    )

    topics, _ = model.fit_transform(docs, emb)
    score, td, coh = composite_score(model, emb, topics)

    topic_column = model.get_topic_info()["Topic"]
    n_topics = sum(1 for t in topic_column if t != -1)

    return score, td, coh, n_topics, model


# ---------- File Processor ----------
import re

def process_file(file_path: str):
    """Run the topic-model pipeline on one translated CSV and save the results.

    Steps: read the CSV, parse newspaper and year from the file name, clean
    and lemmatize ``TEXT_COL``, drop documents shorter than
    ``MIN_TOKENS_AFTER_CLEAN`` tokens, embed the rest, fit BERTopic through
    ``fit_evaluate`` and write two files:

    * ``DATA_DIR/<stem>_topic_summary.csv`` — the model's topic info table;
    * ``EMB_DIR/<stem>_doc_topics_with_emb.csv`` — one row per document with
      topic, label, probability, metadata and embedding columns.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        None. A file without ``TEXT_COL`` or with fewer than five usable
        documents is skipped with a printed message.
    """
    fname = os.path.basename(file_path)
    print(f"\n===== Processing {fname} =====")

    df = pd.read_csv(file_path, encoding="utf-8-sig")
    if TEXT_COL not in df.columns:
        print(f"[SKIP] No '{TEXT_COL}' column")
        return

    # ---------- Parse newspaper / year from the file name ----------
    # Example: "(영문번역 추가)_아사히신문_2022년도 데이터.csv"
    base_no_ext = os.path.splitext(fname)[0]

    # Newspaper name: the underscore-free segment directly before "_<YYYY>년도",
    # e.g. "(영문번역 추가)_아사히신문_2022년도 데이터" -> "아사히신문".
    m_np = re.search(r"_([^_]+)_\d{4}년도", base_no_ext)
    if m_np:
        newspaper = m_np.group(1)
    else:
        newspaper = "UNKNOWN"
        print(f"[WARN] Could not parse newspaper name from filename: {fname}")

    # Year: the four digits in "_2022년도", "_2023년도", ...
    m_year = re.search(r"_(\d{4})년도", base_no_ext)
    if m_year:
        year = int(m_year.group(1))
    else:
        year = None
        print(f"[WARN] Could not parse year from filename: {fname}")

    # ---------- Text preprocessing ----------
    df = df[df[TEXT_COL].notna()].copy()
    df["clean_text"] = df[TEXT_COL].map(lambda x: clean_text_lemma(x, all_stop_words))
    # .copy() so the column assignments below write to an independent frame
    # rather than to a filtered slice (avoids SettingWithCopyWarning).
    df = df[df["clean_text"].str.split().map(len) >= MIN_TOKENS_AFTER_CLEAN].copy()
    docs = df["clean_text"].tolist()
    if len(docs) < 5:
        print(f"[SKIP] Too few docs ({len(docs)})")
        return

    # ---------- Document embeddings ----------
    emb = encode_texts(docs)

    # ---------- Topic modeling ----------
    best_param = {"n_gram_range": (1,1), "n_neighbors": 10, "min_dist": 0.1, "min_cluster_size": 25}
    score, _, _, n_topics, model = fit_evaluate(best_param, docs, emb)
    print(f"[DONE] {fname} | Topics: {n_topics} | Score: {score:.4f}")

    # Reuse the assignments of the fit performed inside fit_evaluate instead
    # of fitting the same model a second time: it halves the work and keeps
    # the saved topics consistent with the score printed above.
    topics = model.topics_
    probs = model.probabilities_

    # Build each topic's label once instead of once per document.
    topic_labels = {-1: "Outlier"}
    for t in set(topics):
        if t != -1:
            reps = model.get_topic(t) or []
            topic_labels[t] = ", ".join(w for (w, _) in reps[:10])

    df["topic"] = topics
    df["topic_label"] = [topic_labels[t] for t in topics]
    if probs is None:
        df["topic_prob"] = np.nan
    else:
        # Highest topic probability per document; NaN when none is available.
        df["topic_prob"] = [
            float(np.nanmax(p)) if p is not None and len(p) > 0 else np.nan
            for p in probs
        ]

    # ---------- Metadata ----------
    df["newspaper"] = newspaper
    df["year"] = year

    # ---------- Attach the embedding as columns ----------
    # Both frames are re-indexed from 0 so the column-wise concat aligns
    # row by row.
    emb_dim = emb.shape[1]
    emb_cols = [f"emb_{i}" for i in range(emb_dim)]
    emb_df = pd.DataFrame(emb, columns=emb_cols).reset_index(drop=True)
    df = df.reset_index(drop=True)
    df_with_emb = pd.concat([df, emb_df], axis=1)

    # ---------- Save ----------
    # The output folder may not exist yet on a fresh checkout.
    os.makedirs(EMB_DIR, exist_ok=True)
    base_for_save = os.path.splitext(fname)[0]   # file name without extension
    topic_info = model.get_topic_info()
    topic_info.to_csv(os.path.join(DATA_DIR, f"{base_for_save}_topic_summary.csv"), index=False)

    df_with_emb.to_csv(os.path.join(EMB_DIR, f"{base_for_save}_doc_topics_with_emb.csv"), index=False)

    # Release the large per-file objects before the next file is processed.
    del df, emb, model, df_with_emb
    gc.collect()

# ---------- MAIN ----------
if __name__ == "__main__":
    # Process every "*데이터.csv" file in DATA_DIR in sorted order. A failure
    # in one file is reported with its traceback and does not stop the others.
    files = glob.glob(os.path.join(DATA_DIR, "*데이터.csv"))
    files.sort()
    for f in files:
        try:
            process_file(f)
        except Exception as err:
            print(f"[ERROR] in {f}: {err}")
            traceback.print_exc()


[INFO] Using device: cuda

===== Processing (영문번역 추가)_아사히신문_2022년도 데이터.csv =====


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.50it/s]


[DONE] (영문번역 추가)_아사히신문_2022년도 데이터.csv | Topics: 2 | Score: 0.9301

===== Processing (영문번역 추가)_아사히신문_2023년도 데이터.csv =====


Batches: 100%|██████████| 5/5 [00:02<00:00,  1.71it/s]


[DONE] (영문번역 추가)_아사히신문_2023년도 데이터.csv | Topics: 3 | Score: 0.9307

===== Processing (영문번역 추가)_요미우리신문_2022년도 데이터.csv =====


Batches: 100%|██████████| 5/5 [00:02<00:00,  1.82it/s]


[DONE] (영문번역 추가)_요미우리신문_2022년도 데이터.csv | Topics: 3 | Score: 0.9316

===== Processing (영문번역 추가)_요미우리신문_2024년도 데이터.csv =====


Batches: 100%|██████████| 5/5 [00:02<00:00,  1.72it/s]


[DONE] (영문번역 추가)_요미우리신문_2024년도 데이터.csv | Topics: 4 | Score: 0.9175

===== Processing 20251112_중복기사 삭제완료_(영문번역 추가)_아사히신문_2024년도 데이터.csv =====


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.40it/s]


[DONE] 20251112_중복기사 삭제완료_(영문번역 추가)_아사히신문_2024년도 데이터.csv | Topics: 4 | Score: 0.9327

===== Processing 20251112_중복기사 삭제완료_(영문번역 추가)_요미우리신문_2023년도 데이터.csv =====


Batches: 100%|██████████| 5/5 [00:03<00:00,  1.61it/s]


[DONE] 20251112_중복기사 삭제완료_(영문번역 추가)_요미우리신문_2023년도 데이터.csv | Topics: 4 | Score: 0.9334
