### 한국

In [1]:
import os, re, json, glob, time, hashlib
import pandas as pd
import ollama

# ===========================
# USER SETTINGS — HERE ONLY!
# ===========================
DATA_FOLDER = r"E:/Data_for_Practice/JapMedia/data/kor_data/"
FILE_PATTERN = "*topic_summary.csv"
MODEL_NAME   = "llama4"
TEMP         = 0.5
NPRED        = 120  # enough for JSON

# ---------------------------
# TOKENIZATION + CLEANING
# ---------------------------
_WORD_SPLIT_RE = re.compile(r"[\s\-/_,]+", re.UNICODE)
_PUNCT_TAIL_RE = re.compile(r"[\s\-–—:;,!.?\"'`]+$")

def clean_label(text: str, max_words: int = 7) -> str:
    if not text:
        return ""
    text = text.strip().strip('"').strip("'").strip("`")
    text = _PUNCT_TAIL_RE.sub("", text)
    text = re.sub(r"\s+", " ", text)
    toks = [t for t in _WORD_SPLIT_RE.split(text) if t][:max_words]

    def smart_cap(w):
        return w if (w.isupper() and len(w) <= 5) else w.capitalize()

    return " ".join(smart_cap(w) for w in toks)

# ---------------------------
# DOMAIN LEXICONS
# ---------------------------
GENERIC_FORBIDDEN = set("""issue problem topic situation things general policy measure project plan""".split())

SECTORS = set("""
power electricity grid utilities industry heavy industry cement steel chemicals
buildings housing heating cooling transport shipping aviation agriculture forestry fisheries finance
""".split())

TECH = set("""
renewables solar pv wind hydro nuclear smr battery storage heatpump ev charging hydrogen ammonia
ccus ccs cdr dac bioenergy biofuel retrofit electrification interconnection smartmeter smartgrid
""".split())

POLICY = set("""
cbam ets rps re100 esg taxonomy carbon tax carbon pricing feed-in tariff offset credits
scope1 scope2 scope3 tcfd csrd issb esrs pcf disclosure subsidy subsidies
""".split())

GAS = set("""
co2 ch4 methane n2o nox so2 pm25 pm2.5 pm10 black carbon sf6
""".split())

GEO = set("""
korea korean seoul busan incheon japan china taiwan india asean eu europe germany france italy spain uk usa u.s. america canada
""".split())

YEAR_RE = re.compile(r"\b(20[1-5]\d)\b")

# ---------------------------
# ENTITY EXTRACTION
# ---------------------------
def extract_entities(text: str):
    toks = [t.lower() for t in _WORD_SPLIT_RE.split(text) if t]
    ent = {
        "policy": [t for t in toks if t in POLICY][:3],
        "sector": [t for t in toks if t in SECTORS][:3],
        "tech":   [t for t in toks if t in TECH][:3],
        "gas":    [t for t in toks if t in GAS][:3],
        "geo":    [t for t in toks if t in GEO][:3],
        "years":  YEAR_RE.findall(text)[:3],
    }
    return ent

# ---------------------------
# LABEL QUALITY CHECKS
# ---------------------------
def is_too_generic(label: str) -> bool:
    if not label:
        return True
    toks = [t.lower() for t in _WORD_SPLIT_RE.split(label) if t]
    if len(toks) < 2:
        return True
    if any(t in GENERIC_FORBIDDEN for t in toks):
        return True
    return False

def lacks_specificity(label: str, ents: dict) -> bool:
    if not label: 
        return True
    l = label.lower()
    groups = ["policy","sector","tech","gas","geo","years"]
    hit = 0
    for g in groups:
        for t in ents[g]:
            if str(t).lower() in l:
                hit += 1; break
    return hit < 2  # 최소 두 그룹 언급되어야 충분히 구체적

# ---------------------------
# ENTITY ENRICHMENT / FALLBACK
# ---------------------------
def enrich_label_with_entities(label: str, ents: dict, max_words=7) -> str:
    base = label or "Climate Policy"
    piece = []
    if ents["policy"]:
        p = ents["policy"][0]
        piece.append(p.upper() if p in {"cbam","ets","rps","re100","csrd","issb","esrs"} else p.title())
    if ents["sector"]:
        piece.append(ents["sector"][0].title())
    if ents["tech"]:
        t = ents["tech"][0]
        piece.append(t.upper() if t in {"ccus","ccs","cdr","dac","smr"} else t.title())
    extra = ents["gas"] or ents["geo"] or ents["years"]
    if extra:
        x = extra[0]
        x = x.upper() if x.lower() in {"co2","ch4","pm2.5","pm10","nox","so2"} else x.title()
        piece.append(x)
    add = " ".join(piece[:3])
    if add and add.lower() not in base.lower():
        base = f"{base} – {add}"
    return clean_label(base, max_words)

def synthesize_label_from_entities(ents: dict) -> str:
    part = []
    if ents["policy"]:
        p = ents["policy"][0]
        part.append(p.upper() if p in {"cbam","ets","rps","re100"} else p.title())
    if ents["sector"]:
        part.append(ents["sector"][0].title())
    if ents["tech"]:
        t = ents["tech"][0]
        part.append(t.upper() if t in {"ccus","ccs","cdr","dac","smr"} else t.title())
    extra = (ents["gas"] or ents["geo"] or ents["years"])
    if extra:
        x = extra[0]
        part.append(x.upper() if x.lower() in {"co2","ch4","pm2.5","pm10"} else x.title())
    return clean_label(" ".join(part) if part else "Climate Policy Specifics", max_words=7)

# ---------------------------
# FRAME GUESS (table-based)
# ---------------------------
FRAME_KEYWORDS = {
    "Economic Costs/Benefits Frame": set("""
        subsidy subsidies tax credit credits reverse discrimination eligibility benefit benefits
        supply-chain supply chain export price prices electricity fuel insurance loss losses disaster damage damages
        competitiveness employment relocation ets carbon tax
    """.split()),

    "Technological Transition / Industrial Competition Frame": set("""
        ev battery batteries origin localization supply chain siting investment
        renewable renewables solar wind nuclear hydrogen
        ccs ccus smart grid smartgrid infrastructure storage
        industry transition
    """.split()),

    "Political Imbalance / Institutions & Geopolitics Frame": set("""
        protectionism norms diplomacy negotiation negotiations cop geopolitics partisan republican democrat
        japan korea korea-japan u.s. us trade conflict governance institution institutions regulation assembly
        summit letter letters retaliation dispute
    """.split()),

    "Climate Crisis Response & Justice Frame": set("""
        net-zero neutrality roadmap mitigation adaptation just transition justice youth citizen civic activism
        science expert experts fact-check fact-checking biodiversity disaster health model models risk risks
        lifestyle behaviour behavior
    """.split())
}
FRAME_ORDER = [
    "Technological Transition / Industrial Competition Frame",
    "Economic Costs/Benefits Frame",
    "Political Imbalance / Institutions & Geopolitics Frame",
    "Climate Crisis Response & Justice Frame"
]
def guess_frame_from_keywords_v2(text: str) -> str:
    toks = set(t.lower() for t in _WORD_SPLIT_RE.split(text) if t)
    scores = {f: len(toks & words) for f, words in FRAME_KEYWORDS.items()}
    return max(FRAME_ORDER, key=lambda f: scores[f])

# ---------------------------
# FRAME-AWARE PROMPT (표 반영) + AVOID LIST
# ---------------------------
def build_prompt_bestframe(keywords: str, previous_label: str = "", row_id=-1, avoid=None):
    avoid_text = ""
    if avoid:
        uniq = sorted({clean_label(a) for a in set(avoid) if a})[-30:]
        if uniq:
            avoid_text = "\nDo NOT reuse any of the following labels:\n- " + "\n- ".join(uniq)

    prev = (f"\nThe previous label was too generic: '{previous_label}'. Generate a more specific label."
            if previous_label else "")

    return f"""
ROW_ID: {row_id}

You must classify the topic described by the following keywords:
{keywords}

Pick ONE BEST-FITTING FRAME from the four below.
If multiple frames seem possible, choose the one MOST DIRECTLY connected to the core meaning of the keywords.

================ FRAME DEFINITIONS ================

1) Economic Costs/Benefits Frame
   - Costs/benefits of climate policy or IRA
   - subsidies, tax credits, reverse discrimination, eligibility, supply-chain disruptions, energy prices, fuel costs, disasters/losses, employment, competitiveness, carbon tax/ETS

2) Technological Transition / Industrial Competition Frame
   - Energy/industrial transition, technological innovation, investment/siting, supply chains
   - EV/battery rules of origin, localization, green investment expansion, renewables, nuclear mix, CCS, hydrogen, smart grid, grid/storage infrastructure

3) Political Imbalance / Institutions & Geopolitics Frame
   - Trade, diplomacy, institutional conflict, governance issues
   - protectionism, reverse discrimination, Japan–Korea/U.S.–Korea responses, U.S. partisan conflict, domestic politics & regulation, COP negotiations, local government conflicts

4) Climate Crisis Response & Justice Frame
   - Carbon neutrality, state intervention, corporate leadership, civic action, justice
   - net-zero roadmaps, regulation/subsidy needs, corporate strategies, youth justice action, scientific evidence/models, physical impacts (disaster/health/biodiversity), adaptation, just transition

===================================================

LABEL REQUIREMENTS:
- Provide a SPECIFIC noun phrase (≤ 7 words, Title Case, no period)
- Avoid generic terms (issue, topic, problem, situation)
- Avoid any labels listed in {avoid_text}

OUTPUT ONLY ONE LINE OF MINIFIED JSON:
{{"frame":"...","label":"...","confidence":0.0}}

{prev}
""".strip()

# ---------------------------
# MODEL CALL (JSON MODE)
# ---------------------------
def _seed_from_text(text: str, row_idx: int = 0, file_name: str = "") -> int:
    base = f"{text}|{row_idx}|{file_name}"
    h = int(hashlib.md5(base.encode("utf-8")).hexdigest()[:8], 16)
    return h & 0x7fffffff

def ask_ollama_json(prompt: str, keywords: str, row_idx: int = 0, file_name: str = ""):
    seed = _seed_from_text(keywords, row_idx=row_idx, file_name=file_name)
    r = ollama.chat(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": "Return ONLY valid minified JSON, no prose."},
            {"role": "user", "content": prompt},
        ],
        options={
            "temperature": TEMP,
            "num_predict": NPRED,
            "seed": seed,
            "format": "json",
            "top_p": 0.9
        }
    )
    return json.loads(r["message"]["content"])

# ---------------------------
# GLOBAL DE-DUP (final guard)
# ---------------------------
def ensure_unique_label(label: str, ents: dict, used: set, keywords: str, max_words=7) -> str:
    cand = clean_label(label, max_words)
    if cand and cand not in used:
        used.add(cand); return cand

    c1 = enrich_label_with_entities(cand, ents, max_words)
    if c1 and c1 not in used:
        used.add(c1); return c1

    c2 = synthesize_label_from_entities(ents)
    if c2 and c2 not in used:
        used.add(c2); return c2

    suf = hashlib.md5(keywords.encode("utf-8")).hexdigest()[:4].upper()
    c3 = clean_label(f"{cand} {suf}", max_words+1)
    used.add(c3); return c3

# ---------------------------
# MAIN EXECUTION
# ---------------------------
def run_bestframe_specific():
    files = sorted(glob.glob(os.path.join(DATA_FOLDER, FILE_PATTERN)))
    if not files:
        raise FileNotFoundError(f"No files found in {DATA_FOLDER} matching {FILE_PATTERN}")

    out, used_labels = [], set()   # 전역 중복 추적

    for file in files:
        print(f"\n[Processing] {os.path.basename(file)}")
        df = pd.read_csv(file, encoding="utf-8-sig")

        rep_col = "Representation" if "Representation" in df.columns else (
                  "Represenation" if "Represenation" in df.columns else None)
        if rep_col is None:
            raise KeyError(f"No Representation column in {file}")

        for i, row in df.iterrows():
            keywords = str(row[rep_col]).strip()
            ents = extract_entities(keywords)

            frame, label, conf = "", "", ""
            used_model = False

            try:
                prompt = build_prompt_bestframe(keywords, row_id=i+1, avoid=used_labels)
                data = ask_ollama_json(prompt, keywords, row_idx=i+1, file_name=os.path.basename(file))
                frame = data.get("frame","").strip()
                label = clean_label(data.get("label",""))
                conf  = data.get("confidence","")
                used_model = True
            except Exception as e:
                print(f"   [warn] model fail row {i+1}: {e}")
                frame = guess_frame_from_keywords_v2(keywords)

            if (not label) or is_too_generic(label) or lacks_specificity(label, ents) or (label in used_labels):
                try:
                    prompt2 = build_prompt_bestframe(keywords, previous_label=label, row_id=f"{i+1}-retry", avoid=used_labels)
                    data2 = ask_ollama_json(prompt2, keywords, row_idx=i+1, file_name=os.path.basename(file))
                    frame = data2.get("frame", frame).strip()
                    label2 = clean_label(data2.get("label",""))
                    if label2:
                        label = label2
                        conf = data2.get("confidence","")
                        used_model = True
                except Exception as e:
                    print(f"   [warn] retry fail row {i+1}: {e}")

            # 전역 중복 방지 최종 단계
            label = ensure_unique_label(label, ents, used_labels, keywords)

            src = "MODEL" if used_model else "FALLBACK"
            print(f"  - {i+1}: [{frame}] {label} <{src}>")

            out.append({
                "file": os.path.basename(file),
                "Topic": row.get("Topic", ""),
                "Count": row.get("Count", ""),
                "Representation": keywords,
                "best_frame": frame,
                "best_frame_label": label,
                "confidence": conf,
                "source": src
            })

    return pd.DataFrame(out)

# ---------------------------
# RUN & SAVE
# ---------------------------
df_labels = run_bestframe_specific()
out_path = os.path.join(DATA_FOLDER, "bestframe_labels_specific_KR.csv")
df_labels.to_csv(out_path, encoding="utf-8-sig", index=False)
print(f"\n[SAVED] {out_path}")
df_labels.head()


[Processing] 조선일보_2022년도 데이터_translated_gpt_topic_summary.csv
  - 1: [Climate Crisis Response & Justice Frame] Agricultural Sustainability <MODEL>
  - 2: [Technological Transition] EV Battery Industry Competition <MODEL>
  - 3: [Climate Crisis Response & Justice Frame] Extreme Weather Event Mitigation <MODEL>
  - 4: [Political Imbalance / Institutions & Geopolitics] Taiwan Diplomatic Relations <MODEL>
  - 5: [Technological Transition] Small Modular Nuclear Reactor Deployment <MODEL>

[Processing] 조선일보_2023년도 데이터_translated_gpt_topic_summary.csv
  - 1: [Economic Costs/Benefits Frame] Global Grain Price Volatility <MODEL>
  - 2: [Climate Crisis Response & Justice Frame] Weather Disaster Resilience <MODEL>
  - 3: [Political Imbalance / Institutions & Geopolitics] Indo Saudi Presidential Summit Security <MODEL>
  - 4: [Technological Transition] Renewable Energy Industrial Transition <MODEL>
  - 5: [Climate Crisis Response & Justice Frame] Wildfire Disaster Response <MODEL>

[Processing] 조

Unnamed: 0,file,Topic,Count,Representation,best_frame,best_frame_label,confidence,source
0,조선일보_2022년도 데이터_translated_gpt_topic_summary.csv,-1,48,"['gene', 'seed', 'population', 'travel', 'grai...",Climate Crisis Response & Justice Frame,Agricultural Sustainability,0.8,MODEL
1,조선일보_2022년도 데이터_translated_gpt_topic_summary.csv,0,59,"['esg', 'etfs', 'battery', 'republican', 'stoc...",Technological Transition,EV Battery Industry Competition,0.9,MODEL
2,조선일보_2022년도 데이터_translated_gpt_topic_summary.csv,1,111,"['rain', 'flood', 'rainfall', 'river', 'heat',...",Climate Crisis Response & Justice Frame,Extreme Weather Event Mitigation,0.9,MODEL
3,조선일보_2022년도 데이터_translated_gpt_topic_summary.csv,2,93,"['pacific', 'relation', 'peace', 'diplomatic',...",Political Imbalance / Institutions & Geopolitics,Taiwan Diplomatic Relations,0.9,MODEL
4,조선일보_2022년도 데이터_translated_gpt_topic_summary.csv,3,25,"['taxonomy', 'nuclearization', 'radioactive', ...",Technological Transition,Small Modular Nuclear Reactor Deployment,0.8,MODEL


### 일본

In [2]:
#############################################
# BEST FRAME + SPECIFIC LABEL GENERATOR
# Jupyter Notebook – De-dup + Frame-aware, Full
#############################################

import os, re, json, glob, time, hashlib
import pandas as pd
import ollama

# ===========================
# USER SETTINGS — EDIT HERE
# ===========================
DATA_FOLDER = r"E:/Data_for_Practice/JapMedia/data/jap_data/translated/"
FILE_PATTERN = "*topic_summary.csv"
MODEL_NAME   = "llama4"   # ← 로컬에 해당 태그가 있어야 합니다 (ollama list로 확인)
TEMP         = 0.5
NPRED        = 120        # JSON 출력 충분히 확보

# ---------------------------
# TOKENIZATION + CLEANING
# ---------------------------
_WORD_SPLIT_RE = re.compile(r"[\s\-/_,]+", re.UNICODE)
_PUNCT_TAIL_RE = re.compile(r"[\s\-–—:;,!.?\"'`]+$")

def clean_label(text: str, max_words: int = 7) -> str:
    if not text:
        return ""
    text = text.strip().strip('"').strip("'").strip("`")
    text = _PUNCT_TAIL_RE.sub("", text)
    text = re.sub(r"\s+", " ", text)
    toks = [t for t in _WORD_SPLIT_RE.split(text) if t][:max_words]

    def smart_cap(w):
        return w if (w.isupper() and len(w) <= 5) else w.capitalize()

    return " ".join(smart_cap(w) for w in toks)

# ---------------------------
# DOMAIN LEXICONS
# ---------------------------
GENERIC_FORBIDDEN = set("""issue problem topic situation things general policy measure project plan""".split())

SECTORS = set("""
power electricity grid utilities industry heavy industry cement steel chemicals
buildings housing heating cooling transport shipping aviation agriculture forestry fisheries finance
""".split())

TECH = set("""
renewables solar pv wind hydro nuclear smr battery storage heatpump ev charging hydrogen ammonia
ccus ccs cdr dac bioenergy biofuel retrofit electrification interconnection smartmeter smartgrid
""".split())

POLICY = set("""
cbam ets rps re100 esg taxonomy carbon tax carbon pricing feed-in tariff offset credits
scope1 scope2 scope3 tcfd csrd issb esrs pcf disclosure subsidy subsidies
""".split())

GAS = set("""
co2 ch4 methane n2o nox so2 pm25 pm2.5 pm10 black carbon sf6
""".split())

GEO = set("""
korea korean seoul busan incheon japan china taiwan india asean eu europe germany france italy spain uk usa u.s. america canada
""".split())

YEAR_RE = re.compile(r"\b(20[1-5]\d)\b")

# ---------------------------
# ENTITY EXTRACTION
# ---------------------------
def extract_entities(text: str):
    toks = [t.lower() for t in _WORD_SPLIT_RE.split(text) if t]
    ent = {
        "policy": [t for t in toks if t in POLICY][:3],
        "sector": [t for t in toks if t in SECTORS][:3],
        "tech":   [t for t in toks if t in TECH][:3],
        "gas":    [t for t in toks if t in GAS][:3],
        "geo":    [t for t in toks if t in GEO][:3],
        "years":  YEAR_RE.findall(text)[:3],
    }
    return ent

# ---------------------------
# LABEL QUALITY CHECKS
# ---------------------------
def is_too_generic(label: str) -> bool:
    if not label:
        return True
    toks = [t.lower() for t in _WORD_SPLIT_RE.split(label) if t]
    if len(toks) < 2:
        return True
    if any(t in GENERIC_FORBIDDEN for t in toks):
        return True
    return False

def lacks_specificity(label: str, ents: dict) -> bool:
    if not label: 
        return True
    l = label.lower()
    groups = ["policy","sector","tech","gas","geo","years"]
    hit = 0
    for g in groups:
        for t in ents[g]:
            if str(t).lower() in l:
                hit += 1; break
    return hit < 2  # 최소 두 그룹 언급되어야 충분히 구체적

# ---------------------------
# ENTITY ENRICHMENT / FALLBACK
# ---------------------------
def enrich_label_with_entities(label: str, ents: dict, max_words=7) -> str:
    base = label or "Climate Policy"
    piece = []
    if ents["policy"]:
        p = ents["policy"][0]
        piece.append(p.upper() if p in {"cbam","ets","rps","re100","csrd","issb","esrs"} else p.title())
    if ents["sector"]:
        piece.append(ents["sector"][0].title())
    if ents["tech"]:
        t = ents["tech"][0]
        piece.append(t.upper() if t in {"ccus","ccs","cdr","dac","smr"} else t.title())
    extra = ents["gas"] or ents["geo"] or ents["years"]
    if extra:
        x = extra[0]
        x = x.upper() if x.lower() in {"co2","ch4","pm2.5","pm10","nox","so2"} else x.title()
        piece.append(x)
    add = " ".join(piece[:3])
    if add and add.lower() not in base.lower():
        base = f"{base} – {add}"
    return clean_label(base, max_words)

def synthesize_label_from_entities(ents: dict) -> str:
    part = []
    if ents["policy"]:
        p = ents["policy"][0]
        part.append(p.upper() if p in {"cbam","ets","rps","re100"} else p.title())
    if ents["sector"]:
        part.append(ents["sector"][0].title())
    if ents["tech"]:
        t = ents["tech"][0]
        part.append(t.upper() if t in {"ccus","ccs","cdr","dac","smr"} else t.title())
    extra = (ents["gas"] or ents["geo"] or ents["years"])
    if extra:
        x = extra[0]
        part.append(x.upper() if x.lower() in {"co2","ch4","pm2.5","pm10"} else x.title())
    return clean_label(" ".join(part) if part else "Climate Policy Specifics", max_words=7)

# ---------------------------
# FRAME GUESS (table-based)
# ---------------------------
FRAME_KEYWORDS = {
    "Economic Costs/Benefits Frame": set("""
        subsidy subsidies tax credit credits reverse discrimination eligibility benefit benefits
        supply-chain supply chain export price prices electricity fuel insurance loss losses disaster damage damages
        competitiveness employment relocation ets carbon tax
    """.split()),

    "Technological Transition / Industrial Competition Frame": set("""
        ev battery batteries origin localization supply chain siting investment
        renewable renewables solar wind nuclear hydrogen
        ccs ccus smart grid smartgrid infrastructure storage
        industry transition
    """.split()),

    "Political Imbalance / Institutions & Geopolitics Frame": set("""
        protectionism norms diplomacy negotiation negotiations cop geopolitics partisan republican democrat
        japan korea korea-japan u.s. us trade conflict governance institution institutions regulation assembly
        summit letter letters retaliation dispute
    """.split()),

    "Climate Crisis Response & Justice Frame": set("""
        net-zero neutrality roadmap mitigation adaptation just transition justice youth citizen civic activism
        science expert experts fact-check fact-checking biodiversity disaster health model models risk risks
        lifestyle behaviour behavior
    """.split())
}
FRAME_ORDER = [
    "Technological Transition / Industrial Competition Frame",
    "Economic Costs/Benefits Frame",
    "Political Imbalance / Institutions & Geopolitics Frame",
    "Climate Crisis Response & Justice Frame"
]
def guess_frame_from_keywords_v2(text: str) -> str:
    toks = set(t.lower() for t in _WORD_SPLIT_RE.split(text) if t)
    scores = {f: len(toks & words) for f, words in FRAME_KEYWORDS.items()}
    return max(FRAME_ORDER, key=lambda f: scores[f])

# ---------------------------
# FRAME-AWARE PROMPT (표 반영) + AVOID LIST
# ---------------------------
def build_prompt_bestframe(keywords: str, previous_label: str = "", row_id=-1, avoid=None):
    avoid_text = ""
    if avoid:
        uniq = sorted({clean_label(a) for a in set(avoid) if a})[-30:]
        if uniq:
            avoid_text = "\nDo NOT reuse any of the following labels:\n- " + "\n- ".join(uniq)

    prev = (f"\nThe previous label was too generic: '{previous_label}'. Generate a more specific label."
            if previous_label else "")

    return f"""
ROW_ID: {row_id}

You must classify the topic described by the following keywords:
{keywords}

Pick ONE BEST-FITTING FRAME from the four below.
If multiple frames seem possible, choose the one MOST DIRECTLY connected to the core meaning of the keywords.

================ FRAME DEFINITIONS ================

1) Economic Costs/Benefits Frame
   - Costs/benefits of climate policy or IRA
   - subsidies, tax credits, reverse discrimination, eligibility, supply-chain disruptions, energy prices, fuel costs, disasters/losses, employment, competitiveness, carbon tax/ETS

2) Technological Transition / Industrial Competition Frame
   - Energy/industrial transition, technological innovation, investment/siting, supply chains
   - EV/battery rules of origin, localization, green investment expansion, renewables, nuclear mix, CCS, hydrogen, smart grid, grid/storage infrastructure

3) Political Imbalance / Institutions & Geopolitics Frame
   - Trade, diplomacy, institutional conflict, governance issues
   - protectionism, reverse discrimination, Japan–Korea/U.S.–Korea responses, U.S. partisan conflict, domestic politics & regulation, COP negotiations, local government conflicts

4) Climate Crisis Response & Justice Frame
   - Carbon neutrality, state intervention, corporate leadership, civic action, justice
   - net-zero roadmaps, regulation/subsidy needs, corporate strategies, youth justice action, scientific evidence/models, physical impacts (disaster/health/biodiversity), adaptation, just transition

===================================================

LABEL REQUIREMENTS:
- Provide a SPECIFIC noun phrase (≤ 7 words, Title Case, no period)
- Avoid generic terms (issue, topic, problem, situation)
- Avoid any labels listed in {avoid_text}

OUTPUT ONLY ONE LINE OF MINIFIED JSON:
{{"frame":"...","label":"...","confidence":0.0}}

{prev}
""".strip()

# ---------------------------
# MODEL CALL (JSON MODE)
# ---------------------------
def _seed_from_text(text: str, row_idx: int = 0, file_name: str = "") -> int:
    base = f"{text}|{row_idx}|{file_name}"
    h = int(hashlib.md5(base.encode("utf-8")).hexdigest()[:8], 16)
    return h & 0x7fffffff

def ask_ollama_json(prompt: str, keywords: str, row_idx: int = 0, file_name: str = ""):
    seed = _seed_from_text(keywords, row_idx=row_idx, file_name=file_name)
    r = ollama.chat(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": "Return ONLY valid minified JSON, no prose."},
            {"role": "user", "content": prompt},
        ],
        options={
            "temperature": TEMP,
            "num_predict": NPRED,
            "seed": seed,
            "format": "json",
            "top_p": 0.9
        }
    )
    return json.loads(r["message"]["content"])

# ---------------------------
# GLOBAL DE-DUP (final guard)
# ---------------------------
def ensure_unique_label(label: str, ents: dict, used: set, keywords: str, max_words=7) -> str:
    cand = clean_label(label, max_words)
    if cand and cand not in used:
        used.add(cand); return cand

    c1 = enrich_label_with_entities(cand, ents, max_words)
    if c1 and c1 not in used:
        used.add(c1); return c1

    c2 = synthesize_label_from_entities(ents)
    if c2 and c2 not in used:
        used.add(c2); return c2

    suf = hashlib.md5(keywords.encode("utf-8")).hexdigest()[:4].upper()
    c3 = clean_label(f"{cand} {suf}", max_words+1)
    used.add(c3); return c3

# ---------------------------
# MAIN EXECUTION
# ---------------------------
def run_bestframe_specific():
    files = sorted(glob.glob(os.path.join(DATA_FOLDER, FILE_PATTERN)))
    if not files:
        raise FileNotFoundError(f"No files found in {DATA_FOLDER} matching {FILE_PATTERN}")

    out, used_labels = [], set()   # 전역 중복 추적

    for file in files:
        print(f"\n[Processing] {os.path.basename(file)}")
        df = pd.read_csv(file, encoding="utf-8-sig")

        rep_col = "Representation" if "Representation" in df.columns else (
                  "Represenation" if "Represenation" in df.columns else None)
        if rep_col is None:
            raise KeyError(f"No Representation column in {file}")

        for i, row in df.iterrows():
            keywords = str(row[rep_col]).strip()
            ents = extract_entities(keywords)

            frame, label, conf = "", "", ""
            used_model = False

            try:
                prompt = build_prompt_bestframe(keywords, row_id=i+1, avoid=used_labels)
                data = ask_ollama_json(prompt, keywords, row_idx=i+1, file_name=os.path.basename(file))
                frame = data.get("frame","").strip()
                label = clean_label(data.get("label",""))
                conf  = data.get("confidence","")
                used_model = True
            except Exception as e:
                print(f"   [warn] model fail row {i+1}: {e}")
                frame = guess_frame_from_keywords_v2(keywords)

            if (not label) or is_too_generic(label) or lacks_specificity(label, ents) or (label in used_labels):
                try:
                    prompt2 = build_prompt_bestframe(keywords, previous_label=label, row_id=f"{i+1}-retry", avoid=used_labels)
                    data2 = ask_ollama_json(prompt2, keywords, row_idx=i+1, file_name=os.path.basename(file))
                    frame = data2.get("frame", frame).strip()
                    label2 = clean_label(data2.get("label",""))
                    if label2:
                        label = label2
                        conf = data2.get("confidence","")
                        used_model = True
                except Exception as e:
                    print(f"   [warn] retry fail row {i+1}: {e}")

            # 전역 중복 방지 최종 단계
            label = ensure_unique_label(label, ents, used_labels, keywords)

            src = "MODEL" if used_model else "FALLBACK"
            print(f"  - {i+1}: [{frame}] {label} <{src}>")

            out.append({
                "file": os.path.basename(file),
                "Topic": row.get("Topic", ""),
                "Count": row.get("Count", ""),
                "Representation": keywords,
                "best_frame": frame,
                "best_frame_label": label,
                "confidence": conf,
                "source": src
            })

    return pd.DataFrame(out)

# ---------------------------
# RUN & SAVE
# ---------------------------
df_labels = run_bestframe_specific()
out_path = os.path.join(DATA_FOLDER, "bestframe_labels_specific_JP.csv")
df_labels.to_csv(out_path, encoding="utf-8-sig", index=False)
print(f"\n[SAVED] {out_path}")
df_labels.head()


[Processing] (영문번역 추가)_아사히신문_2022년도 데이터_topic_summary.csv
  - 1: [Climate Crisis Response & Justice Frame] Royal Alpine Skiing Event <MODEL>
  - 2: [Political Imbalance / Institutions & Geopolitics] Taiwan China US Military Threat <MODEL>
  - 3: [Climate Crisis Response & Justice Frame] Agricultural Soil Carbon Sequestration <MODEL>

[Processing] (영문번역 추가)_아사히신문_2023년도 데이터_topic_summary.csv
  - 1: [Climate Crisis Response & Justice Frame] Hydroelectric Dam Flood Risk Management <MODEL>
  - 2: [Political Imbalance / Institutions & Geopolitics] US China Diplomatic Relations <MODEL>
  - 3: [Political Imbalance / Institutions & Geopolitics] Atomic Bomb Disarmament Policy <MODEL>
  - 4: [Climate Crisis Response & Justice Frame] Climate Change Mitigation Strategies <MODEL>

[Processing] (영문번역 추가)_요미우리신문_2022년도 데이터_topic_summary.csv
  - 1: [Climate Crisis Response & Justice Frame] Disaster Response Planning <MODEL>
  - 2: [Climate Crisis Response & Justice Frame] Net Zero Emissions Roadmap <

Unnamed: 0,file,Topic,Count,Representation,best_frame,best_frame_label,confidence,source
0,(영문번역 추가)_아사히신문_2022년도 데이터_topic_summary.csv,-1,40,"['queen', 'sapporo', 'bid', 'ioc', 'tourist', ...",Climate Crisis Response & Justice Frame,Royal Alpine Skiing Event,0.8,MODEL
1,(영문번역 추가)_아사히신문_2022년도 데이터_topic_summary.csv,0,115,"['capability', 'taiwan', 'vote', 'aircraft', '...",Political Imbalance / Institutions & Geopolitics,Taiwan China US Military Threat,0.9,MODEL
2,(영문번역 추가)_아사히신문_2022년도 데이터_topic_summary.csv,1,311,"['temperature', 'celsius', 'soil', 'heat', 'me...",Climate Crisis Response & Justice Frame,Agricultural Soil Carbon Sequestration,0.9,MODEL
3,(영문번역 추가)_아사히신문_2023년도 데이터_topic_summary.csv,-1,102,"['shareholder', 'river', 'dam', 'quantum', 'ga...",Climate Crisis Response & Justice Frame,Hydroelectric Dam Flood Risk Management,0.8,MODEL
4,(영문번역 추가)_아사히신문_2023년도 데이터_topic_summary.csv,0,31,"['yellen', 'kerry', 'treasury', 'lula', 'milei...",Political Imbalance / Institutions & Geopolitics,US China Diplomatic Relations,0.8,MODEL


In [None]:
### Old prompt
ROW_ID: {row_id}

You must classify the topic described by the following keywords:
{keywords}

Choose ONE BEST-FITTING FRAME from the four frames below.

================ FRAME DEFINITIONS ================

1) Economic Costs/Benefits Frame
   - Costs/benefits of climate policy or IRA
   - IRA: subsidies, tax credits, reverse discrimination, eligibility, supply-chain disruptions
   - General: energy prices, fuel costs, disasters/losses, employment, competitiveness, carbon tax/ETS

2) Technological Transition / Industrial Competition Frame
   - Energy/industrial transition, technological innovation, investment/siting, supply chains
   - IRA: EV/battery rules of origin, localization, green investment expansion
   - General: renewables, nuclear mix, CCS, hydrogen, smart grid, grid/storage infrastructure

3) Political Imbalance / Institutions & Geopolitics Frame
   - Trade, diplomacy, institutional conflict, governance issues
   - IRA: protectionism, reverse discrimination, Japan–Korea/U.S.–Korea responses, U.S. partisan conflict
   - General: domestic politics & regulation, COP negotiations, local government conflicts

4) Climate Crisis Response & Justice Frame
   - Carbon neutrality, state intervention, corporate leadership, civic action, justice
   - IRA: net-zero roadmaps, regulation/subsidy needs, corporate strategies, youth justice action
   - General: scientific evidence/models, physical impacts (disaster/health/biodiversity), adaptation, just transition

===================================================

LABEL REQUIREMENTS:
- Output a SPECIFIC noun phrase (≤ 7 words, Title Case, no period)
- MUST include at least ONE of:
    · policy/instrument (ETS, CBAM, Carbon Tax, Subsidy)
    · sector/technology (EV, Battery, CCS, Hydrogen, Renewables, Nuclear)
    · pollutant/gas (CO2, CH4, PM2.5)
    · geography (Korea, Japan, U.S., EU)
    · year (2030, 2050)
- Avoid generic words (issue, topic, problem, situation)
- Avoid labels already used
{avoid_text}

OUTPUT ONLY ONE LINE OF MINIFIED JSON:
{{"frame":"...","label":"...","confidence":0.0}}

{prev}
