In [None]:
# ===============================
# Stage 2 (Hybrid)
# ===============================


!pip -q install pandas numpy nltk pyarrow flashtext openai

from google.colab import drive; drive.mount('/content/drive', force_remount=True)

import os, re, json, unicodedata, hashlib, math, time
from pathlib import Path
from typing import Dict, Tuple, List, Set
import pandas as pd
import numpy as np
import nltk; nltk.download("punkt", quiet=True)
import nltk; nltk.download("punkt_tab", quiet=True)
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor
from openai import OpenAI
from google.colab import userdata

# ---------- Config ----------
# Input/output locations. The "<<Google Drive Folder>>" prefix is a placeholder
# that must be replaced with a real path before running.
COMPANIES_ROOT    = "<<Google Drive Folder>>/FORTUNE 500/"
# Keyword table; columns read: pillar, canonical_concept, keyword
UNIFIED_FILE      = "/unified_keywords.csv"  # pillar,canonical_concept,keyword
# Company → SIC division table; relative path, so it is looked up from the
# current working directory.
COMPANY_SIC_FILE  = "Fortune500_with_SIC.csv"
CACHE_FILE        = "<<Google Drive Folder>>/llm_quant_cache.parquet"  # persistent cache for quant/gran results

# Folder subset; "" = all (e.g., "001-030,050-200,201,205-208")
FOLDER_SELECT  = "001-500"

# Manifest filters on the doc_year / report_type columns; [] = no filtering
YEAR_LIST   = []             # e.g., [2022, 2023]
REPORT_TYPE = []             # e.g., ["AR","10K"]

# Resumable runs: if True, skip any doc whose Stage-2 JSON already exists
ONLY_MISSING = True

# LLM settings (only used for Quant/Granularity)
LLM_MODEL          = "gpt-3.5-turbo"
LLM_TEMPERATURE    = 0
LLM_MAX_OUT_TOKENS = 3_000
# PROMPT_VERSION and LLM_MODEL are both hashed into every cache key.
PROMPT_VERSION     = "v1.0"           # bump to invalidate cache safely

# Safety caps
MAX_SENT_PER_DOC   = 5000   # sentences kept per document (falsy value = no cap)
MIN_SENT_LEN       = 25     # sentences shorter than this many characters are dropped
MAX_SPAN_CHARS     = 320    # max characters of a span sent to the LLM / hashed for the cache
BATCH_MAX_ITEMS    = 80     # max spans per LLM request

# ---------- OpenAI ----------
# Read the API key from the Colab secret named "OpenAPI" and expose it through
# the environment so that OpenAI() can be constructed without arguments.
# NOTE(review): behaviour when the secret is missing is not visible here — confirm.
os.environ["OPENAI_API_KEY"] = userdata.get("OpenAPI")
client = OpenAI()

# ---------- Divisions (fallback) ----------
# (division code, division name) pairs.
DIVISIONS = [
    ("A","Agriculture, Forestry & Fishing"), ("B","Mining"), ("C","Construction"),
    ("D","Manufacturing"), ("E","Transportation & Public Utilities"),
    ("F","Wholesale Trade"), ("G","Retail Trade"),
    ("H","Finance, Insurance & Real Estate"), ("I","Services"), ("J","Public Administration"),
]
# Code → name lookup built from DIVISIONS.
# NOTE(review): DIV_MAP is not referenced anywhere else in the visible code.
DIV_MAP = {c:n for c,n in DIVISIONS}

# ---------- Regex / detectors ----------
# Leading footnote marker such as "1 ", "(2) ", "[3] " or a superscript digit.
FOOTNOTE_LEAD = re.compile(r"^\s*(?:\(?\d+\)?|\[\d+\]|[¹²³⁴⁵⁶⁷⁸⁹])\s+")
# Table-of-contents style line: dot leaders followed by a page number at the end.
TOC_LINE      = re.compile(r"\.{3,}\s*\d{1,4}$")
# Words that usually signal navigation / cross-references rather than content.
PAGINATION    = re.compile(r"\b(page|pp\.|p\.|appendix|annex|exhibit|figure|table|chapter|section|see page)\b", re.I)
# A four-digit year (19xx/20xx), a month abbreviation, "fy" or a quarter tag.
DATE_HINT     = re.compile(r"\b(19|20)\d{2}\b|\b(?:jan|feb|mar|apr|may|jun|jul|aug|sept?|oct|nov|dec|fy|q[1-4])\b", re.I)
# A number, optionally signed, with optional thousands separators / decimals.
NUM_PAT       = re.compile(r"[+-]?\d{1,3}(?:[,\s]\d{3})*(?:\.\d+)?|[+-]?\d+(?:\.\d+)?")
# Metric vocabulary (units, topics, currencies). Most entries are deliberately
# matched as substrings so that inflected forms hit. The very short unit tokens
# (t, kt/mt/gt, wh, kg, gj, mj) are only accepted when they are not embedded in
# a longer word: an unbounded "t" or "wh" matches nearly every English sentence
# ("the", "which", ...) and would make this detector always true.
METRIC_HINT   = re.compile(
    r"""(?ix)
    (tco2e?|co2e?|ghg|emission|scope\s*[123]|carbon|methane|ch4|
     kwh|mwh|gwh|energy|electricity|renewable|
     m3|m2|m²|lit(?:re|er)s?|water|withdrawals?|discharge|
     tonnes?|metric\s*tons?|waste|recycled|landfill|
     (?<![a-z])(?:[kmg]?t|wh|kg|gj|mj)(?![a-z])|
     incident[s]?|injur(?:y|ies)|fatalit(?:y|ies)|spills?|leaks?|
     fine[s]?|penalt(?:y|ies)|hours|training|turnover|absenteeism|
     diversity|board|pay|remuneration|audit|briber|ethic|
     usd|aud|eur|cad|sgd|yen|cny|inr|zar|mxn|brl|sek|nok|dkk|
     us\$|a\$|\$|£|¥|€)
    """
)
# Whole-word metric nouns. The stems "injur" / "penalt" are spelled out as
# injury/injuries and penalty/penalties because a bare stem followed by \b can
# never match a real word.
WINDOW_METRIC = re.compile(r"\b(?:rate|intensity|emissions?|energy|water|waste|injur(?:y|ies)|incident|fine|penalt(?:y|ies)|spent|invested|allocated|revenue|cost|capex)\b", re.I)
# Wording that signals a change, comparison or baseline.
SPECIFIC_PAT  = re.compile(
    r"""(?ix)\b(from|to|versus|vs\.?|compared|baseline|yoy|year[-\s]?on[-\s]?year|
       reduc(?:e|ed)|increas(?:e|ed)|improv(?:e|ed)|decreas(?:e|ed)|
       drop(?:ped)?|rose|grew|cut|achiev(?:e|ed))\b"""
)

def normalize_text(s: str) -> str:
    """Return a lower-cased, NFKC-normalised, whitespace-collapsed copy of *s*.

    Underscores are treated as spaces; leading/trailing whitespace is removed.
    """
    folded = unicodedata.normalize("NFKC", s.lower())
    without_underscores = folded.replace("_", " ")
    return re.sub(r"\s+", " ", without_underscores).strip()

def clean_for_quant(s: str) -> str:
    """Strip a leading footnote marker from *s* and collapse whitespace runs."""
    without_marker = FOOTNOTE_LEAD.sub("", s)
    return re.sub(r"\s+", " ", without_marker).strip()

# ---------- Keywords ----------
def load_concepts(path=UNIFIED_FILE):
    """Load the keyword table from *path*.

    Reads the columns pillar, canonical_concept and keyword, drops rows with
    missing values and strips surrounding whitespace from every cell.

    Returns:
        The cleaned DataFrame, or an empty DataFrame when the file is missing
        or cannot be read (an error line is printed in both cases).
    """
    wanted = ["pillar", "canonical_concept", "keyword"]
    try:
        table = pd.read_csv(path, usecols=wanted).dropna()
        for column in wanted:
            table[column] = table[column].astype(str).str.strip()
    except FileNotFoundError:
        print(f"[ERROR] Keyword file not found at: {path}. Please ensure Google Drive is mounted and the file exists.")
        return pd.DataFrame()
    except Exception as e:
        print(f"[ERROR] Failed reading keyword CSV '{path}': {e}. Please check file encoding and integrity. Ensure Google Drive is mounted.")
        return pd.DataFrame()
    return table


def build_keyword_processor(df_keywords: pd.DataFrame) -> Tuple[KeywordProcessor, Dict[str, Tuple[str,str,str]], Dict[str, Dict[str, Set[str]]]]:
    """Register every keyword (and simple spelling variants) with flashtext.

    Variants generated per normalised keyword: hyphens→spaces, spaces→hyphens,
    spaces removed, and a naive plural ("+s") for single words of at least four
    characters that do not already end in "s".

    Returns:
        (processor, keymap, concepts_index) where
        - processor maps each variant to itself,
        - keymap maps variant -> (pillar, concept, original keyword); when two
          rows produce the same variant the later row wins,
        - concepts_index maps pillar -> concept -> set of normalised keywords.
    """
    processor = KeywordProcessor(case_sensitive=False)
    variant_source = {}
    by_pillar: Dict[str, Dict[str, Set[str]]] = {}

    for record in df_keywords.to_dict("records"):
        pillar = str(record["pillar"]).strip()
        concept = str(record["canonical_concept"]).strip()
        kw = str(record["keyword"]).strip()
        if not (pillar and concept and kw):
            continue

        base = normalize_text(kw)
        spellings: Set[str] = {
            base,
            base.replace("-", " "),
            base.replace(" ", "-"),
            base.replace(" ", ""),  # occasional OCR joining
        }
        single_word = len(base.split()) == 1
        if single_word and len(base) >= 4 and not base.endswith("s"):
            spellings.add(base + "s")

        by_pillar.setdefault(pillar, {}).setdefault(concept, set()).add(base)

        for spelling in spellings:
            if spelling:
                processor.add_keyword(spelling, spelling)
                variant_source[spelling] = (pillar, concept, kw)

    return processor, variant_source, by_pillar

def empty_concept_state():
    """Return a fresh per-concept accumulator with zero counts and empty examples.

    Key order is kept stable because the dict is later written out as JSON.
    """
    buckets = ("qual", "qgen", "qspec")
    state = {"count": 0, "examples": []}
    state.update((f"{bucket}_examples", []) for bucket in buckets)
    state.update((f"n_{bucket}", 0) for bucket in buckets)
    state["labels"] = set()
    state["final_label"] = "None"
    return state

# ---------- Stage1 readers ----------
def _resolve(path_str: str, base_dir: Path) -> str:
    if not path_str or not isinstance(path_str, str):
        return ""
    p = Path(path_str)
    if p.is_absolute():
        return str(p)
    return str((base_dir / p).resolve())

def read_sentences(row):
    """Load the sentences of one manifest row.

    Tries the pre-split parquet at ``row["sentences_path"]`` (column "text")
    first; if that yields nothing, reads ``row["text_path"]`` as UTF-8 text and
    splits it with ``sent_tokenize``. Relative paths are resolved against
    ``row["__base_dir__"]``.

    Args:
        row: mapping with optional keys sentences_path, text_path, __base_dir__.

    Returns:
        List of stripped sentences with at least MIN_SENT_LEN characters,
        truncated to MAX_SENT_PER_DOC entries when that cap is non-zero.
    """
    # per-company Stage-1 directory the manifest paths are relative to;
    # tolerate a missing / non-string value instead of crashing in Path().
    base_raw = row.get("__base_dir__", "")
    base_dir = Path(base_raw) if isinstance(base_raw, str) else Path("")
    s = row.get("sentences_path")
    t = row.get("text_path")
    sents = []
    if s and isinstance(s, str):
        s_abs = _resolve(s, base_dir)
        if os.path.exists(s_abs):
            try:
                df = pd.read_parquet(s_abs, columns=["text"])
                sents = [str(x).strip() for x in df["text"].tolist()]
            except Exception as e:
                # best effort: fall through to the plain-text source below
                print(f"[WARN] read_parquet failed, fallback to text_path. {e}")
    if not sents and t and isinstance(t, str):
        t_abs = _resolve(t, base_dir)
        if os.path.exists(t_abs):
            # read_text opens and closes the file (the previous bare open()
            # left the handle to the garbage collector)
            txt = Path(t_abs).read_text(encoding="utf-8")
            txt = re.sub(r"\s+", " ", txt).strip()
            sents = [x.strip() for x in sent_tokenize(txt)]
    sents = [x for x in sents if len(x) >= MIN_SENT_LEN]
    return sents[:MAX_SENT_PER_DOC] if MAX_SENT_PER_DOC else sents

# ---------- Company → SIC (from Fortune500_with_SIC.csv) ----------
# Name normaliser to make CSV names and filename prefixes comparable
# Corporate suffixes stripped from the end of a company name (one trailing
# suffix, optionally followed by a dot).
_CORP_SUFFIXES = r"(incorporated|inc|corp|corporation|co|company|ltd|llc|plc|holdings?|group|limited)"
_CORP_SUFFIX_RE = re.compile(rf"\b{_CORP_SUFFIXES}\.?$", re.I)

def normalize_company_name(name: str) -> str:
    """Return a comparison key for a company name.

    Applies normalize_text, replaces punctuation with spaces, collapses
    whitespace and removes a single trailing corporate suffix.
    """
    key = re.sub(r"[^\w\s]", " ", normalize_text(name))   # punctuation → space
    key = re.sub(r"\s+", " ", key).strip()
    key = _CORP_SUFFIX_RE.sub("", key).strip()            # only the last suffix
    return re.sub(r"\s+", " ", key)

def load_company_sic_map(path: str) -> Dict[str, Tuple[str,str]]:
    """Read the company → SIC division table.

    Expected CSV columns: Company, SIC Division Code, SIC Division Name.
    The file is read as latin1 with every column as string.

    Returns:
        Mapping of normalised company name -> (upper-cased division code,
        division name). An empty mapping is returned (with a warning printed)
        when the path is empty/missing, the file cannot be read, or a required
        column is absent. Rows with any blank field are skipped.
    """
    mapping = {}
    if not path or not os.path.exists(path):
        print(f"[WARN] SIC file not found at: {path}. Will fallback to Services (I).")
        return mapping
    try:
        table = pd.read_csv(path, dtype=str, encoding='latin1').fillna("")
    except Exception as e:
        print(f"[WARN] Failed reading SIC CSV '{path}': {e}. Fallback to empty map.")
        return mapping

    required = {"Company","SIC Division Code","SIC Division Name"}
    missing = required - set(table.columns)
    if missing:
        print(f"[WARN] SIC CSV missing columns: {missing}. Fallback to empty map.")
        return mapping

    loaded = 0
    rows = zip(table["Company"], table["SIC Division Code"], table["SIC Division Name"])
    for raw_company, raw_code, raw_name in rows:
        company = normalize_company_name(raw_company)
        code = str(raw_code).strip().upper()
        name = str(raw_name).strip()
        if not (company and code and name):
            continue
        mapping[company] = (code, name)
        loaded += 1
    print(f"[SIC] Loaded {loaded} company→SIC mappings from {path}")
    return mapping

# Company → (SIC division code, name) lookup, built once when this cell runs;
# get_company_sic_by_name reads this global.
COMPANY_SIC = load_company_sic_map(COMPANY_SIC_FILE)

def company_from_filename(name: str) -> str:
    """Return the part of the file stem before the first underscore, stripped.

    An empty or missing *name* yields "".
    """
    if not name:
        return ""
    prefix, _, _ = Path(name).stem.partition("_")
    return prefix.strip()

def get_company_sic_by_name(filename: str) -> Tuple[str,str]:
    """Look up the SIC division for the company encoded in *filename*.

    The company name is taken from the filename prefix and normalised; if that
    key is not in COMPANY_SIC, a second attempt is made with a leading "the"
    removed. Falls back to ("I", "Services") when neither key is known.
    """
    primary = normalize_company_name(company_from_filename(filename))
    without_article = re.sub(r"^\bthe\b\s+", "", primary).strip()
    for key in (primary, without_article):
        if key in COMPANY_SIC:
            return COMPANY_SIC[key]
    return ("I", "Services")

# ---------- LLM cache ----------
def load_cache(path: str) -> Dict[str, Dict]:
    """Load the persisted classification cache from a parquet file.

    Returns:
        Mapping of cache key -> {"quant": bool, "granularity": value}. An empty
        dict is returned when the file does not exist or cannot be read (the
        latter prints a warning).
    """
    if not os.path.exists(path):
        return {}
    try:
        table = pd.read_parquet(path)
        return {
            rec["key"]: {"quant": bool(rec["quant"]), "granularity": rec["granularity"]}
            for _, rec in table.iterrows()
        }
    except Exception as e:
        print(f"[WARN] Failed to read cache, starting fresh: {e}")
    return {}

def save_cache(cache: Dict[str, Dict], path: str):
    """Write *cache* to *path* as parquet (columns key, quant, granularity).

    Does nothing for an empty cache. The parent directory is created if needed.
    """
    if not cache:
        return
    records = []
    for key, entry in cache.items():
        records.append({
            "key": key,
            "quant": entry.get("quant", False),
            "granularity": entry.get("granularity", ""),
        })
    table = pd.DataFrame(records).drop_duplicates("key")
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    table.to_parquet(path, index=False)

# In-memory classification cache, loaded once from CACHE_FILE when this cell
# runs; llm_classify_quant_gran reads and updates this global.
LLM_CACHE = load_cache(CACHE_FILE)

def sent_key_for_cache(text: str) -> str:
    """Return the SHA-1 cache key for *text*.

    The key covers PROMPT_VERSION, LLM_MODEL and the first MAX_SPAN_CHARS
    characters of the normalised text, so changing the prompt version or the
    model yields different keys.
    """
    span = normalize_text(text)[:MAX_SPAN_CHARS]
    payload = "|".join((PROMPT_VERSION, LLM_MODEL, span))
    return hashlib.sha1(payload.encode("utf-8")).hexdigest()

# ---------- LLM triage ----------
def triage_quant_candidates(sentences: List[str]) -> Dict[str, str]:
    """Select sentences worth sending to the LLM and cut them to a short span.

    A sentence is dropped when it looks like a table-of-contents line, is a
    pagination/cross-reference line without a metric hint, contains no number,
    or contains a date hint but no metric vocabulary at all.

    Returns:
        Mapping "S<index in sentences>" -> span, where the span is the text
        within 140 characters on either side of the first number, capped at
        MAX_SPAN_CHARS.
    """
    candidates = {}
    for position, sentence in enumerate(sentences):
        cleaned = clean_for_quant(sentence)
        lowered = normalize_text(cleaned)

        if TOC_LINE.search(lowered):
            continue
        has_metric = METRIC_HINT.search(lowered)
        if PAGINATION.search(lowered) and not has_metric:
            continue
        if not NUM_PAT.search(lowered):
            continue
        if DATE_HINT.search(lowered) and not (has_metric or WINDOW_METRIC.search(lowered)):
            continue

        # The window is cut from the cleaned (case-preserving) text, not the
        # lower-cased one used for the filters above.
        hit = NUM_PAT.search(cleaned)
        if hit is None:
            window = cleaned
        else:
            lo = max(0, hit.start() - 140)
            hi = min(len(cleaned), hit.end() + 140)
            window = cleaned[lo:hi]
        window = window[:MAX_SPAN_CHARS].strip()
        if window:
            candidates[f"S{position}"] = window
    return candidates

# ---------- LLM batches for Quant & Granularity ----------
def _strip_code_fence(txt: str) -> str:
    """Return *txt* without a surrounding Markdown code fence (``` or ```json)."""
    txt = txt.strip()
    if txt.startswith("```"):
        txt = re.sub(r"^```[A-Za-z0-9_-]*\s*", "", txt)
        txt = re.sub(r"\s*```$", "", txt)
    return txt.strip()


def _coerce_bool(value) -> bool:
    """Interpret a JSON value as a boolean; the string "false" must not be truthy."""
    if isinstance(value, str):
        return value.strip().lower() in {"true", "yes", "1"}
    return bool(value)


def _rules_quant_gran(span: str) -> Dict[str, object]:
    """Classify one span with the regex rules (used when the LLM gives no answer)."""
    s = clean_for_quant(span)
    has_num = bool(NUM_PAT.search(s))
    has_metric = bool(METRIC_HINT.search(s))
    metricish = has_metric or bool(WINDOW_METRIC.search(s))
    date_only = bool(DATE_HINT.search(s)) and not has_metric
    quant = has_num and metricish and not date_only
    if not quant:
        return {"quant": False, "granularity": ""}
    gran = "Specific" if SPECIFIC_PAT.search(s) else "General"
    return {"quant": True, "granularity": gran}


def _parse_llm_items(txt: str, block: Dict[str, str]) -> Dict[str, Dict[str, object]]:
    """Parse the model reply into {sid: {"quant", "granularity"}} for ids in *block*.

    Raises ValueError (including json.JSONDecodeError) when the reply is not a
    JSON array or an object carrying an "items" array.
    """
    data = json.loads(_strip_code_fence(txt))
    if isinstance(data, dict):
        data = data.get("items", [])
    if not isinstance(data, list):
        raise ValueError("LLM response is not a JSON array")
    parsed = {}
    for obj in data:
        if not isinstance(obj, dict):
            continue
        sid = obj.get("id")
        if not isinstance(sid, str) or sid not in block:
            continue
        quant = _coerce_bool(obj.get("quant", False))
        gran = ""
        if quant:
            g = str(obj.get("granularity", "")).strip().lower()
            gran = "Specific" if g.startswith("spec") else "General"
        parsed[sid] = {"quant": quant, "granularity": gran}
    return parsed


def llm_classify_quant_gran(batches: List[Dict[str,str]]) -> Dict[str, Dict[str, str]]:
    """Classify spans as quantitative / granular, using cache, LLM, then rules.

    Args:
        batches: list of {sid: span} dicts. They are flattened and re-batched
            internally in groups of BATCH_MAX_ITEMS.

    Returns:
        {sid: {"quant": bool, "granularity": "Specific" | "General" | ""}} with
        one entry for every sid passed in.

    Behaviour:
        - Spans whose cache key is in LLM_CACHE are answered from the cache.
        - Each remaining batch is sent to the model up to two times.
        - Model answers are stored in LLM_CACHE; the cache file is rewritten
          only when at least one new answer was stored.
        - Spans the model did not answer (failed batch or omitted id) get the
          regex-rule verdict. Those verdicts are NOT cached, so a later run can
          still obtain a real model answer for them.
    """
    results = {}

    def build_prompt(block: Dict[str,str]) -> str:
        lines = [f"{k}: {v}" for k, v in block.items()]
        items = "\n".join(lines)
        instr = (
            "For each line, classify if it's Quantitative and its Granularity.\n"
            "Rules:\n"
            "- Quantitative = contains a numeric value tied to a non-date metric (%, tCO2e, GWh, $, incident rate, hours, etc.). "
            "Dates/quarters/pages alone are NOT quantitative.\n"
            "- Granularity = 'Specific' if it states change/comparison/baseline (e.g., 'from X to Y', 'vs', YoY, 'decreased by %'); "
            "'General' for a single level/target without explicit comparison.\n"
            "Return ONLY a JSON array of objects with fields: id, quant (true/false), and granularity ('General'|'Specific' or omit if quant=false).\n"
            "Examples output: "
            "[{\"id\":\"S1\",\"quant\":true,\"granularity\":\"General\"},{\"id\":\"S2\",\"quant\":true,\"granularity\":\"Specific\"}]"
        )
        return instr + "\n\n" + items

    # Flatten all batches into one pool keyed by sentence id.
    all_items = {}
    for block in batches:
        for sid, span in block.items():
            all_items[sid] = (sent_key_for_cache(span), span)

    # Prefill from cache
    for sid, (ckey, _) in all_items.items():
        cached = LLM_CACHE.get(ckey)
        if cached is not None:
            results[sid] = {"quant": bool(cached["quant"]), "granularity": cached["granularity"]}

    pending = [(sid, span) for sid, (_, span) in all_items.items() if sid not in results]
    if not pending:
        # nothing to ask the model, hence nothing new to persist
        return results

    step = max(1, BATCH_MAX_ITEMS)
    remaining_blocks = [dict(pending[i:i + step]) for i in range(0, len(pending), step)]

    cache_dirty = False
    for block in remaining_blocks:
        prompt = build_prompt(block)
        parsed = None
        last_error = None
        for attempt in range(2):
            try:
                resp = client.chat.completions.create(
                    model=LLM_MODEL,
                    temperature=LLM_TEMPERATURE,
                    messages=[{"role":"user","content": prompt}],
                    max_tokens=LLM_MAX_OUT_TOKENS
                )
                parsed = _parse_llm_items(resp.choices[0].message.content or "", block)
                break
            except Exception as e:
                # Boundary with a remote service: any failure (network, quota,
                # malformed reply) is retried once, then handled by the rules.
                last_error = e
                if attempt == 0:
                    time.sleep(1.2)

        if parsed is None:
            print(f"[WARN] LLM batch failed, rules fallback used. Error: {last_error}")
            parsed = {}
        else:
            for sid, verdict in parsed.items():
                LLM_CACHE[sent_key_for_cache(block[sid])] = dict(verdict)
                cache_dirty = True

        for sid, span in block.items():
            results[sid] = parsed[sid] if sid in parsed else _rules_quant_gran(span)

    if cache_dirty:
        save_cache(LLM_CACHE, CACHE_FILE)
    return results

# ---------- Analyze single document (hybrid) ----------
def analyze_doc(sentences: List[str], kp: KeywordProcessor, keymap: Dict[str, Tuple[str,str,str]], concepts_index: Dict[str, Dict[str, Set[str]]], precomputed_div=None):
    """Compute per-concept coverage for one document.

    Steps:
        1. Keyword pass: every sentence that is not a TOC / pagination line is
           matched against *kp*; hits are mapped to (pillar, concept) pairs.
        2. Sentences with hits that also pass triage_quant_candidates are sent
           to llm_classify_quant_gran in batches of BATCH_MAX_ITEMS.
        3. Each (sentence, concept) pair is counted once and labelled
           Qualitative, Quantitative (General) or Quantitative (Specific).

    Args:
        sentences: document sentences.
        kp: keyword processor returning the matched variant strings.
        keymap: variant -> (pillar, concept, keyword).
        concepts_index: pillar -> concept -> keywords; defines which concepts
            appear in the output even with zero hits.
        precomputed_div: optional (code, name) SIC division; defaults to
            ("I", "Services").

    Returns:
        Dict with keys sic_division, sentences, coverage, status_by_pillar.
        For an empty *sentences* list sic_division is None and the other
        containers are empty.
    """
    if not sentences:
        return {"sic_division":None,"sentences":0,"coverage":{},"status_by_pillar":{}}

    div_code, div_name = precomputed_div if precomputed_div else ("I","Services")

    coverage = {
        p: {cc: empty_concept_state() for cc in concepts}
        for p, concepts in concepts_index.items()
    }

    fired_by_sentence: Dict[int, List[Tuple[str,str]]] = {}
    llm_candidates: List[Tuple[str, str]] = []   # (sentence id, span)

    for i, s in enumerate(sentences):
        s_norm = normalize_text(s)
        if TOC_LINE.search(s_norm):
            continue
        if PAGINATION.search(s_norm) and not METRIC_HINT.search(s_norm):
            continue

        matches = kp.extract_keywords(s_norm)
        if not matches:
            continue

        # Several keywords of the same concept may fire in one sentence; keep
        # each (pillar, concept) once so the sentence is not counted (or stored
        # as an example) multiple times for that concept.
        pairs = (keymap.get(mv, (None, None, None))[:2] for mv in matches)
        concepts = list(dict.fromkeys((p, cc) for p, cc in pairs if p and cc))
        if not concepts:
            continue

        fired_by_sentence[i] = concepts

        # triage works on a one-element list, so its own id is always "S0";
        # only the span is used and re-keyed with the document-level index.
        triaged = triage_quant_candidates([s])
        if triaged:
            llm_candidates.append((f"S{i}", next(iter(triaged.values()))))

    # Batch LLM
    blocks = []
    if llm_candidates:
        step = max(1, BATCH_MAX_ITEMS)
        blocks = [dict(llm_candidates[k:k + step]) for k in range(0, len(llm_candidates), step)]

    llm_results = llm_classify_quant_gran(blocks) if blocks else {}

    # Stitch into coverage
    for i, concepts in fired_by_sentence.items():
        s = sentences[i]
        verdict = llm_results.get(f"S{i}")
        quant = bool(verdict["quant"]) if verdict else False
        gran = verdict["granularity"] if verdict else ""

        if not quant:
            bucket, label = "qual", "Qualitative"
        elif gran == "Specific":
            bucket, label = "qspec", "Quantitative (Specific)"
        else:
            bucket, label = "qgen", "Quantitative (General)"

        for p, cc in concepts:
            st = coverage.setdefault(p, {}).setdefault(cc, empty_concept_state())
            st["count"] += 1
            st[f"n_{bucket}"] += 1
            if len(st[f"{bucket}_examples"]) < 3:
                st[f"{bucket}_examples"].append(s)
            st["labels"].add(label)
            if len(st["examples"]) < 3 and s not in st["examples"]:
                st["examples"].append(s)

    # Strongest label wins; labels are sorted so the JSON output is stable
    # across runs (set iteration order is not).
    precedence = ("Quantitative (Specific)", "Quantitative (General)", "Qualitative")
    for d in coverage.values():
        for st in d.values():
            st["final_label"] = next((lab for lab in precedence if lab in st["labels"]), "None")
            st["labels"] = sorted(st["labels"])

    status = {p:{
        "satisfied":[cc for cc,st in d.items() if st["count"]>0],
        "missing"  :[cc for cc,st in d.items() if st["count"]==0]
    } for p,d in coverage.items()}

    return {
        "sic_division":{"code":div_code,"name":div_name},
        "sentences":len(sentences),
        "coverage":coverage,
        "status_by_pillar":status
    }

# ---------- Folder-selection ----------
def parse_folder_select(selector: str):
    """Parse a folder selector such as "001-030,050-200,201" into a set of codes.

    Tokens are separated by commas and/or whitespace. Each token is either a
    single code or a range "A-B" (either order); codes may have one to three
    digits, so "7" and "007" are equivalent. Codes outside 1..500 are dropped.
    Tokens that cannot be parsed are reported with a warning instead of being
    silently ignored.

    Returns:
        Set of ints, or None when the selector is empty or yields no valid
        code (callers treat None as "no restriction").
    """
    if not selector or not str(selector).strip():
        return None
    keep = set()
    ignored = []
    for tok in re.split(r"[,\s]+", str(selector).strip()):
        if not tok:
            continue
        m = re.fullmatch(r"(\d{1,3})(?:-(\d{1,3}))?", tok)
        if m is None:
            ignored.append(tok)
            continue
        lo = int(m.group(1))
        hi = int(m.group(2)) if m.group(2) is not None else lo
        if lo > hi:
            lo, hi = hi, lo
        keep.update(range(lo, hi + 1))
    if ignored:
        print(f"[WARN] Folder selector tokens ignored (expected NNN or NNN-NNN): {ignored}")
    keep = {x for x in keep if 1 <= x <= 500}
    return keep or None

def extract_code_from_company_dirname(name: str):
    """Return the three-digit code a directory name starts with, as an int.

    Leading whitespace is allowed; the digits must end at a word boundary.
    Returns None when the name is empty or does not start with such a code.
    """
    hit = re.match(r"^\s*(\d{3})\b", name or "")
    if hit is None:
        return None
    return int(hit.group(1))

# ---------- Manifest discovery ----------
# Collect the Stage-1 manifests of all selected companies into one DataFrame
# `man`, one row per successfully extracted document.
companies_root = Path(COMPANIES_ROOT)
allowed_codes = parse_folder_select(FOLDER_SELECT)

company_dirs = [d for d in companies_root.iterdir() if d.is_dir()]
print(f"Company directories found under COMPANIES_ROOT: {len(company_dirs)}")

# Stage-1 output folders, tried in this order; the first one holding a
# manifest.parquet wins.
stage1_candidates = ("esg_stage1(ver4)", "esg_stage1(ver3)")
ok_statuses = ["ok_native","ok_ocr","ok_hybrid","ok"]

frames = []
manifests_found = 0
checked = 0

for cdir in sorted(company_dirs):
    code = extract_code_from_company_dirname(cdir.name)
    if code is None:
        continue
    if allowed_codes is not None and code not in allowed_codes:
        continue
    # Count only directories that passed the name check and FOLDER_SELECT, so
    # the summary line below reports what it says it reports.
    checked += 1

    stage_dir = None
    mpath = None
    for candidate in stage1_candidates:
        cand_dir = cdir / candidate
        cand_manifest = cand_dir / "manifest.parquet"
        if cand_manifest.exists():
            stage_dir = cand_dir
            mpath = cand_manifest
            break
    if stage_dir is None:
        continue

    try:
        df = pd.read_parquet(mpath)
    except Exception as e:
        print(f"[WARN] Could not read manifest for {cdir.name}: {e}")
        continue

    if "status" not in df.columns:
        # A malformed manifest should skip one company, not abort the run.
        print(f"[WARN] Manifest for {cdir.name} has no 'status' column; skipped.")
        continue

    df = df[df["status"].isin(ok_statuses)].copy()
    if df.empty:
        continue

    df["company_code"] = code
    df["__base_dir__"] = str(stage_dir.resolve())
    df["__company_dir__"] = str(cdir.resolve())
    frames.append(df)
    manifests_found += 1

print(f"Companies scanned (after name check & FOLDER_SELECT): {checked}")
print(f"Manifests found: {manifests_found}")

if not frames:
    raise FileNotFoundError(
        "No manifest.parquet found under selected companies. "
        f"Checked {' and '.join(repr(c) for c in stage1_candidates)}. "
        "Verify COMPANIES_ROOT path and folder names like '001. Walmart'."
    )

man = pd.concat(frames, ignore_index=True)
if YEAR_LIST:   man = man[man["doc_year"].isin(YEAR_LIST)]
if REPORT_TYPE: man = man[man["report_type"].isin(REPORT_TYPE)]
man = man.reset_index(drop=True)

print(f"Companies selected: {sorted(set(man['company_code']))}")
print(f"After selection, docs to process: {len(man)}")

# ---------- Output path ----------
def out_path(row):
    """Return the Stage-2 JSON path for a manifest row, creating its folder.

    The file lives in "<company dir>/esg_stage2_18(hybridv5)/". The document
    id is the row's sha256_16 when present; otherwise the stem of text_path,
    pdf_path or filename (first truthy one), or "doc" as a last resort.
    """
    target_dir = Path(row.get("__company_dir__", "")) / "esg_stage2_18(hybridv5)"   # e.g. .../FORTUNE 500/001. Walmart/...
    if pd.notna(row.get("sha256_16")):
        did = str(row["sha256_16"])
    else:
        source = row.get("text_path") or row.get("pdf_path") or row.get("filename") or "doc"
        did = Path(str(source)).stem
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir / f"esg_stage2_{did}.json"

# ---------- Prepare keywords ----------
# Load the keyword table once; load_concepts returns an empty DataFrame on any
# read problem, so an empty result is turned into a hard stop here.
df_kw = load_concepts(UNIFIED_FILE)
if df_kw.empty:
    raise FileNotFoundError(f"No keywords found in {UNIFIED_FILE}. Expect columns: pillar, canonical_concept, keyword.")
# kp: keyword matcher, keymap: variant -> (pillar, concept, keyword),
# concepts_index: pillar -> concept -> normalised keywords.
kp, keymap, concepts_index = build_keyword_processor(df_kw)

# ---------- Main loop ----------
# Process every selected document and write one Stage-2 JSON per document.
# proc counts files written, skip counts documents skipped because their
# output already exists (ONLY_MISSING).
proc=skip=0
for _,row in man.iterrows():
    op = out_path(row)
    if ONLY_MISSING and op.exists():
        skip += 1
        continue

    filename = str(row.get("filename") or "")
    div_code, div_name = get_company_sic_by_name(filename)

    sents = read_sentences(row.to_dict())
    res   = analyze_doc(
        sentences=sents,
        kp=kp,
        keymap=keymap,
        concepts_index=concepts_index,
        precomputed_div=(div_code, div_name)
    )
    res["meta"] = {
        "filename": row.get("filename"),
        "doc_year": int(row.get("doc_year")) if pd.notna(row.get("doc_year")) else None,
        "report_type": row.get("report_type"),
        "company_code": int(row.get("company_code")) if pd.notna(row.get("company_code")) else None,
        "company_dir": row.get("__company_dir__", ""),
        "stage1_dir": row.get("__base_dir__", "")
    }
    # Write to a temporary sibling and rename it into place: if the run is
    # interrupted mid-write, no partial JSON is left under the final name for
    # ONLY_MISSING to mistake for a finished document.
    tmp_op = op.with_name(op.name + ".tmp")
    with open(tmp_op,"w",encoding="utf-8") as f:
        json.dump(res,f,ensure_ascii=False,indent=2)
    os.replace(tmp_op, op)
    proc+=1
    print(f"Processed {proc}: wrote {op}")

print(f"✅ Done. Wrote {proc}; skipped {skip}. Results saved in each company/esg_stage2_18(hybridv5)/")