# In [None]:
# --- Input location -------------------------------------------------------
# Root directory; each entry of SUBFOLDERS is joined onto it and processed
# independently by the main loop at the bottom of the file.
BASE_DIR      = "/local/scratch/group/guldigroup/climate_change/wiki_history_rosie/date_tagging_pipeline/tagged_output_2025-09-20_deduped"
SUBFOLDERS    = ["historical_objects_tagged", "history_of_ideologies_tagged", "history_of_sports_tagged"]

# --- Input format ---------------------------------------------------------
# Recursive glob used to find the CSV files under each subfolder.
FILE_GLOB     = "**/*.csv"
# Column holding the source file name (only rows whose value ends in ".txt" are kept).
FILENAME_COL  = "filename"
# Column holding text with <date>...</date> tags.
TAG_COL       = "date_tagged"
# Encodings tried, in this order, when reading each CSV.
ENCODINGS     = ["utf-8", "utf-8-sig", "latin1"]

# --- Output packing -------------------------------------------------------
# Separator placed between dates inside one packed output cell.
JOIN_SEP      = ", "
# Maximum number of characters per packed cell (passed as max_len to
# pack_dates_into_cells).
# NOTE(review): presumably chosen to stay below a spreadsheet cell-size limit — confirm.
SAFE_CELL_LIMIT  = 30000
# The last packed cell is topped up from the previous one while it is shorter
# than this fraction of the cell limit ...
TARGET_LAST_MIN_RATIO = 0.15
# ... or holds fewer than this many dates.
MIN_DATES_IN_LAST     = 2

import re, csv
from pathlib import Path
from typing import List, Dict, Optional
import pandas as pd

# Raise the csv module's per-field size limit to 2**31-1 characters; any
# failure to do so is deliberately ignored (best effort).
# NOTE(review): this affects the stdlib csv module; whether pandas' default
# parser consults it is not visible from this file — confirm it is needed.
try:
    csv.field_size_limit(2**31-1)
except Exception:
    pass

# Captures the text between <date> and </date>. Whitespace is tolerated inside
# the angle brackets, matching is case-insensitive, and "." also matches
# newlines (inline flags "is"), so a tag may span several lines.
DATE_TAG_RE = re.compile(r"(?is)<\s*date\s*>(.*?)<\s*/\s*date\s*>")
# Matches a ".txt" suffix (any letter case) optionally followed by whitespace.
TXT_END_RE  = re.compile(r"\.txt\s*$", re.IGNORECASE)

# Maps a lower-case period/dynasty name to one representative year.
# Negative values are BC years (the same convention extract_extra_years uses
# for explicit "BC"/"BCE" years). Several entries are coarse approximations
# (e.g. "bronze age", "middle ages").
DYNASTY_YEAR_MAP = {
    # Chinese dynasties. The Han dynasty began in 202 BC, so it must be
    # negative just like the Qin entry.
    "han dynasty": -202, "qin dynasty": -221, "tang dynasty": 618,
    "song dynasty": 960, "yuan dynasty": 1271, "ming dynasty": 1368, "qing dynasty": 1644,
    # Japanese periods / eras.
    "tokugawa period": 1603, "edo period": 1603, "meiji era": 1868, "taisho era": 1912,
    "showa era": 1926, "heisei era": 1989, "reiwa era": 2019,
    # Broad ages and Western periods. The Roman Empire is conventionally
    # dated from 27 BC, hence the negative sign.
    "bronze age": -1500, "iron age": -800, "classical antiquity": -500,
    "hellenistic period": -323, "roman empire": -27, "middle ages": 1100,
    "renaissance": 1500, "enlightenment": 1700, "victorian era": 1837,
    "ottoman empire": 1453,
}

# Spelled-out ordinals accepted as century numbers. The keys are kept exactly
# as they were (mixed hyphen/space spelling); lookups go through the
# normalized _ORDINAL_LOOKUP below so both spellings resolve.
ORDINAL_WORDS = {
    "first":1,"second":2,"third":3,"fourth":4,"fifth":5,"sixth":6,"seventh":7,
    "eighth":8,"ninth":9,"tenth":10,"eleventh":11,"twelfth":12,"thirteenth":13,
    "fourteenth":14,"fifteenth":15,"sixteenth":16,"seventeenth":17,"eighteenth":18,
    "nineteenth":19,"twentieth":20,"twenty-first":21,"twenty second":22,
    "twenty third":23,"twenty fourth":24,"twenty fifth":25,"twenty sixth":26,
    "twenty seventh":27,"twenty eighth":28,"twenty ninth":29,"thirtieth":30
}


def _normalize_ordinal(text: str) -> str:
    """Lower-case *text* and turn hyphens / whitespace runs into single spaces."""
    return " ".join(text.lower().replace("-", " ").split())


# ORDINAL_WORDS re-keyed by normalized spelling ("twenty-first" -> "twenty first").
_ORDINAL_LOOKUP = {_normalize_ordinal(word): number for word, number in ORDINAL_WORDS.items()}
# One or two digits with an optional English ordinal suffix, e.g. "19" or "19th".
_NUMERIC_ORDINAL_RE = re.compile(r"^(\d{1,2})(st|nd|rd|th)?$")


def parse_century_number(token: str) -> Optional[int]:
    """Convert a century designator to its number.

    Accepts digits with an optional ordinal suffix ("19", "19th") and
    spelled-out ordinals up to "thirtieth", with either hyphen or space
    between compound words ("twenty-first", "twenty first").

    Leading words are tolerated: if the whole token is not recognised, its
    last two words and then its last word are tried, so "in the nineteenth"
    resolves to 19. This matters because the word-century regex in this file
    captures greedily and may hand over preceding words.

    Args:
        token: Text that precedes the word "century".

    Returns:
        The century number, or None when nothing is recognised.
    """
    if not token:
        return None
    pieces = token.lower().split()
    if not pieces:
        return None

    # Numeric form: the whole token, or its last whitespace-separated piece.
    for candidate in (" ".join(pieces), pieces[-1]):
        numeric = _NUMERIC_ORDINAL_RE.match(candidate)
        if numeric:
            return int(numeric.group(1))

    # Word form: longest candidate first so "twenty first" wins over "first".
    words = _normalize_ordinal(token).split(" ")
    for width in (len(words), 2, 1):
        if width > len(words):
            continue
        number = _ORDINAL_LOOKUP.get(" ".join(words[-width:]))
        if number is not None:
            return number
    return None

def map_century_to_year(n: int, era: str = "", qualifier: str = "") -> int:
    """Map a century number to a single representative year.

    The base year is the start of the century: ``(n - 1) * 100`` for AD/CE
    (or no era) and ``-(n * 100)`` for BC/BCE. A qualifier shifts it forward:
    "early" +0, "mid" +50, "late" +75.

    Args:
        n: Century number (e.g. 19 for the 19th century).
        era: "BC", "BCE", "AD", "CE" or empty; letter case is ignored.
        qualifier: Text containing "early", "mid" or "late"; letter case is
            ignored and None/empty means no shift.

    Returns:
        The representative year; negative for BC.
    """
    # The calling regexes are case-insensitive, so "Late"/"MID" must count too.
    qualifier = (qualifier or "").lower()
    if "early" in qualifier:
        offset = 0
    elif "mid" in qualifier:
        offset = 50
    elif "late" in qualifier:
        offset = 75
    else:
        offset = 0

    era = (era or "").upper()
    base = -(n * 100) if era in ("BC", "BCE") else (n - 1) * 100
    return base + offset

# All patterns below are case-insensitive and keep their original group
# layout, so existing findall() consumers still receive tuples of the same shape.

# Numeric century: "19th century", "mid-19th century", "a 12th-century church",
# "5th century BC". Groups: (qualifier, number, era).
CENT_NUM_RE  = re.compile(
    r"(?:\b(early|mid|late)[\s-]+)?\b(\d{1,2})(?:st|nd|rd|th)?[\s-]+centur(?:y|ies)\b"
    r"(?:\s*(BCE|BC|AD|CE)\b)?",
    re.IGNORECASE,
)
# Spelled-out century: "nineteenth century", "late twenty-first century".
# Only the ordinal itself (optionally prefixed by "twenty") is captured, so
# preceding words no longer swallow the qualifier. Groups: (qualifier, word, era).
CENT_WORD_RE = re.compile(
    r"(?:\b(early|mid|late)[\s-]+)?\b((?:twenty[\s-]+)?[A-Za-z]+)[\s-]+centur(?:y|ies)\b"
    r"(?:\s*(BCE|BC|AD|CE)\b)?",
    re.IGNORECASE,
)
# Decade: "1960s", "1960's", "mid-1960s", "330s BC", "'60s".
# Groups: (qualifier, full, short, era) where `full` holds the year digits
# WITHOUT the final zero ("196" for "1960s") and `short` the single decade
# digit of the apostrophe form ("6" for "'60s").
DECADE_RE    = re.compile(
    r"(?:\b(early|mid|late)[\s-]+)?(?:\b(\d{2,3})|['\u2019]\s?(\d))0['\u2019]?s\b"
    r"(?:\s*(BCE|BC)\b)?",
    re.IGNORECASE,
)
# Explicit BC year: "44 BC", "221BCE". Groups: (year, era).
YEAR_BC_RE   = re.compile(r"\b(\d{1,4})\s*(BC|BCE)\b", re.IGNORECASE)
# Bare or AD/CE year: "1895", "1066 AD". The negative lookahead keeps the
# number of a BC year ("44 BC") from also being reported as a positive year.
# Groups: (year, era).
YEAR_AD_RE   = re.compile(r"\b(\d{1,4})(?!\s*BCE?\b)(?:\s*(AD|CE))?\b", re.IGNORECASE)

# Years added to the first year of a decade for each qualifier.
_DECADE_OFFSETS = {"early": 2, "mid": 5, "late": 8}


def _century_from_word(word: str) -> Optional[int]:
    """Resolve a spelled-out ordinal, trying both the spaced and hyphenated spelling."""
    spaced = " ".join(re.split(r"[\s-]+", word.strip().lower()))
    for variant in (spaced, spaced.replace(" ", "-")):
        number = parse_century_number(variant)
        if number:
            return number
    return None


def extract_extra_years(s: str) -> List[str]:
    """Derive representative years (as strings) from one free-text date.

    Results are appended in this order: named periods from DYNASTY_YEAR_MAP,
    numeric centuries, spelled-out centuries, decades, explicit BC years, then
    bare/AD/CE years. BC years are negative. Duplicates are not removed.

    Every standalone number of one to four digits counts as a year, so day
    numbers in strings such as "12 March 1895" are reported as well.

    Args:
        s: Text of a single date expression.

    Returns:
        List of year strings; empty when `s` is empty.
    """
    if not s:
        return []

    years: List[str] = []

    # Named periods: plain substring match on the lower-cased text.
    lowered = s.lower()
    years.extend(str(year) for name, year in DYNASTY_YEAR_MAP.items() if name in lowered)

    # Qualifiers are lower-cased before being handed on because the regexes
    # match them case-insensitively.
    for qualifier, number, era in CENT_NUM_RE.findall(s):
        years.append(str(map_century_to_year(int(number), era, qualifier.lower())))

    for qualifier, word, era in CENT_WORD_RE.findall(s):
        century = _century_from_word(word)
        if century:
            years.append(str(map_century_to_year(century, era, qualifier.lower())))

    for qualifier, full, short, era in DECADE_RE.findall(s):
        if full:
            # `full` excludes the trailing zero: "196" -> 1960.
            decade = int(full) * 10
        else:
            short_decade = int(short) * 10
            # NOTE(review): pivot kept from the original code — apostrophe
            # decades below '30s map to the 1900s, all others to the 1800s
            # (so "'60s" -> 1860). Confirm this is what the corpus needs.
            decade = (1900 if short_decade < 30 else 1800) + short_decade
        if era.upper() in ("BC", "BCE"):
            # BC decades run backwards: the "330s BC" span 339-330 BC, so the
            # chronological start is -(330 + 9); offsets then move forward.
            year = -(decade + 9)
        else:
            year = decade
        year += _DECADE_OFFSETS.get(qualifier.lower(), 0)
        years.append(str(year))

    for year_text, _era in YEAR_BC_RE.findall(s):
        years.append(str(-int(year_text)))

    # Bare numbers and AD/CE years are treated alike, whatever the era's case.
    for year_text, _era in YEAR_AD_RE.findall(s):
        years.append(str(int(year_text)))

    return years

def extract_dates(cell: Optional[str]) -> List[str]:
    """Return every <date> tag's text from `cell`, each followed by its derived years.

    For each tag, in order of appearance, the stripped tag text is emitted
    first and the strings produced by extract_extra_years for it come right
    after. Anything that is not a non-empty string yields an empty list.
    """
    if not (isinstance(cell, str) and cell):
        return []

    collected: List[str] = []
    for inner in DATE_TAG_RE.findall(cell):
        tag_text = inner.strip()
        collected.append(tag_text)                      # the tag text as written
        collected += extract_extra_years(tag_text)      # then the years derived from it
    return collected

def read_csv_any(path: Path, encodings: List[str], cols: list) -> pd.DataFrame:
    """Read a CSV as strings, trying each encoding in turn.

    Only the columns named in `cols` are loaded; if that filtered read fails
    with a non-decoding ValueError, the whole file is read once with the same
    encoding as a fallback.

    Args:
        path: CSV file to read.
        encodings: Encodings to try, in order.
        cols: Column names to keep.

    Returns:
        The loaded frame (all values as str), or an empty DataFrame when no
        attempt succeeds; in that case a "[SKIP]" line is printed.
    """
    last_err = None
    for enc in encodings:
        try:
            return pd.read_csv(path, dtype=str, encoding=enc, usecols=lambda c: c in cols)
        except UnicodeDecodeError as e:
            # UnicodeDecodeError is a ValueError: without this clause a wrong
            # encoding would trigger a second full read that fails the same way.
            last_err = e
            continue
        except pd.errors.EmptyDataError as e:
            # An empty file is empty under every encoding.
            last_err = e
            break
        except ValueError:
            try:
                return pd.read_csv(path, dtype=str, encoding=enc)
            except Exception as e2:
                last_err = e2
                continue
        except OSError as e:
            # Missing/unreadable file: another encoding cannot help.
            last_err = e
            break
        except Exception as e:
            last_err = e
            continue
    print(f"[SKIP] Could not read {path} ({last_err})")
    return pd.DataFrame()

def is_valid_txt_filename(s: Optional[str]) -> bool:
    """Tell whether `s` is a string that ends in ".txt" (any case, surrounding whitespace ignored)."""
    if not isinstance(s, str):
        return False
    return TXT_END_RE.search(s.strip()) is not None

def pack_dates_into_cells(dates: List[str],
                          sep: str = ", ",
                          max_len: int = SAFE_CELL_LIMIT,
                          target_last_min_ratio: float = TARGET_LAST_MIN_RATIO,
                          min_last_items: int = MIN_DATES_IN_LAST) -> List[str]:
    """Join `dates` into as few cells as possible, each at most `max_len` characters.

    Dates are packed greedily in order. When more than one cell results and
    the last one is short (joined length below ``max_len * target_last_min_ratio``
    or fewer than `min_last_items` dates), trailing dates of the second-to-last
    cell are moved to the front of the last cell while they still fit.

    A single date longer than `max_len` cannot be split; it occupies a cell of
    its own, which is then longer than `max_len`.

    Args:
        dates: Date strings in output order.
        sep: Separator placed between dates within a cell.
        max_len: Maximum joined length of a cell.
        target_last_min_ratio: Minimum fill ratio wanted for the last cell.
        min_last_items: Minimum number of dates wanted in the last cell.

    Returns:
        The joined cell strings in order; ``[""]`` when `dates` is empty.
    """
    if not dates:
        return [""]

    sep_len = len(sep)
    parts: List[List[str]] = [[]]
    lengths: List[int] = [0]   # joined length of each part, kept in step with `parts`
    for d in dates:
        current = parts[-1]
        add_len = len(d) if not current else sep_len + len(d)
        # An empty part always takes the item: opening another part would only
        # leave a blank cell behind when the item itself exceeds max_len.
        if not current or lengths[-1] + add_len <= max_len:
            current.append(d)
            lengths[-1] += add_len
        else:
            parts.append([d])
            lengths.append(len(d))

    if len(parts) == 1:
        return [sep.join(parts[0])]

    # Top up a short last cell from the end of the previous one.
    target_min_len = int(max_len * target_last_min_ratio)
    prev, last = parts[-2], parts[-1]
    last_len = lengths[-1]
    while prev and (last_len < target_min_len or len(last) < min_last_items):
        cand_add = len(prev[-1]) + sep_len   # `last` is never empty here
        if last_len + cand_add > max_len:
            break
        last.insert(0, prev.pop())
        last_len += cand_add

    # The previous part may have been drained completely; drop it if so.
    return [sep.join(p) for p in parts if p]

# For every subfolder: collect the <date> tags per source file name from all
# CSVs, then write two CSVs to the user's Desktop folder:
#   filename_dates_packed_<sub>.csv  - tag texts plus derived years, split over
#                                      dates_partN columns of bounded length
#   filename_dates_parsed_<sub>.csv  - derived years only, one cell per file
for sub in SUBFOLDERS:
    root = Path(BASE_DIR) / sub
    files = sorted(root.glob(FILE_GLOB))
    print(f"\n[INFO] Folder={sub} 发现 {len(files)} CSV 文件")

    fn_to_dates: Dict[str, List[str]] = {}   # tag texts interleaved with derived years
    fn_to_years: Dict[str, List[str]] = {}   # years derived from the tag texts only

    for f in files:
        df = read_csv_any(f, ENCODINGS, [FILENAME_COL, TAG_COL])
        if df.empty:
            continue
        if FILENAME_COL not in df.columns or TAG_COL not in df.columns:
            continue
        df = df[df[FILENAME_COL].apply(is_valid_txt_filename)]
        for raw_fn, cell in zip(df[FILENAME_COL], df[TAG_COL]):
            ds = extract_dates(cell)
            if not ds:
                continue
            fn = raw_fn.strip()
            fn_to_dates.setdefault(fn, []).extend(ds)
            # Years are derived from the original tag texts, not from `ds`:
            # `ds` already contains derived years, and parsing those again
            # would duplicate them and read "-500" back as 500.
            years = fn_to_years.setdefault(fn, [])
            for tag in DATE_TAG_RE.findall(cell):
                years.extend(extract_extra_years(tag.strip()))

    packed_records = []
    max_parts = 0
    for fn, dl in fn_to_dates.items():
        parts = pack_dates_into_cells(dl, sep=JOIN_SEP, max_len=SAFE_CELL_LIMIT)
        max_parts = max(max_parts, len(parts))
        packed_records.append((fn, parts))

    columns = ["filename"] + [f"dates_part{i}" for i in range(1, max_parts + 1)]
    # Pad every row with empty cells up to the widest record.
    rows = [
        [fn, *parts, *([""] * (max_parts - len(parts)))]
        for fn, parts in packed_records
    ]
    out_df = pd.DataFrame(rows, columns=columns).sort_values("filename")

    desktop = Path.home() / "Desktop"
    # The folder may be absent (e.g. on a headless machine); create it rather
    # than fail on the first write.
    desktop.mkdir(parents=True, exist_ok=True)
    packed_file = desktop / f"filename_dates_packed_{sub}.csv"
    out_df.to_csv(packed_file, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)
    print(f"[OK] Packed → {packed_file}")

    parsed_rows = [
        {"filename": fn, "parsed_years": ", ".join(fn_to_years.get(fn, []))}
        for fn in fn_to_dates
    ]
    # Explicit columns keep sort_values from raising KeyError when no rows were collected.
    parsed_df = pd.DataFrame(parsed_rows, columns=["filename", "parsed_years"]).sort_values("filename")
    parsed_file = desktop / f"filename_dates_parsed_{sub}.csv"
    parsed_df.to_csv(parsed_file, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)
    print(f"[OK] Parsed → {parsed_file}")