In [None]:
import json
import pandas as pd
from collections import defaultdict
from typing import List, Dict, Optional
from tqdm import tqdm

# Input workbook with the PolyLex expressions and their UD columns (read by main()).
POLY_PATH = "polylex_with_ud_stanza.xlsx"
# Input workbook with the sentence-level corpus and its UD columns (read by main()).
CORPUS_PATH = "corpus_sent_level_with_ud_stanza.xlsx"
# Output workbook: the corpus plus the projected-MWE columns added by main().
OUT_PATH = "corpus_with_projected_mwes.xlsx"

def split_str(s):
    """Split a cell value into its whitespace-separated items.

    ``None``, a float NaN and blank text all yield an empty list; any other
    value is converted with ``str`` before being split.
    """
    is_missing = s is None or (isinstance(s, float) and pd.isna(s))
    text = "" if is_missing else str(s).strip()
    return text.split() if text else []

def parse_heads(s):
    """Parse a whitespace-separated sequence of head indices into ints."""
    return list(map(int, split_str(s)))

def parse_ud_row(row):
    """Parse the whitespace-joined UD columns of one row into aligned lists.

    The columns ``ud_tokens``, ``ud_lemmas``, ``ud_upos``, ``ud_deprel`` and
    ``ud_heads`` are read with ``row.get``. All five sequences are brought to
    a common length ``n`` (the longest of them): short string sequences are
    padded with ``"_"`` and short head sequences with ``0``.

    Tokens take part in the length computation and are padded like the other
    columns, so a length mismatch no longer throws away every token.

    Returns:
        A dict with keys ``n``, ``tokens``, ``lemmas``, ``upos``, ``deprel``,
        ``heads`` (1-based head index per token, 0 for the root) and ``deps``
        (head index -> list of 1-based dependent indices).

    Raises:
        ValueError: if ``ud_heads`` contains a non-integer item.
    """
    tokens = split_str(row.get("ud_tokens", ""))
    lemmas = split_str(row.get("ud_lemmas", ""))
    upos = split_str(row.get("ud_upos", ""))
    deprel = split_str(row.get("ud_deprel", ""))
    heads = parse_heads(row.get("ud_heads", ""))

    n = max(len(tokens), len(lemmas), len(upos), len(deprel), len(heads))

    def pad(values, fill):
        # Extend to n items; n is the maximum length, so nothing is ever cut.
        return (values + [fill] * n)[:n]

    tokens = pad(tokens, "_")
    lemmas = pad(lemmas, "_")
    upos = pad(upos, "_")
    deprel = pad(deprel, "_")
    heads = pad(heads, 0)

    # Reverse index of the head array: head -> its dependents (1-based).
    deps = defaultdict(list)
    for i, h in enumerate(heads, start=1):
        deps[h].append(i)

    return {
        "n": n,
        "tokens": tokens,
        "lemmas": lemmas,
        "upos": upos,
        "deprel": deprel,
        "heads": heads,
        "deps": deps,
    }

def build_pattern(poly_row):
    """Turn one PolyLex row into a matching pattern.

    The UD columns are parsed with ``parse_ud_row``. The ``expression`` cell
    is normalised so that a missing value (``None`` or NaN) becomes ``""``
    instead of the truthy text ``"nan"``, and surrounding whitespace is
    stripped; callers can therefore filter on ``pat["expression"]``.

    Returns:
        ``None`` when the row has no UD items, otherwise a dict with keys
        ``expression``, ``n``, ``lemmas``, ``upos``, ``deprel``, ``heads``,
        ``roots`` (1-based indices) and ``edges`` (``(child, head, deprel)``
        for every item whose head is not 0).
    """
    ud = parse_ud_row(poly_row)
    n = ud["n"]
    if n == 0:
        return None

    raw_expr = poly_row.get("expression", "")
    if raw_expr is None or (isinstance(raw_expr, float) and pd.isna(raw_expr)):
        expression = ""
    else:
        expression = str(raw_expr).strip()

    heads = ud["heads"]
    deprel = ud["deprel"]

    # Root detection: head == 0 first, then deprel == "root", then item 1.
    roots = [i for i, h in enumerate(heads, start=1) if h == 0]
    if not roots:
        roots = [i for i, rel in enumerate(deprel, start=1) if rel == "root"]
    if not roots:
        roots = [1]

    edges = [
        (i, h, rel)
        for i, (h, rel) in enumerate(zip(heads, deprel), start=1)
        if h != 0
    ]

    return {
        "expression": expression,
        "n": n,
        "lemmas": ud["lemmas"],
        "upos": ud["upos"],
        "deprel": deprel,
        "heads": heads,
        "roots": roots,
        "edges": edges,
    }

def find_matches(pattern, sent_ud, max_matches_per_sent=50):
    """Find the occurrences of a pattern inside one parsed sentence.

    A pattern item may be mapped to a sentence token only when lemma and UPOS
    are equal. In addition, every pattern arc whose two ends are mapped must
    exist in the sentence with the same dependency relation. Arcs are checked
    from both ends, so the check also fires when the child was mapped before
    its head (the search order is by increasing number of candidates, not by
    tree order).

    Args:
        pattern: dict with ``n``, ``lemmas``, ``upos``, ``heads``, ``deprel``
            and ``expression``.
        sent_ud: dict with ``n``, ``lemmas``, ``upos``, ``heads``, ``deprel``.
        max_matches_per_sent: stop the search once this many raw matches
            have been collected.

    Returns:
        A list of dicts (``expression``, ``token_ids_1based``,
        ``span_minmax_1based``), with duplicates on the token-id set removed.
    """
    pn = pattern["n"]
    sn = sent_ud["n"]
    if pn == 0 or sn == 0 or pn > sn:
        return []

    # Sentence index: (lemma, upos) -> 1-based token positions.
    cand_by_key = defaultdict(list)
    for j in range(1, sn + 1):
        cand_by_key[(sent_ud["lemmas"][j - 1], sent_ud["upos"][j - 1])].append(j)

    candidates = {}
    for i in range(1, pn + 1):
        key = (pattern["lemmas"][i - 1], pattern["upos"][i - 1])
        candidates[i] = cand_by_key.get(key, [])
        if not candidates[i]:
            return []

    # Most constrained pattern items first.
    order = sorted(candidates, key=lambda i: len(candidates[i]))

    # Pattern arcs indexed from both ends: child -> (head, rel) and
    # head -> [(child, rel), ...].
    head_of = {}
    children_of = defaultdict(list)
    for i in range(1, pn + 1):
        h = pattern["heads"][i - 1]
        if h != 0:
            rel = pattern["deprel"][i - 1]
            head_of[i] = (h, rel)
            children_of[h].append((i, rel))

    sent_heads = sent_ud["heads"]
    sent_deprel = sent_ud["deprel"]

    used_sent_nodes = set()
    mapping = {}
    matches = []

    def arc_ok(child, head, rel):
        # Both ends are mapped: the sentence must contain the same arc.
        j_child = mapping[child]
        return sent_heads[j_child - 1] == mapping[head] and sent_deprel[j_child - 1] == rel

    def ok_partial(i):
        # Arc from i up to its head, if that head is already placed.
        if i in head_of:
            h_pat, rel = head_of[i]
            if h_pat in mapping and not arc_ok(i, h_pat, rel):
                return False
        # Arcs from already placed children up to i.
        for child, rel in children_of.get(i, ()):
            if child in mapping and not arc_ok(child, i, rel):
                return False
        return True

    def backtrack(k):
        if len(matches) >= max_matches_per_sent:
            return
        if k == len(order):
            mapped_ids = sorted(mapping.values())
            matches.append({
                "expression": pattern["expression"],
                "token_ids_1based": mapped_ids,
                "span_minmax_1based": [mapped_ids[0], mapped_ids[-1]],
            })
            return

        i = order[k]
        for j in candidates[i]:
            if j in used_sent_nodes:
                continue
            mapping[i] = j
            used_sent_nodes.add(j)
            if ok_partial(i):
                backtrack(k + 1)
            used_sent_nodes.remove(j)
            del mapping[i]

    backtrack(0)

    # Different mappings can cover the same token set; keep the first one.
    seen = set()
    uniq = []
    for m in matches:
        key = tuple(m["token_ids_1based"])
        if key not in seen:
            seen.add(key)
            uniq.append(m)
    return uniq

def main():
    """Project every PolyLex pattern onto every corpus sentence and save the result.

    Reads ``CORPUS_PATH`` and ``POLY_PATH``, builds one pattern per PolyLex
    row, and writes the corpus to ``OUT_PATH`` with two extra columns:
    ``projected_mwes`` (JSON list of matches) and ``projected_mwes_count``.
    """
    corpus_df = pd.read_excel(CORPUS_PATH)
    poly_df = pd.read_excel(POLY_PATH)

    patterns = []
    for _, r in tqdm(poly_df.iterrows(), total=len(poly_df), desc="Building patterns"):
        pat = build_pattern(r)
        if pat and pat["n"] > 0 and pat["expression"]:
            patterns.append(pat)

    print(f"Loaded {len(patterns)} patterns from PolyLex.")

    # The (lemma, upos) set of a pattern does not depend on the sentence:
    # compute it once here rather than once per sentence/pattern pair.
    keyed_patterns = [
        (set(zip(pat["lemmas"], pat["upos"])), pat) for pat in patterns
    ]

    projected_all = []
    counts = []

    for _, row in tqdm(corpus_df.iterrows(),
                       total=len(corpus_df),
                       desc="Projecting MWEs"):
        sent_ud = parse_ud_row(row)
        sent_keys = set(zip(sent_ud["lemmas"], sent_ud["upos"]))

        sent_matches = []
        for pat_keys, pat in keyed_patterns:
            # Cheap prefilter: every pattern item must occur in the sentence.
            if not pat_keys.issubset(sent_keys):
                continue
            sent_matches.extend(find_matches(pat, sent_ud))

        projected_all.append(json.dumps(sent_matches, ensure_ascii=False))
        counts.append(len(sent_matches))

    corpus_df["projected_mwes"] = projected_all
    corpus_df["projected_mwes_count"] = counts
    corpus_df.to_excel(OUT_PATH, index=False)
    print(f"Saved: {OUT_PATH}")

if __name__ == "__main__":
    main()


Building patterns: 100%|██████████| 2260/2260 [00:00<00:00, 13667.74it/s]


Loaded 2260 patterns from PolyLex.


Projecting MWEs: 100%|██████████| 49263/49263 [01:40<00:00, 491.44it/s]


Saved: corpus_with_projected_mwes.xlsx


In [None]:
import json
import pandas as pd
from collections import defaultdict
from typing import Dict, List, Optional
from tqdm import tqdm

# Input workbook with the PolyLex expressions and their UD columns (read by main()).
POLY_PATH = "polylex_with_ud_stanza.xlsx"
# Input workbook with the sentence-level corpus and its UD columns (read by main()).
CORPUS_PATH = "corpus_sent_level_with_ud_stanza.xlsx"
# Output workbook written by main(): one row per expression with its occurrences.
OUT_PATH = "projected_mwes_by_expression.xlsx"


# ----------------------------
# Helpers: robust column picking
# ----------------------------
def pick_first_existing(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    """Return the column of ``df`` matching the first candidate name that exists.

    The comparison ignores case. Column labels are converted with ``str``
    before lower-casing, so a frame with non-string labels (for example a
    numeric header) no longer raises ``AttributeError``.

    Args:
        df: frame whose columns are searched.
        candidates: names to try, in priority order.

    Returns:
        The original column label, or ``None`` when no candidate matches.
    """
    lower_map = {str(c).lower(): c for c in df.columns}
    for cand in candidates:
        wanted = cand.lower()
        if wanted in lower_map:
            return lower_map[wanted]
    return None

def pick_by_contains(df: pd.DataFrame, substrings: List[str]) -> Optional[str]:
    """Return the first column whose lower-cased name contains any substring.

    Columns are scanned in frame order. Labels are converted with ``str``
    before lower-casing so that non-string labels do not raise
    ``AttributeError``. The substrings are compared as given, so they are
    expected to be lower-case.

    Returns:
        The original column label, or ``None`` when nothing matches.
    """
    for c in df.columns:
        name = str(c).lower()
        if any(s in name for s in substrings):
            return c
    return None

def split_str(s) -> List[str]:
    """Return the whitespace-separated pieces of ``s``.

    Missing values (``None`` or a float NaN) and blank text give ``[]``;
    other values are converted with ``str`` first.
    """
    if s is None:
        return []
    if isinstance(s, float) and pd.isna(s):
        return []
    # split() without arguments already ignores leading/trailing whitespace
    # and returns [] for blank text.
    return str(s).split()

def parse_heads(s) -> List[int]:
    """Convert a whitespace-separated head sequence into a list of ints."""
    return list(map(int, split_str(s)))


# ----------------------------
# UD parsing (expects whitespace-joined sequences in columns)
# ----------------------------
def parse_ud_row(row: pd.Series,
                 col_tokens: str,
                 col_lemmas: str,
                 col_upos: str,
                 col_deprel: str,
                 col_heads: str) -> Dict:
    """Read the UD columns of ``row`` into equally long lists.

    Each ``col_*`` argument names the column to read; a falsy name means the
    column is absent and is treated as empty. All sequences are padded to the
    length of the longest one (``"_"`` for strings, ``0`` for heads).

    Returns:
        A dict with ``n``, ``tokens``, ``lemmas``, ``upos``, ``deprel`` and
        ``heads`` (1-based head indices, 0 = ROOT). ``n`` is 0 and all lists
        are empty when every column is empty.
    """
    def read(col):
        return split_str(row.get(col, "")) if col else []

    tokens = read(col_tokens)
    lemmas = read(col_lemmas)
    upos = read(col_upos)
    deprel = read(col_deprel)
    heads = parse_heads(row.get(col_heads, "")) if col_heads else []

    n = max(map(len, (lemmas, upos, deprel, heads, tokens)))
    if n == 0:
        return {"n": 0, "tokens": [], "lemmas": [], "upos": [], "deprel": [], "heads": []}

    def padded(values, fill):
        # n is the maximum length, so this only ever appends.
        return values + [fill] * (n - len(values))

    return {
        "n": n,
        "tokens": padded(tokens, "_"),
        "lemmas": padded(lemmas, "_"),
        "upos": padded(upos, "_"),
        "deprel": padded(deprel, "_"),
        "heads": padded(heads, 0),  # 1-based heads, 0 = ROOT
    }


# ----------------------------
# Pattern extraction (from PolyLex rows)
# ----------------------------
def build_pattern(poly_row: pd.Series,
                  col_expr: str,
                  col_tokens: str,
                  col_lemmas: str,
                  col_upos: str,
                  col_deprel: str,
                  col_heads: str) -> Optional[Dict]:
    """Build a matching pattern from one PolyLex row.

    A missing expression cell (``None`` or NaN) is treated as empty instead
    of being stringified to ``"nan"``, so such rows are skipped.

    Args:
        poly_row: the PolyLex row.
        col_expr: name of the expression column (falsy if absent).
        col_tokens, col_lemmas, col_upos, col_deprel, col_heads: names of the
            UD columns, passed on to ``parse_ud_row``.

    Returns:
        ``None`` when the expression is empty or the row has no UD items,
        otherwise a dict with ``expression``, ``n``, ``lemmas``, ``upos``,
        ``constraints`` (child index -> ``(head index, deprel of child)`` for
        every item whose head is not 0) and ``keys`` (set of
        ``(lemma, upos)`` pairs used as a quick prefilter).
    """
    raw_expr = poly_row.get(col_expr, "") if col_expr else ""
    if raw_expr is None or (isinstance(raw_expr, float) and pd.isna(raw_expr)):
        expr = ""
    else:
        expr = str(raw_expr).strip()

    ud = parse_ud_row(poly_row, col_tokens, col_lemmas, col_upos, col_deprel, col_heads)
    if not expr or ud["n"] == 0:
        return None

    # Internal constraints: for each pattern node i with head h != 0, the
    # sentence head of mapped(i) must be mapped(h) and the deprel must match.
    constraints = {
        i: (h, rel)
        for i, (h, rel) in enumerate(zip(ud["heads"], ud["deprel"]), start=1)
        if h != 0
    }

    return {
        "expression": expr,
        "n": ud["n"],
        "lemmas": ud["lemmas"],
        "upos": ud["upos"],
        "constraints": constraints,  # child_i -> (head_i, deprel_of_child)
        "keys": set(zip(ud["lemmas"], ud["upos"])),  # for quick prefilter
    }


# ----------------------------
# Matching: strict lemma+upos + internal head/deprel arcs
# ----------------------------
def find_matches(pattern: Dict, sent_ud: Dict, max_matches_per_sent: int = 50) -> List[Dict]:
    """Find the occurrences of a pattern inside one parsed sentence.

    Pattern items are mapped to distinct sentence tokens with equal lemma and
    UPOS. Every pattern constraint ``child -> (head, deprel)`` whose two ends
    are mapped must hold in the sentence. Constraints are verified from both
    ends, so they are also enforced when the child is placed before its head
    (the search order follows candidate counts, not the tree).

    Args:
        pattern: dict with ``n``, ``lemmas``, ``upos`` and ``constraints``.
        sent_ud: dict with ``n``, ``lemmas``, ``upos``, ``heads``, ``deprel``.
        max_matches_per_sent: stop the search once this many raw matches
            have been collected.

    Returns:
        A list of dicts with ``token_ids_1based`` (sorted) and
        ``span_minmax_1based``, deduplicated on the token-id set.
    """
    pn, sn = pattern["n"], sent_ud["n"]
    if pn == 0 or sn == 0 or pn > sn:
        return []

    # sentence index: (lemma, upos) -> positions
    cand_by_key = defaultdict(list)
    for j in range(1, sn + 1):
        cand_by_key[(sent_ud["lemmas"][j - 1], sent_ud["upos"][j - 1])].append(j)

    # candidates for each pattern node
    candidates = {}
    for i in range(1, pn + 1):
        key = (pattern["lemmas"][i - 1], pattern["upos"][i - 1])
        candidates[i] = cand_by_key.get(key, [])
        if not candidates[i]:
            return []

    order = sorted(candidates, key=lambda i: len(candidates[i]))
    constraints = pattern["constraints"]

    # Same constraints seen from the head: head -> [(child, rel), ...].
    children_of = defaultdict(list)
    for child, (head, rel) in constraints.items():
        children_of[head].append((child, rel))

    sent_heads = sent_ud["heads"]
    sent_deprel = sent_ud["deprel"]

    used = set()
    mapping = {}
    matches = []

    def arc_ok(child: int, head: int, rel: str) -> bool:
        # Both ends are mapped: the sentence must contain the same arc.
        j_child = mapping[child]
        return sent_heads[j_child - 1] == mapping[head] and sent_deprel[j_child - 1] == rel

    def ok_partial(i: int) -> bool:
        # Constraint from i up to its head, if the head is already placed.
        if i in constraints:
            h_pat, rel = constraints[i]
            if h_pat in mapping and not arc_ok(i, h_pat, rel):
                return False
        # Constraints from already placed children up to i.
        for child, rel in children_of.get(i, ()):
            if child in mapping and not arc_ok(child, i, rel):
                return False
        return True

    def backtrack(k: int):
        if len(matches) >= max_matches_per_sent:
            return
        if k == len(order):
            ids = sorted(mapping.values())
            matches.append({"token_ids_1based": ids, "span_minmax_1based": [ids[0], ids[-1]]})
            return

        i = order[k]
        for j in candidates[i]:
            if j in used:
                continue
            mapping[i] = j
            used.add(j)
            if ok_partial(i):
                backtrack(k + 1)
            used.remove(j)
            del mapping[i]

    backtrack(0)

    # dedup by token set
    seen = set()
    uniq = []
    for m in matches:
        key = tuple(m["token_ids_1based"])
        if key not in seen:
            seen.add(key)
            uniq.append(m)
    return uniq


# ----------------------------
# Main: project and aggregate BY EXPRESSION
# ----------------------------
def main():
    """Project the PolyLex patterns onto the corpus and aggregate by expression.

    Reads ``CORPUS_PATH`` and ``POLY_PATH``, detects the relevant columns by
    name, matches every pattern against every sentence, and writes one row
    per expression to ``OUT_PATH`` (count, JSON list of occurrences, and a
    preview of up to five sentences).

    Missing sentence, niveau and source cells are stored as ``""``. A warning
    is printed for expressions whose JSON list is longer than an Excel cell
    can hold, because such a list cannot be read back intact from the
    workbook.
    """
    # Maximum number of characters an Excel cell can store.
    excel_cell_limit = 32767

    def as_text(value) -> str:
        # Missing cells become "", never the text "nan".
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return ""
        return str(value)

    corpus_df = pd.read_excel(CORPUS_PATH)
    poly_df = pd.read_excel(POLY_PATH)

    # ---- Detect UD columns (adjust here if your names differ)
    # PolyLex columns
    poly_expr_col  = pick_first_existing(poly_df, ["expression", "expr", "mwe", "polylex"]) or pick_by_contains(poly_df, ["express", "mwe"])
    poly_tokens_col = pick_first_existing(poly_df, ["ud_tokens", "tokens"])
    poly_lemmas_col = pick_first_existing(poly_df, ["ud_lemmas", "lemmas", "ud_lemma"])
    poly_upos_col   = pick_first_existing(poly_df, ["ud_upos", "upos"])
    poly_deprel_col = pick_first_existing(poly_df, ["ud_deprel", "deprel"])
    poly_heads_col  = pick_first_existing(poly_df, ["ud_heads", "heads", "ud_head"])

    # Corpus columns
    corp_tokens_col = pick_first_existing(corpus_df, ["ud_tokens", "tokens"])
    corp_lemmas_col = pick_first_existing(corpus_df, ["ud_lemmas", "lemmas", "ud_lemma"])
    corp_upos_col   = pick_first_existing(corpus_df, ["ud_upos", "upos"])
    corp_deprel_col = pick_first_existing(corpus_df, ["ud_deprel", "deprel"])
    corp_heads_col  = pick_first_existing(corpus_df, ["ud_heads", "heads", "ud_head"])

    # Metadata columns in corpus: sentence text, niveau, source
    sent_text_col = (
        pick_first_existing(corpus_df, ["sentence", "sent", "text", "phrase"]) or
        pick_by_contains(corpus_df, ["sentence", "sent", "phrase", "texte"])
    )
    niveau_col = pick_first_existing(corpus_df, ["niveau", "level", "cecr", "cefr"]) or pick_by_contains(corpus_df, ["niveau", "cecr", "cefr", "level"])
    source_col = pick_first_existing(corpus_df, ["source", "src"]) or pick_by_contains(corpus_df, ["source", "src"])

    # Fallback: if sentence text column not found, reconstruct from ud_tokens.
    # split_str maps missing cells to [], so they become "" (converting the
    # column to str first would turn them into the text "nan").
    if sent_text_col is None and corp_tokens_col is not None:
        sent_text_col = "__reconstructed_text__"
        corpus_df[sent_text_col] = corpus_df[corp_tokens_col].apply(lambda s: " ".join(split_str(s)))

    # ---- Build patterns
    patterns = []
    for _, r in tqdm(poly_df.iterrows(), total=len(poly_df), desc="Building patterns"):
        pat = build_pattern(r, poly_expr_col, poly_tokens_col, poly_lemmas_col, poly_upos_col, poly_deprel_col, poly_heads_col)
        if pat:
            patterns.append(pat)

    print(f"Patterns loaded: {len(patterns)}")

    # ---- Aggregate results: expression -> occurrences
    agg: Dict[str, List[Dict]] = defaultdict(list)

    for sent_idx, row in tqdm(corpus_df.iterrows(), total=len(corpus_df), desc="Projecting (aggregate by expression)"):
        sent_ud = parse_ud_row(row, corp_tokens_col, corp_lemmas_col, corp_upos_col, corp_deprel_col, corp_heads_col)
        if sent_ud["n"] == 0:
            continue

        sent_keys = set(zip(sent_ud["lemmas"], sent_ud["upos"]))

        sent_text = as_text(row.get(sent_text_col, "")) if sent_text_col else ""
        niveau = as_text(row.get(niveau_col, "")) if niveau_col else ""
        source = as_text(row.get(source_col, "")) if source_col else ""

        for pat in patterns:
            # quick prefilter
            if not pat["keys"].issubset(sent_keys):
                continue

            # store one record per match
            for m in find_matches(pat, sent_ud):
                agg[pat["expression"]].append({
                    "sent_id": int(sent_idx),
                    "sentence": sent_text,
                    "niveau": niveau,
                    "source": source,
                    "token_ids_1based": m["token_ids_1based"],
                    "span_minmax_1based": m["span_minmax_1based"],
                })

    # ---- Build output table (one row per expression)
    rows = []
    oversized = []
    for expr, occs in agg.items():
        occ_json = json.dumps(occs, ensure_ascii=False)
        if len(occ_json) > excel_cell_limit:
            oversized.append(expr)
        rows.append({
            "expression": expr,
            "occurrences_count": len(occs),
            # full list as JSON
            "occurrences": occ_json,
            # quick preview: up to first 5 sentences
            "examples_5": " ||| ".join([o["sentence"] for o in occs[:5]]),
        })

    if oversized:
        print(
            f"Warning: {len(oversized)} expression(s) have an occurrences JSON longer than "
            f"{excel_cell_limit} characters (Excel cell limit); it will not be readable "
            f"back as valid JSON: {oversized}"
        )

    # Explicit columns keep sort_values working when nothing matched.
    out_df = pd.DataFrame(
        rows, columns=["expression", "occurrences_count", "occurrences", "examples_5"]
    ).sort_values("occurrences_count", ascending=False)
    out_df.to_excel(OUT_PATH, index=False)
    print(f"Saved: {OUT_PATH}")


if __name__ == "__main__":
    main()


Building patterns: 100%|██████████| 2260/2260 [00:00<00:00, 13050.30it/s]


Patterns loaded: 2260


Projecting (aggregate by expression): 100%|██████████| 49263/49263 [00:22<00:00, 2220.19it/s]


Saved: projected_mwes_by_expression.xlsx


In [None]:
import json
import pandas as pd
from tqdm import tqdm

# Explode the per-expression workbook into one row per occurrence, then
# build count pivots by niveau and by source.
IN_PATH = "projected_mwes_by_expression.xlsx"
OUT_PATH = "projected_mwes_occurrence_level.xlsx"

df = pd.read_excel(IN_PATH)

rows = []

for _, r in tqdm(df.iterrows(), total=len(df)):
    expr = r.get("expression", "")
    occ_raw = r.get("occurrences", "")

    if pd.isna(occ_raw) or not str(occ_raw).strip():
        continue

    # The cell is expected to hold a JSON list of occurrence objects. A cell
    # that does not parse is reported and the expression is skipped; no
    # partial recovery is attempted.
    try:
        occ_list = json.loads(str(occ_raw))
    except json.JSONDecodeError:
        print("⚠️ JSON error for expression:", expr)
        continue

    for occ in occ_list:
        rows.append({
            "expression": expr,
            "sent_id": occ.get("sent_id"),
            "sentence": occ.get("sentence"),
            "niveau": occ.get("niveau"),
            "source": occ.get("source"),
            "token_ids_1based": " ".join(map(str, occ.get("token_ids_1based", []))),
            "span_minmax_1based": " ".join(map(str, occ.get("span_minmax_1based", []))),
        })

out_df = pd.DataFrame(rows)

# Normalise missing niveau/source values to empty strings before saving, so
# the workbook is written once, in its final form.
out_df["niveau"] = out_df["niveau"].fillna("")
out_df["source"] = out_df["source"].fillna("")

out_df.to_excel(OUT_PATH, index=False)
print(f"Saved: {OUT_PATH}")

# Occurrence counts per expression, one column per niveau value.
pivot_niveau = out_df.pivot_table(
    index="expression",
    columns="niveau",
    values="sent_id",
    aggfunc="count",
    fill_value=0
)
pivot_niveau.to_excel("mwe_by_niveau_pivot.xlsx")

# Occurrence counts per expression, one column per source value.
pivot_source = out_df.pivot_table(
    index="expression",
    columns="source",
    values="sent_id",
    aggfunc="count",
    fill_value=0
)
pivot_source.to_excel("mwe_by_source_pivot.xlsx")


100%|██████████| 1758/1758 [00:00<00:00, 10772.19it/s]


⚠️ JSON error for expression: être à même de
⚠️ JSON error for expression: être d'accord avec
⚠️ JSON error for expression: faire ça
⚠️ JSON error for expression: faire partie
⚠️ JSON error for expression: vouloir bien
⚠️ JSON error for expression: famille de quatre enfants
⚠️ JSON error for expression: avoir un poil dans la main
⚠️ JSON error for expression: avoir le cœur sur la main
⚠️ JSON error for expression: ne pas avoir froid aux yeux
⚠️ JSON error for expression: avoir la tête sur les épaules
⚠️ JSON error for expression: avoir la dent dure
⚠️ JSON error for expression: faire la tête
⚠️ JSON error for expression: avoir le bras long
Saved: projected_mwes_occurrence_level.xlsx
Saved: projected_mwes_occurrence_level.xlsx


In [None]:
import json
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
from typing import Optional, List, Dict

# Input workbook with the sentence-level corpus and its UD columns.
CORPUS_PATH = "corpus_sent_level_with_ud_stanza.xlsx"
# Input workbook with the PolyLex expressions and their UD columns.
POLY_PATTERNS_PATH = "polylex_with_ud_stanza.xlsx"

# If you have a separate polylex_maj file, put its path here.
# Otherwise leave None: the script will look for the resolved annotation in POLY_PATTERNS_PATH.
POLY_MAJ_PATH = None  # e.g. "/mnt/data/polylex_maj.xlsx"

# Output workbooks written by main().
OUT_WIDE_PATH = "mwe_occurrences_wide.xlsx"
OUT_PIVOT_PATH = "mwe_par_niveau_pivot.xlsx"

# Practical limits (to avoid breaking Excel)
MAX_OCC_COLS = 500        # max number of occ_1..occ_N columns per expression
MAX_SENT_CHARS = 250      # truncate the sentence in occ_k to stay readable (and stay under 32767 chars)


# ----------------------------
# Utils
# ----------------------------
def split_str(s) -> List[str]:
    """Split ``s`` on whitespace; missing values and blank text give ``[]``.

    ``None`` and float NaN count as missing. Any other value is converted
    with ``str`` before splitting.
    """
    missing = s is None or (isinstance(s, float) and pd.isna(s))
    if missing:
        return []
    # split() with no separator drops surrounding whitespace and yields []
    # for blank text.
    return str(s).split()

def parse_heads(s) -> List[int]:
    """Turn a whitespace-separated head sequence into a list of ints."""
    items = split_str(s)
    return list(map(int, items))

def pick_first_existing(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    """Return the column of ``df`` matching the first candidate name that exists.

    Matching is case-insensitive. Column labels go through ``str`` before
    being lower-cased, so non-string labels (for example a numeric header)
    do not raise ``AttributeError``.

    Args:
        df: frame whose columns are searched.
        candidates: names to try, in priority order.

    Returns:
        The original column label, or ``None`` when no candidate matches.
    """
    lower_map = {str(c).lower(): c for c in df.columns}
    for cand in candidates:
        wanted = cand.lower()
        if wanted in lower_map:
            return lower_map[wanted]
    return None

def pick_by_contains(df: pd.DataFrame, substrings: List[str]) -> Optional[str]:
    """Return the first column whose lower-cased name contains any substring.

    Columns are scanned in frame order. Labels go through ``str`` before
    being lower-cased so that non-string labels do not raise
    ``AttributeError``. Substrings are compared as given, so they are
    expected to be lower-case.

    Returns:
        The original column label, or ``None`` when nothing matches.
    """
    for c in df.columns:
        name = str(c).lower()
        if any(s in name for s in substrings):
            return c
    return None

def safe_text(x) -> str:
    """Return ``str(x)``, or ``""`` when ``x`` is ``None`` or a float NaN."""
    is_nan = isinstance(x, float) and pd.isna(x)
    return "" if (x is None or is_nan) else str(x)

def trim(s: str, n: int) -> str:
    """Cut the text of ``s`` to ``n`` characters, appending "…" when cut.

    ``s`` is first normalised with ``safe_text``; text of at most ``n``
    characters is returned unchanged.
    """
    text = safe_text(s)
    if len(text) <= n:
        return text
    return text[:n] + "…"


# ----------------------------
# UD parsing (expects whitespace-joined sequences)
# ----------------------------
def parse_ud_row(row: pd.Series,
                 col_tokens: str,
                 col_lemmas: str,
                 col_upos: str,
                 col_deprel: str,
                 col_heads: str) -> Dict:
    """Read the UD columns of ``row`` into equally long lists.

    A falsy column name is treated as an absent (empty) column. Sequences
    shorter than the longest one are padded: ``"_"`` for the string columns,
    ``0`` for heads.

    Returns:
        A dict with ``n``, ``tokens``, ``lemmas``, ``upos``, ``deprel`` and
        ``heads``; ``n`` is 0 and the lists are empty when nothing was read.
    """
    def cell_items(col):
        return split_str(row.get(col, "")) if col else []

    tokens = cell_items(col_tokens)
    lemmas = cell_items(col_lemmas)
    upos = cell_items(col_upos)
    deprel = cell_items(col_deprel)
    heads = parse_heads(row.get(col_heads, "")) if col_heads else []

    n = max(map(len, (tokens, lemmas, upos, deprel, heads)))
    if n == 0:
        return {"n": 0, "tokens": [], "lemmas": [], "upos": [], "deprel": [], "heads": []}

    def fill_to_n(values, fill):
        # n is the maximum length, so this only ever appends.
        return values + [fill] * (n - len(values))

    return {
        "n": n,
        "tokens": fill_to_n(tokens, "_"),
        "lemmas": fill_to_n(lemmas, "_"),
        "upos": fill_to_n(upos, "_"),
        "deprel": fill_to_n(deprel, "_"),
        "heads": fill_to_n(heads, 0),
    }


# ----------------------------
# Pattern extraction (strict lemma+upos + internal head/deprel)
# ----------------------------
def build_pattern(poly_row: pd.Series,
                  col_expr: str,
                  col_tokens: str,
                  col_lemmas: str,
                  col_upos: str,
                  col_deprel: str,
                  col_heads: str) -> Optional[Dict]:
    """Build a matching pattern from one PolyLex row.

    Returns:
        ``None`` when the expression text is empty or the row has no UD
        items, otherwise a dict with ``expression``, ``n``, ``lemmas``,
        ``upos``, ``constraints`` (child index -> ``(head index, deprel)``
        for every item whose head is not 0) and ``keys`` (set of
        ``(lemma, upos)`` pairs).
    """
    expr = safe_text(poly_row.get(col_expr, "")).strip() if col_expr else ""
    ud = parse_ud_row(poly_row, col_tokens, col_lemmas, col_upos, col_deprel, col_heads)
    if ud["n"] == 0 or not expr:
        return None

    # One constraint per non-root item: its head and its relation to it.
    constraints = {
        i: (h, rel)
        for i, (h, rel) in enumerate(zip(ud["heads"], ud["deprel"]), start=1)
        if h != 0
    }

    return {
        "expression": expr,
        "n": ud["n"],
        "lemmas": ud["lemmas"],
        "upos": ud["upos"],
        "constraints": constraints,
        "keys": set(zip(ud["lemmas"], ud["upos"])),
    }


def find_matches(pattern: Dict, sent_ud: Dict, max_matches_per_sent: int = 50) -> List[List[int]]:
    """Find the occurrences of a pattern inside one parsed sentence.

    Pattern items are mapped to distinct sentence tokens with equal lemma and
    UPOS. Every pattern constraint ``child -> (head, deprel)`` whose two ends
    are mapped must hold in the sentence. Constraints are verified from both
    ends, so they are also enforced when the child is placed before its head
    (the search order follows candidate counts, not the tree).

    Args:
        pattern: dict with ``n``, ``lemmas``, ``upos`` and ``constraints``.
        sent_ud: dict with ``n``, ``lemmas``, ``upos``, ``heads``, ``deprel``.
        max_matches_per_sent: stop the search once this many raw matches
            have been collected.

    Returns:
        A list of sorted 1-based token-id lists, without duplicates.
    """
    pn, sn = pattern["n"], sent_ud["n"]
    if pn == 0 or sn == 0 or pn > sn:
        return []

    # sentence index: (lemma, upos) -> positions
    cand_by_key = defaultdict(list)
    for j in range(1, sn + 1):
        cand_by_key[(sent_ud["lemmas"][j - 1], sent_ud["upos"][j - 1])].append(j)

    # candidates per pattern node
    candidates = {}
    for i in range(1, pn + 1):
        key = (pattern["lemmas"][i - 1], pattern["upos"][i - 1])
        cands = cand_by_key.get(key, [])
        if not cands:
            return []
        candidates[i] = cands

    order = sorted(candidates, key=lambda i: len(candidates[i]))
    constraints = pattern["constraints"]

    # Same constraints seen from the head: head -> [(child, rel), ...].
    children_of = defaultdict(list)
    for child, (head, rel) in constraints.items():
        children_of[head].append((child, rel))

    sent_heads = sent_ud["heads"]
    sent_deprel = sent_ud["deprel"]

    used = set()
    mapping = {}
    matches = []

    def arc_ok(child: int, head: int, rel: str) -> bool:
        # Both ends are mapped: the sentence must contain the same arc.
        j_child = mapping[child]
        return sent_heads[j_child - 1] == mapping[head] and sent_deprel[j_child - 1] == rel

    def ok_partial(i: int) -> bool:
        # Constraint from i up to its head, if the head is already placed.
        if i in constraints:
            h_pat, rel = constraints[i]
            if h_pat in mapping and not arc_ok(i, h_pat, rel):
                return False
        # Constraints from already placed children up to i.
        for child, rel in children_of.get(i, ()):
            if child in mapping and not arc_ok(child, i, rel):
                return False
        return True

    def backtrack(k: int):
        if len(matches) >= max_matches_per_sent:
            return
        if k == len(order):
            matches.append(sorted(mapping.values()))
            return
        i = order[k]
        for j in candidates[i]:
            if j in used:
                continue
            mapping[i] = j
            used.add(j)
            if ok_partial(i):
                backtrack(k + 1)
            used.remove(j)
            del mapping[i]

    backtrack(0)

    # dedup
    seen = set()
    uniq = []
    for ids in matches:
        t = tuple(ids)
        if t not in seen:
            seen.add(t)
            uniq.append(ids)
    return uniq


# ----------------------------
# Main
# ----------------------------
def main():
    """Project the PolyLex patterns onto the corpus and save two summary tables.

    Reads ``CORPUS_PATH`` and ``POLY_PATTERNS_PATH`` (plus ``POLY_MAJ_PATH``
    when set), collects one record per pattern occurrence, and writes:

    * ``OUT_WIDE_PATH``: one row per expression with its resolved annotation,
      occurrence count and up to ``MAX_OCC_COLS`` occurrence cells;
    * ``OUT_PIVOT_PATH``: occurrence counts per expression and niveau.

    Nothing is written when no occurrence is found.
    """
    corpus_df = pd.read_excel(CORPUS_PATH)
    poly_df = pd.read_excel(POLY_PATTERNS_PATH)

    # --- Column detection (UD columns)
    poly_expr_col  = pick_first_existing(poly_df, ["expression", "expr", "mwe"]) or pick_by_contains(poly_df, ["express", "mwe"])
    poly_tokens_col = pick_first_existing(poly_df, ["ud_tokens", "tokens"])
    poly_lemmas_col = pick_first_existing(poly_df, ["ud_lemmas", "lemmas", "ud_lemma"])
    poly_upos_col   = pick_first_existing(poly_df, ["ud_upos", "upos"])
    poly_deprel_col = pick_first_existing(poly_df, ["ud_deprel", "deprel"])
    poly_heads_col  = pick_first_existing(poly_df, ["ud_heads", "heads", "ud_head"])

    corp_tokens_col = pick_first_existing(corpus_df, ["ud_tokens", "tokens"])
    corp_lemmas_col = pick_first_existing(corpus_df, ["ud_lemmas", "lemmas", "ud_lemma"])
    corp_upos_col   = pick_first_existing(corpus_df, ["ud_upos", "upos"])
    corp_deprel_col = pick_first_existing(corpus_df, ["ud_deprel", "deprel"])
    corp_heads_col  = pick_first_existing(corpus_df, ["ud_heads", "heads", "ud_head"])

    # metadata columns in corpus
    sent_text_col = (
        pick_first_existing(corpus_df, ["sentence", "sent", "text", "phrase"]) or
        pick_by_contains(corpus_df, ["sentence", "sent", "phrase", "texte"])
    )
    niveau_col = pick_first_existing(corpus_df, ["niveau", "level", "cecr", "cefr"]) or pick_by_contains(corpus_df, ["niveau", "cecr", "cefr", "level"])
    source_col = pick_first_existing(corpus_df, ["source", "src"]) or pick_by_contains(corpus_df, ["source", "src"])

    # Fallback: rebuild the sentence text from the token column. split_str
    # maps missing cells to [], so they become "" (converting the column to
    # str first would turn them into the text "nan").
    if sent_text_col is None and corp_tokens_col is not None:
        sent_text_col = "__reconstructed_text__"
        corpus_df[sent_text_col] = corpus_df[corp_tokens_col].apply(lambda s: " ".join(split_str(s)))

    # --- resolved_annotation mapping (from polylex_maj if provided, else from patterns file)
    poly_maj = pd.read_excel(POLY_MAJ_PATH) if POLY_MAJ_PATH else poly_df

    ann_expr_col = pick_first_existing(poly_maj, ["expression", "expr", "mwe"]) or pick_by_contains(poly_maj, ["express", "mwe"])
    ann_col = pick_first_existing(poly_maj, ["annotation_resolved"]) or pick_by_contains(poly_maj, ["resolved", "annotation"])

    if ann_expr_col is None or ann_col is None:
        # fallback: no annotation available
        expr2ann = {}
    else:
        expr2ann = {
            safe_text(r[ann_expr_col]).strip(): safe_text(r[ann_col]).strip()
            for _, r in poly_maj[[ann_expr_col, ann_col]].dropna().iterrows()
        }

    # --- Build patterns
    patterns = []
    for _, r in tqdm(poly_df.iterrows(), total=len(poly_df), desc="Building patterns"):
        pat = build_pattern(r, poly_expr_col, poly_tokens_col, poly_lemmas_col, poly_upos_col, poly_deprel_col, poly_heads_col)
        if pat:
            patterns.append(pat)

    print(f"Patterns loaded: {len(patterns)}")

    # --- Occurrence-level collection (SAFE, no JSON-in-cell)
    occ_rows = []  # one row per occurrence
    for sent_idx, row in tqdm(corpus_df.iterrows(), total=len(corpus_df), desc="Projecting occurrences"):
        sent_ud = parse_ud_row(row, corp_tokens_col, corp_lemmas_col, corp_upos_col, corp_deprel_col, corp_heads_col)
        if sent_ud["n"] == 0:
            continue

        sent_keys = set(zip(sent_ud["lemmas"], sent_ud["upos"]))
        sent_text = safe_text(row.get(sent_text_col, ""))
        niveau = safe_text(row.get(niveau_col, "")) if niveau_col else ""
        source = safe_text(row.get(source_col, "")) if source_col else ""

        for pat in patterns:
            # quick prefilter
            if not pat["keys"].issubset(sent_keys):
                continue

            for ids in find_matches(pat, sent_ud):
                occ_rows.append({
                    "expression": pat["expression"],
                    "sent_id": int(sent_idx),
                    "sentence": sent_text,
                    "niveau": niveau,
                    "source": source,
                    "token_ids_1based": " ".join(map(str, ids)),
                })

    if not occ_rows:
        # An empty frame has none of the columns used below.
        print("No occurrences found; nothing to save.")
        return

    occ_df = pd.DataFrame(occ_rows)

    # --- Pivot by niveau (counts)
    pivot = occ_df.pivot_table(
        index="expression",
        columns="niveau",
        values="sent_id",
        aggfunc="count",
        fill_value=0
    ).reset_index()

    # --- Wide table: expression + annotation + occurrence_count + occ_1..occ_N
    # Occurrence cell format: [niveau] [source] sentence
    occ_df["occ_cell"] = [
        f"[{niv}] [{src}] {trim(sent, MAX_SENT_CHARS)}"
        for niv, src, sent in zip(occ_df["niveau"], occ_df["source"], occ_df["sentence"])
    ]

    # group occurrences
    grouped = occ_df.groupby("expression", sort=False)["occ_cell"].apply(list).reset_index()
    grouped["occurrence_count"] = grouped["occ_cell"].apply(len)
    grouped["annotation_resolved"] = grouped["expression"].map(lambda e: expr2ann.get(e, ""))

    # Expand to columns occ_1..occ_N (capped)
    max_len = int(min(MAX_OCC_COLS, grouped["occurrence_count"].max()))

    # Build all occ_k columns first and attach them in a single concat:
    # inserting them one by one fragments the frame and makes pandas emit a
    # PerformanceWarning for every column.
    occ_columns = {
        f"occ_{k}": [lst[k - 1] if len(lst) >= k else "" for lst in grouped["occ_cell"]]
        for k in range(1, max_len + 1)
    }
    wide = pd.concat(
        [
            grouped[["expression", "annotation_resolved", "occurrence_count"]],
            pd.DataFrame(occ_columns, index=grouped.index),
        ],
        axis=1,
    )

    # --- Merge pivot with annotation + occurrence_count
    pivot = pivot.merge(
        wide[["expression", "annotation_resolved", "occurrence_count"]],
        on="expression",
        how="left"
    )
    # reorder: expression, annotation_resolved, occurrence_count, then niveaux...
    niveau_cols = [c for c in pivot.columns if c not in ["expression", "annotation_resolved", "occurrence_count"]]
    pivot = pivot[["expression", "annotation_resolved", "occurrence_count"] + sorted(niveau_cols)]

    # Save
    wide.to_excel(OUT_WIDE_PATH, index=False)
    pivot.to_excel(OUT_PIVOT_PATH, index=False)
    print("Saved wide:", OUT_WIDE_PATH)
    print("Saved pivot:", OUT_PIVOT_PATH)
    print("Note: occ columns capped at", MAX_OCC_COLS, "and sentence trimmed to", MAX_SENT_CHARS, "chars.")

if __name__ == "__main__":
    main()


Building patterns: 100%|██████████| 2260/2260 [00:00<00:00, 13235.94it/s]


Patterns loaded: 2260


Projecting occurrences: 100%|██████████| 49263/49263 [00:22<00:00, 2190.89it/s]
  wide[f"occ_{k}"] = grouped["occ_cell"].apply(lambda lst: lst[k-1] if len(lst) >= k else "")
  wide[f"occ_{k}"] = grouped["occ_cell"].apply(lambda lst: lst[k-1] if len(lst) >= k else "")
  wide[f"occ_{k}"] = grouped["occ_cell"].apply(lambda lst: lst[k-1] if len(lst) >= k else "")
  wide[f"occ_{k}"] = grouped["occ_cell"].apply(lambda lst: lst[k-1] if len(lst) >= k else "")
  wide[f"occ_{k}"] = grouped["occ_cell"].apply(lambda lst: lst[k-1] if len(lst) >= k else "")
  wide[f"occ_{k}"] = grouped["occ_cell"].apply(lambda lst: lst[k-1] if len(lst) >= k else "")
  wide[f"occ_{k}"] = grouped["occ_cell"].apply(lambda lst: lst[k-1] if len(lst) >= k else "")
  wide[f"occ_{k}"] = grouped["occ_cell"].apply(lambda lst: lst[k-1] if len(lst) >= k else "")
  wide[f"occ_{k}"] = grouped["occ_cell"].apply(lambda lst: lst[k-1] if len(lst) >= k else "")
  wide[f"occ_{k}"] = grouped["occ_cell"].apply(lambda lst: lst[k-1] if len

Saved wide: mwe_occurrences_wide.xlsx
Saved pivot: mwe_par_niveau_pivot.xlsx
Note: occ columns capped at 500 and sentence trimmed to 250 chars.
