In [None]:
import pandas as pd
import re
from tqdm import tqdm

# Register `progress_apply` on pandas objects so the normalisation step shows a progress bar.
tqdm.pandas()

# Workbooks consumed and produced by the projection step.
# NOTE(review): the restoration cell further down in this notebook writes
# "mwe_occurrences_wide_restored_metadata_v3.xlsx", while this cell reads the
# "_v4" file -- confirm where v4 comes from.
OCC_PATH = "mwe_occurrences_wide_restored_metadata_v4.xlsx"
CORPUS_PATH = "corpus_with_mwes_by_level.xlsx"
OUT_PATH = "corpus_projected_with_types.xlsx"

# Load the occurrences table first, then the corpus table.
occ, corpus = (pd.read_excel(workbook) for workbook in (OCC_PATH, CORPUS_PATH))

# ---------------- helpers ----------------
def normalize(s: str) -> str:
    """Return a canonical matching key for a sentence.

    The text is stripped, lower-cased, and every run of whitespace is
    collapsed to a single space. Any non-string input (e.g. NaN from an
    empty Excel cell) yields the empty string.
    """
    if isinstance(s, str):
        lowered = s.strip().lower()
        return re.sub(r"\s+", " ", lowered)
    return ""

# Two leading bracket blocks ("[level] [source]") followed by the sentence.
# DOTALL lets the sentence span several lines inside one Excel cell.
_OCC_CELL = re.compile(r"\s*\[[^\]]*\]\s*\[([^\]]*)\](.*)", re.DOTALL)


def parse_occ(cell):
    """Split an occurrence cell into its source and sentence.

    Expected format: ``[level] [source] sentence``.

    The two bracket blocks must open the cell and be separated only by
    whitespace. A cell with a single leading block whose sentence merely
    contains brackets (e.g. ``[B1] text [x] and [y]``) is therefore rejected
    instead of having part of the sentence mistaken for the source.

    Parameters
    ----------
    cell : object
        Raw cell value; anything that is not a non-blank string is rejected.

    Returns
    -------
    tuple[str, str] | None
        ``(source, sentence)``, both stripped, or ``None`` when the cell is
        empty or does not start with two bracket blocks.
    """
    if not isinstance(cell, str) or not cell.strip():
        return None
    found = _OCC_CELL.match(cell)
    if found is None:
        return None
    # Drop stray "[" left inside the source block (e.g. "[[src]").
    source = found.group(1).replace("[", "").strip()
    sent = found.group(2).strip()
    return source, sent

def ann_type(s):
    """Extract the annotation type from an ``annotation_resolved`` value.

    The type is the text that precedes the first opening parenthesis,
    stripped of surrounding whitespace. Non-string input gives ``""``.
    """
    if isinstance(s, str):
        head, _, _ = s.partition("(")
        return head.strip()
    return ""

# ---------------- find columns ----------------
# Column labels are passed through str() before lower-casing so that a
# non-string header (e.g. a numeric column name) cannot crash the lookup.

# sentence column in corpus (required)
sent_col = next(
    (c for c in corpus.columns if str(c).lower() in ["sentence", "sent", "phrase", "text", "texte"]),
    None
)
if sent_col is None:
    raise ValueError("Je ne trouve pas la colonne sentence/phrase/text dans le corpus.")

# source column in corpus (optional): first column whose name contains "source"
source_col = next((c for c in corpus.columns if "source" in str(c).lower()), None)

# sent_id: keep the existing column if there is one, otherwise create a
# sequential id (0..N-1) as the first column
sent_id_col = next(
    (c for c in corpus.columns if str(c).lower() in ["sent_id", "sentence_id", "id_sent", "id"]),
    None
)
if sent_id_col is None:
    sent_id_col = "sent_id"
    corpus.insert(0, sent_id_col, range(len(corpus)))

# occurrence columns (occ_1, occ_2, ...) and required columns in the occurrences file
occ_cols = [c for c in occ.columns if re.fullmatch(r"occ_\d+", str(c))]

# The expression column is required: without it every row would be skipped
# and the projection would silently come out empty.
expr_col = next((c for c in occ.columns if str(c).lower() in ["expression", "expr", "mwe"]), None)
if expr_col is None:
    raise ValueError("Je ne trouve pas la colonne expression/expr/mwe dans le fichier occurrences.")

ann_col = next((c for c in occ.columns if str(c).lower() == "annotation_resolved"), None)
if ann_col is None:
    raise ValueError("Je ne trouve pas la colonne annotation_resolved dans le fichier occurrences.")

# ---------------- build corpus index (by normalized sentence) ----------------
print("Normalisation des phrases du corpus...")
corpus["_norm"] = corpus[sent_col].progress_apply(normalize)

print("Indexation du corpus...")
index = {}  # normalized sentence -> list of corpus row positions (0..N-1)
for i, norm_sent in tqdm(enumerate(corpus["_norm"].tolist()), total=len(corpus), desc="Indexing corpus"):
    index.setdefault(norm_sent, []).append(i)

# ---------------- collect matches from occurrences ----------------
print("Lecture des occurrences (wide -> matches)...")
# One dict per corpus row position: {expression: annotation type}
proj = [dict() for _ in range(len(corpus))]

# Loop invariants for the substring fallback, computed once instead of per miss.
norm_sentences = corpus["_norm"]
source_text = corpus[source_col].astype(str) if source_col else None
fallback_cache = {}  # (source or None, needle) -> list of row positions

for _, r in tqdm(occ.iterrows(), total=len(occ), desc="Scanning expressions"):
    raw_expr = r.get(expr_col, "")
    # An empty Excel cell arrives as NaN; str(NaN) would yield the bogus
    # expression "nan", so skip such rows explicitly.
    if not isinstance(raw_expr, str) and pd.isna(raw_expr):
        continue
    expr = str(raw_expr).strip()
    typ = ann_type(r.get(ann_col, ""))

    if not expr:
        continue

    for c in occ_cols:
        parsed = parse_occ(r.get(c, ""))
        if not parsed:
            continue

        occ_source, occ_sent = parsed
        key = normalize(occ_sent)
        if not key:
            continue

        # 1) exact match on the normalized sentence
        hit_rows = index.get(key)

        # 2) cautious fallback when there is no exact match: literal substring
        #    search on the first 120 characters of the normalized sentence,
        #    restricted to rows whose source contains the occurrence's source
        #    whenever both are available (to limit false positives).
        if not hit_rows:
            needle = key[:120]
            use_source = (
                source_text is not None
                and isinstance(occ_source, str)
                and bool(occ_source.strip())
            )
            cache_key = (occ_source if use_source else None, needle)
            hit_rows = fallback_cache.get(cache_key)
            if hit_rows is None:
                mask = norm_sentences.str.contains(needle, regex=False, na=False)
                if use_source:
                    mask &= source_text.str.contains(occ_source, regex=False, na=False)
                # Convert the boolean mask to row *positions*: `proj` is a
                # positional list, so index labels must not be used here.
                hit_rows = mask.to_numpy(dtype=bool).nonzero()[0].tolist()
                fallback_cache[cache_key] = hit_rows

        for i in hit_rows:
            # keep the first type seen for an expression on a given row
            proj[i].setdefault(expr, typ)

# ---------------- build MWEs column: "expr (type)" ----------------
def format_expr_type(d: dict) -> str:
    """Render an ``{expression: type}`` mapping as one display string.

    Entries are sorted by expression and joined with ``" | "``. Each entry
    is written ``expr (type)``, or just ``expr`` when its type is empty.
    An empty (or falsy) mapping gives ``""``.
    """
    if not d:
        return ""
    rendered = []
    for expr_name in sorted(d):
        type_label = d[expr_name]
        if type_label:
            rendered.append(f"{expr_name} ({type_label})")
        else:
            rendered.append(f"{expr_name}")
    return " | ".join(rendered)

# One formatted "expr (type) | ..." string per corpus row.
mwes_col_name = "MWEs_projected"
corpus[mwes_col_name] = [format_expr_type(d) for d in proj]

# ---------------- place MWEs column right after sent_id ----------------
# The temporary matching key is not part of the output.
corpus = corpus.drop(columns=["_norm"])

# Take the new column out of the column list, then re-insert it immediately
# after the sentence-id column; all other columns keep their relative order.
cols = list(corpus.columns)
cols.remove(mwes_col_name)
cols.insert(cols.index(sent_id_col) + 1, mwes_col_name)
corpus = corpus[cols]

# ---------------- save without changing order ----------------
corpus.to_excel(OUT_PATH, index=False)
print("Saved:", OUT_PATH)


Normalisation des phrases du corpus...


100%|██████████| 49255/49255 [00:00<00:00, 115568.87it/s]


Indexation du corpus...


Indexing corpus: 100%|██████████| 49255/49255 [00:00<00:00, 674268.40it/s]


Lecture des occurrences (wide -> matches)...


Scanning expressions: 100%|██████████| 1881/1881 [00:46<00:00, 40.50it/s]


Saved: corpus_projected_with_types.xlsx


In [None]:
import pandas as pd

# Input: the workbook written by the projection cell; output: the same rows, sorted.
IN_PATH = "corpus_projected_with_types.xlsx"
OUT_PATH = "corpus_projected_with_types_sorted.xlsx"

df = pd.read_excel(IN_PATH)

# 1) Locate the sentence-id column. Labels go through str() so that a
#    non-string header cannot crash the lookup.
sid_col = next(
    (c for c in df.columns if str(c).lower() in ["sentence_id", "sent_id", "id_sentence", "id_sent"]),
    None,
)
if sid_col is None:
    # The projection cell also accepts a plain "id" column as the sentence
    # id; accept it here too, but only as a fallback so that files which
    # already matched one of the names above are handled exactly as before.
    sid_col = next((c for c in df.columns if str(c).lower() == "id"), None)
if sid_col is None:
    raise ValueError("Не нашла колонку sentence_id / sent_id.")

# Document column (optional): first column whose name contains "source" or "document".
doc_col = next(
    (c for c in df.columns if "source" in str(c).lower() or "document" in str(c).lower()),
    None,
)

# 2) Stable sort, so rows with equal keys keep their current relative order.
#    When a document column exists, sort by document first, then by sentence id.
sort_keys = [doc_col, sid_col] if doc_col is not None else [sid_col]
df_sorted = df.sort_values(sort_keys, kind="stable")

df_sorted.to_excel(OUT_PATH, index=False)
print("Saved:", OUT_PATH)


Saved: corpus_projected_with_types_sorted.xlsx


In [None]:
import pandas as pd
import re
from tqdm import tqdm

# Workbooks consumed and produced by the metadata-restoration step.
# NOTE(review): the projection cell earlier in this notebook reads
# "mwe_occurrences_wide_restored_metadata_v4.xlsx", not the "_v3" file
# written here -- confirm which version is the intended hand-off.
OCC_IN = "mwe_occurrences_wide_cleaned_rightmost_level.xlsx"
CORPUS = "corpus_with_mwes_by_level.xlsx"
OCC_OUT = "mwe_occurrences_wide_restored_metadata_v3.xlsx"

# Load the occurrences table first, then the corpus table.
occ, corpus = (pd.read_excel(workbook) for workbook in (OCC_IN, CORPUS))

# --- corpus columns ---
# Labels go through str() so a non-string header cannot crash the lookup.
# The sentence column is required: fail with an explicit message rather than
# the bare StopIteration that next() raises without a default.
sent_col = next(
    (c for c in corpus.columns
     if str(c).lower() in ["sentence", "sent", "phrase", "text", "texte"]),
    None,
)
if sent_col is None:
    raise ValueError("Je ne trouve pas la colonne sentence/phrase/text dans le corpus.")

# Optional metadata columns: first column whose name contains "source" / "niveau".
source_col = next((c for c in corpus.columns if "source" in str(c).lower()), None)
niveau_col = next((c for c in corpus.columns if "niveau" in str(c).lower()), None)

# --- normalize ---
def norm(s):
    """Return the matching key for a sentence.

    Strings are stripped, lower-cased, and have whitespace runs collapsed
    to one space; any non-string value maps to ``""``.
    """
    if isinstance(s, str):
        trimmed = s.strip().lower()
        return re.sub(r"\s+", " ", trimmed)
    return ""

# --- detect [] blocks at start ---
# `two_blocks`: the text opens with two bracket blocks, "[...] [...]".
# `one_block`: the text opens with at least one bracket block, "[...]".
two_blocks = re.compile(r"^\s*\[[^\]]*\]\s*\[[^\]]*\]")
one_block = re.compile(r"^\s*\[[^\]]*\]")

def strip_one_block(s):
    """Remove the leading ``[...]`` block from ``s`` (if any) and strip the rest."""
    remainder = one_block.sub("", s, count=1)
    return remainder.strip()

# --- index corpus ---
# Missing sentences are keyed as "" rather than the literal "nan" that a
# plain astype(str) would produce; other non-string values are stringified.
corpus["_norm"] = corpus[sent_col].map(lambda v: "" if pd.isna(v) else norm(str(v)))

index = {}  # normalized sentence -> list of corpus row positions (0..N-1)
for i, k in enumerate(corpus["_norm"]):
    index.setdefault(k, []).append(i)

occ_cols = [c for c in occ.columns if re.fullmatch(r"occ_\d+", str(c))]

# Work on a copy so the loaded occurrences table stays untouched.
new_occ = occ.copy()

for r_i, row in tqdm(occ.iterrows(), total=len(occ), desc="Restoring"):

    for c in occ_cols:
        cell = row.get(c, "")

        if not isinstance(cell, str) or not cell.strip():
            continue

        s = cell.strip()

        # Already complete: the cell opens with two [] [] blocks.
        if two_blocks.match(s):
            continue

        # Cells without any leading [] block are left as they are.
        if not one_block.match(s):
            continue

        # Exactly one leading [] block: strip it and look the quote up in the corpus.
        quote = strip_one_block(s)
        key = norm(quote)

        # Nothing left after the block: there is no text to look up, and an
        # empty key must not be matched against empty corpus sentences.
        if not key:
            continue

        hits = index.get(key, [])

        # Fallback: literal substring search on the first 120 characters.
        if not hits:
            mask = corpus["_norm"].str.contains(key[:120], regex=False, na=False)
            # Row positions, consistent with the positions stored in `index`.
            hits = mask.to_numpy(dtype=bool).nonzero()[0].tolist()

        if not hits:
            continue

        # Metadata is taken from the first matching corpus row.
        i0 = hits[0]
        src = str(corpus[source_col].iat[i0]) if source_col else ""
        lvl = str(corpus[niveau_col].iat[i0]) if niveau_col else ""

        new_occ.at[r_i, c] = f"[{lvl}] [{src}] {quote}"

new_occ.drop(columns="_norm", errors="ignore").to_excel(OCC_OUT, index=False)

print("Saved:", OCC_OUT)


Restoring: 100%|██████████| 1881/1881 [00:01<00:00, 1037.28it/s]


Saved: mwe_occurrences_wide_restored_metadata_v3.xlsx
