# In [2]:
import pandas as pd
import stanza
from tqdm import tqdm

# ====== CONFIG ======
TEXT_COL = "expression"  # column holding the expressions to annotate; change if needed
INPUT_XLSX = "polylex_maj_16.01.-1.xlsx"
OUTPUT_XLSX = "polylex_with_ud_stanza.xlsx"

# ====== INIT STANZA ======
# One-time setup (run once, then keep commented out):
# stanza.download("fr", processors="tokenize,pos,lemma,depparse")

# Shared French pipeline: tokenization, POS tagging, lemmatization and
# dependency parsing. tokenize_no_ssplit=True is set so the tokenizer does not
# run its own sentence segmentation on the input (see the Stanza docs).
nlp = stanza.Pipeline(
    processors="tokenize,pos,lemma,depparse",
    lang="fr",
    verbose=False,
    tokenize_no_ssplit=True,
)

def normalize_dep_pattern(sent):
    """Return a word-independent signature of a parsed sentence's dependency tree.

    The signature has the form
    ``HEAD=<root upos> | <arc> ; <arc> ; ...[ | case_preps=<p1>,<p2>,...]``
    where each arc is ``deprel(head_upos->dependent_upos)``. Arcs are sorted
    alphabetically, so the result does not depend on word order.

    Args:
        sent: object exposing ``words``, a sequence of items with ``deprel``,
            ``head``, ``upos`` and ``text`` attributes. ``head`` is used as a
            1-based position in ``words``.

    Returns:
        The signature string. ``HEAD=?`` is used when no word has the ``root``
        relation; the ``case_preps`` suffix appears only when at least one
        ``case`` dependent with non-empty text exists.
    """
    tokens = sent.words

    # Part of speech of the first word labelled "root", if any.
    head_label = "HEAD=?"
    for tok in tokens:
        if tok.deprel == "root":
            head_label = f"HEAD={tok.upos}"
            break

    relation_arcs = []
    prepositions = set()
    for tok in tokens:
        # The root itself, and any word without a positive head index, has no
        # governor to report.
        if tok.deprel == "root" or tok.head <= 0:
            continue
        governor = tokens[tok.head - 1]
        relation_arcs.append(f"{tok.deprel}({governor.upos}->{tok.upos})")
        # Keep the actual surface form of case markers, lowercased and deduplicated.
        if tok.deprel == "case" and tok.text:
            prepositions.add(tok.text.lower())

    signature = head_label + " | " + " ; ".join(sorted(relation_arcs))
    if prepositions:
        signature += f" | case_preps={','.join(sorted(prepositions))}"
    return signature

# Columns produced by annotate_one, in output order.
_UD_COLUMNS = (
    "ud_tokens",
    "ud_lemmas",
    "ud_upos",
    "ud_xpos",
    "ud_feats",
    "ud_deprel",
    "ud_heads",
    "ud_dep_arcs",
    "ud_pos_pattern",
    "ud_dep_pattern",
)


def annotate_one(text):
    """Parse one expression with the module-level ``nlp`` pipeline.

    Args:
        text: the cell value to annotate. Anything that is not missing is
            converted with ``str()`` and stripped before parsing.

    Returns:
        A dict with one string per key in ``_UD_COLUMNS``. Per-word values are
        joined with a single space, except ``ud_feats`` (joined with " | ") and
        ``ud_dep_arcs`` (joined with " ; "). Every value is the empty string
        when the input is missing (None / NaN / NA), blank, or yields no
        sentence. Only the first sentence of the parse is used.
    """
    # Empty spreadsheet cells arrive as NaN/None. Passing them through str()
    # would produce the literal words "nan"/"None", which would then be parsed
    # as if they were real expressions.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return dict.fromkeys(_UD_COLUMNS, "")

    text = str(text).strip()
    if not text:
        return dict.fromkeys(_UD_COLUMNS, "")

    doc = nlp(text)
    # Guard against a parse with no sentences instead of raising IndexError.
    if not doc.sentences:
        return dict.fromkeys(_UD_COLUMNS, "")
    sent = doc.sentences[0]
    words = sent.words

    # ud_upos and ud_pos_pattern intentionally carry the same value.
    upos_seq = " ".join(w.upos for w in words)

    return {
        "ud_tokens": " ".join(w.text for w in words),
        "ud_lemmas": " ".join(w.lemma or "" for w in words),
        "ud_upos": upos_seq,
        "ud_xpos": " ".join(w.xpos or "" for w in words),
        "ud_feats": " | ".join(w.feats or "" for w in words),
        "ud_deprel": " ".join(w.deprel for w in words),
        "ud_heads": " ".join(str(w.head) for w in words),
        # head is a 1-based position in the sentence; 0 (or less) means no governor.
        "ud_dep_arcs": " ; ".join(
            f"{w.text}<-{w.deprel}-({words[w.head - 1].text if w.head > 0 else 'ROOT'})"
            for w in words
        ),
        "ud_pos_pattern": upos_seq,
        "ud_dep_pattern": normalize_dep_pattern(sent),
    }

# ====== RUN ======
# Read the spreadsheet, annotate every value in TEXT_COL (with a progress bar),
# then save the original columns side by side with the annotation columns.
df = pd.read_excel(INPUT_XLSX)

results = [
    annotate_one(text)
    for text in tqdm(df[TEXT_COL], desc="Annotating expressions", unit="expr")
]

ann_df = pd.DataFrame(results)
out = pd.concat([df, ann_df], axis=1)
out.to_excel(OUTPUT_XLSX, index=False)

print("Saved:", OUTPUT_XLSX)


# Output:
# Annotating expressions: 100%|██████████| 2268/2268 [05:26<00:00,  6.95expr/s]
#
#
# Saved: polylex_with_ud_stanza.xlsx
