In [None]:
import zipfile
import csv
import re

# Archive holding the corpus and the sentence-level CSV derived from it.
ZIP_PATH = "corpus.zip"
OUT = "corpus_sent_level.csv"

# Split on whitespace that directly follows ., !, ? or …; the lookbehind
# keeps the punctuation mark attached to the sentence it closes.
sent_split = re.compile(r'(?<=[.!?…])\s+')


def iter_sentences_streaming(text):
    """Yield the sentences of ``text`` one at a time.

    The text is consumed line by line. Each stripped line is appended to a
    pending fragment (joined with a single space), so a sentence wrapped
    over several lines is reassembled before being emitted. Whatever is
    left after the last line is yielded as a final sentence, even without
    closing punctuation. Empty pieces are never yielded.
    """
    pending = ""
    for raw_line in text.splitlines():
        pending = f"{pending} {raw_line.strip()}"
        # Every piece but the last is terminated by punctuation; the last
        # one may still continue on the next line, so it is carried over.
        *finished, pending = sent_split.split(pending)
        yield from (piece for piece in map(str.strip, finished) if piece)

    leftover = pending.strip()
    if leftover:
        yield leftover

# Build the sentence-level CSV: one row per sentence of every .txt file in
# the archive, labelled with the first path component of the file (the level).
with zipfile.ZipFile(ZIP_PATH, "r") as z, \
     open(OUT, "w", encoding="utf-8", newline="") as out:

    writer = csv.writer(out)
    writer.writerow(["sentence", "level", "source_file", "sent_id"])

    for name in z.namelist():
        if not name.endswith(".txt"):
            continue

        # Archives created on macOS carry AppleDouble side entries such as
        # "__MACOSX/A1/._01.txt". They are binary metadata, not corpus text,
        # and would otherwise be written out as garbage rows.
        if name.startswith("__MACOSX/") or name.rsplit("/", 1)[-1].startswith("._"):
            continue

        # name = "A1/odyssee_a1/01.txt"
        # NOTE: a .txt stored at the archive root has no folder component, so
        # its own file name would end up in the level column.
        level = name.split("/")[0]  # A1, B2, C1...

        # "utf-8-sig" drops a leading byte-order mark, which plain "utf-8"
        # keeps as U+FEFF glued to the first sentence (str.strip() does not
        # remove it). Undecodable bytes are still skipped, as before.
        text = z.read(name).decode("utf-8-sig", errors="ignore")
        for i, sent in enumerate(iter_sentences_streaming(text)):
            writer.writerow([sent, level, name, i])


In [None]:
import zipfile

# Concatenate every .txt file of the archive into one plain-text file.
# NOTE: this rebinds OUT, which the first cell uses for the CSV path.
ZIP_PATH = "corpus.zip"
OUT = "all_texts.txt"

with zipfile.ZipFile(ZIP_PATH, "r") as z, \
     open(OUT, "w", encoding="utf-8") as out:

    for name in z.namelist():
        if not name.endswith(".txt"):
            continue

        # Skip macOS AppleDouble side entries ("__MACOSX/.../._01.txt"):
        # they end in .txt but hold binary metadata, not corpus text.
        if name.startswith("__MACOSX/") or name.rsplit("/", 1)[-1].startswith("._"):
            continue

        # "utf-8-sig" drops a leading byte-order mark so that no U+FEFF
        # ends up in the middle of the concatenated output. Undecodable
        # bytes are still skipped, as before.
        text = z.read(name).decode("utf-8-sig", errors="ignore")
        out.write(text)
        out.write("\n\n")  # blank line between consecutive documents

In [None]:
# -*- coding: utf-8 -*-
"""
Annotate corpus_sent_level.csv with UD fields using Stanza,
matching the annotation style in polylex_with_ud_stanza.xlsx.

Output: corpus_sent_level_with_ud_stanza.xlsx
"""
!pip install stanza
import pandas as pd
import stanza
from tqdm.auto import tqdm

# --- Files: sentence-level CSV in, annotated workbook out ---
IN_CSV = "corpus_sent_level.csv"
OUT_XLSX = "corpus_sent_level_with_ud_stanza.xlsx"

# --- UD columns appended to every row, in output order ---
UD_COLS = [
    "ud_tokens", "ud_lemmas", "ud_upos",
    "ud_feats", "ud_deprel", "ud_heads",
    "ud_dep_arcs", "ud_pos_pattern", "ud_dep_pattern",
]

# --- 1) Stanza models (download once) ---
stanza.download("fr")

# --- 2) Pipeline ---
_PIPELINE_OPTIONS = {
    "lang": "fr",
    "processors": "tokenize,pos,lemma,depparse",
    "tokenize_no_ssplit": True,  # sentences already provided row-wise
    "use_gpu": False,
}
nlp = stanza.Pipeline(**_PIPELINE_OPTIONS)


def serialize_doc_like_xlsx(doc):
    """Flatten the first sentence of a Stanza doc into the UD_COLS string fields.

    Only ``doc.sentences[0]`` is serialized. A doc without sentences yields
    an empty string for every column in UD_COLS. Missing (None) word
    attributes become "" and a missing head becomes 0.

    Returned keys:
      ud_tokens / ud_lemmas / ud_upos / ud_deprel / ud_heads: space-joined
      ud_feats:       joined with " | "
      ud_dep_arcs:    "tok<-deprel-(headtok) ; ..."
      ud_pos_pattern: same string as ud_upos
      ud_dep_pattern: "HEAD=<root upos> | rel(HEAD_UPOS->DEP_UPOS) ; ..."
                      (sorted), optionally followed by " | case_preps=a,b"
    """
    if not doc.sentences:
        return {k: "" for k in UD_COLS}

    words = doc.sentences[0].words  # UD-like word objects

    def column(attr):
        # One value per word, with None replaced by "".
        values = (getattr(w, attr) for w in words)
        return ["" if v is None else v for v in values]

    tokens = column("text")
    lemmas = column("lemma")
    upos = column("upos")
    feats = column("feats")
    deprel = column("deprel")
    heads = [0 if w.head is None else int(w.head) for w in words]

    def at_head(values, head_id):
        # Heads are 1-based; 0 and any out-of-range id resolve to "ROOT".
        if 1 <= head_id <= len(values):
            return values[head_id - 1]
        return "ROOT"

    # ud_dep_arcs: "tok<-deprel-(headtok) ; ..."
    ud_dep_arcs = " ; ".join(
        f"{tok}<-{rel}-({at_head(tokens, head)})"
        for tok, rel, head in zip(tokens, deprel, heads)
    )

    # UPOS of the first word whose head is 0, or "" when there is none.
    root_upos = next((pos for pos, head in zip(upos, heads) if head == 0), "")

    # One (rel, head UPOS, dependent UPOS) triple per non-root word, plus the
    # distinct surface forms of "case" dependents in order of first appearance.
    relations = []
    case_forms = {}
    for tok, pos, rel, head in zip(tokens, upos, deprel, heads):
        if head == 0:
            continue
        rel = rel or ""
        relations.append((rel, at_head(upos, head) or "", pos or ""))
        if rel == "case":
            case_forms.setdefault(tok, None)
    relations.sort()

    # ud_dep_pattern: segments separated by " | ".
    segments = [f"HEAD={root_upos}"]
    if relations:
        segments.append(" ; ".join(f"{rel}({h}->{d})" for rel, h, d in relations))
    if case_forms:
        segments.append("case_preps=" + ",".join(case_forms))

    upos_line = " ".join(upos)
    return {
        "ud_tokens": " ".join(tokens),
        "ud_lemmas": " ".join(lemmas),
        "ud_upos": upos_line,
        "ud_feats": " | ".join(feats),
        "ud_deprel": " ".join(deprel),
        "ud_heads": " ".join(map(str, heads)),
        "ud_dep_arcs": ud_dep_arcs,
        "ud_pos_pattern": upos_line,
        "ud_dep_pattern": " | ".join(segments),
    }


def annotate_text(text: str):
    """Run the Stanza pipeline on one sentence and return its UD columns.

    Missing values (as detected by ``pd.isna``) and strings that are empty
    after stripping are not parsed: every column in UD_COLS is returned as
    an empty string instead.
    """
    cleaned = str(text).strip() if not pd.isna(text) else ""
    if cleaned:
        return serialize_doc_like_xlsx(nlp(cleaned))
    return {k: "" for k in UD_COLS}


def main():
    """Annotate every sentence of IN_CSV with UD columns and save OUT_XLSX.

    Reads IN_CSV, runs ``annotate_text`` on each value of the "sentence"
    column (with a tqdm progress bar), appends the resulting UD_COLS
    columns to the original columns and writes the result to OUT_XLSX
    without the index.

    Raises:
        ValueError: if the CSV has no "sentence" column.
    """
    df = pd.read_csv(IN_CSV)

    if "sentence" not in df.columns:
        raise ValueError(
            f"CSV must contain a 'sentence' column. Found: {list(df.columns)}"
        )

    tqdm.pandas()  # registers Series.progress_apply
    ud_series = df["sentence"].progress_apply(annotate_text)

    # Build the UD frame in one pass from the list of dicts. The previous
    # `ud_series.apply(pd.Series)` created one Series per row, which is very
    # slow on tens of thousands of sentences. Passing `columns=UD_COLS` also
    # guarantees the UD columns exist when the input has no rows.
    ud_df = pd.DataFrame(ud_series.tolist(), index=df.index, columns=UD_COLS)

    df_out = pd.concat([df, ud_df], axis=1)
    df_out.to_excel(OUT_XLSX, index=False)
    print("Saved:", OUT_XLSX)


# Run the annotation when the cell (or script) is executed directly.
if __name__ == "__main__":
    main()




Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: fr (French) ...
INFO:stanza:File exists: /root/stanza_resources/fr/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: fr (French):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


  0%|          | 0/49277 [00:00<?, ?it/s]

Saved: corpus_sent_level_with_ud_stanza.xlsx
