In [None]:
pip install transformers




In [None]:
pip install spacy



In [None]:
pip install stanza



Ce script découpe les phrases du corpus en tokens à l'aide de trois tokeniseurs. Le premier, spaCy, sépare les phrases en mots en s'appuyant sur des règles propres à la langue. Le deuxième, Stanza, fait la même chose, mais avec un modèle entraîné spécialement pour le français. Le troisième, BERT, est un peu différent : il découpe d'abord le texte en sous-mots (WordPiece) puis, dans notre cas, les morceaux préfixés par « ## » sont recollés afin de retrouver des mots entiers. Les résultats sont ensuite enregistrés dans deux fichiers (un en JSON et un en CSV) pour pouvoir être réutilisés plus tard lors de l'évaluation.

In [None]:
import json
import os
import sys
from typing import List, Dict
import pandas as pd

# ====== Dépendances NLP ======
# spacy + modèle fr
import spacy
from spacy.util import get_package_path
from spacy.cli import download as spacy_download

# stanza
import stanza

# transformers (BERT WordPiece)
from transformers import BertTokenizer


# ==============================
# 0) Chargement des modèles
# ==============================
def load_spacy_fr():
    """Load the small French spaCy pipeline, downloading it on first use.

    ``spacy.load`` is tried first. If it raises ``OSError``, a warning is
    printed, the model is fetched with ``spacy.cli.download`` and the load is
    attempted a second time; a failure of that second attempt propagates.

    Returns:
        The object returned by ``spacy.load("fr_core_news_sm")``.
    """
    model_name = "fr_core_news_sm"
    try:
        pipeline = spacy.load(model_name)
    except OSError:
        # The model package is not available yet: install it, then retry once.
        print("⚠️ Modèle SpaCy 'fr_core_news_sm' introuvable. Téléchargement...")
        spacy_download(model_name)
        pipeline = spacy.load(model_name)
    return pipeline

def load_stanza_fr():
    """Build the French Stanza tokenisation pipeline, downloading resources if needed.

    The pipeline is requested with the ``tokenize`` processor only, on CPU, and
    with ``tokenize_no_ssplit=True``. If construction raises for any reason, a
    warning is printed, ``stanza.download("fr")`` is run and construction is
    attempted once more; a failure of that second attempt propagates.

    Returns:
        The ``stanza.Pipeline`` instance.
    """
    # Same settings for the first attempt and for the retry after download.
    pipeline_options = {
        "processors": "tokenize",
        "use_gpu": False,
        "tokenize_no_ssplit": True,
    }
    try:
        return stanza.Pipeline("fr", **pipeline_options)
    except Exception:
        print("⚠️ Ressources Stanza FR introuvables. Téléchargement...")
        stanza.download("fr")
        return stanza.Pipeline("fr", **pipeline_options)

def load_bert_tokenizer():
    """Return the tokenizer of the ``bert-base-multilingual-cased`` checkpoint.

    Multilingual BERT is used because its tokenizer is WordPiece-based, which
    is the sub-word scheme wanted here.
    """
    checkpoint = "bert-base-multilingual-cased"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    return tokenizer


# Instantiate each tokenizer once, when this cell runs. The tokenize_* functions
# defined further down read these module-level objects on every call.
nlp_spacy = load_spacy_fr()
nlp_stanza = load_stanza_fr()
bert_wp = load_bert_tokenizer()


# ==============================
# 1) Fonctions de tokenisation
# ==============================
def tokenize_spacy(text: str) -> List[str]:
    """Tokenise ``text`` with the module-level spaCy pipeline.

    Args:
        text: Raw text to tokenise.

    Returns:
        The ``.text`` of every token produced by ``nlp_spacy``, in order.
    """
    surface_forms = []
    for token in nlp_spacy(text):
        surface_forms.append(token.text)
    return surface_forms

def tokenize_stanza(text: str) -> List[str]:
    """Tokenise ``text`` with the module-level Stanza pipeline.

    Args:
        text: Raw text to tokenise.

    Returns:
        The ``.text`` of every word of every sentence of the Stanza document,
        flattened into one list in document order.

    NOTE(review): this reads ``sentence.words`` rather than ``sentence.tokens``.
    If the pipeline applies multi-word-token expansion, the two can differ
    (e.g. French contractions) — confirm which one the evaluation expects.
    """
    document = nlp_stanza(text)
    flattened: List[str] = []
    for sentence in document.sentences:
        flattened.extend(word.text for word in sentence.words)
    return flattened

def tokenize_bert_wordpiece_as_words(text: str) -> List[str]:
    """Tokenise with WordPiece, then glue ``##`` continuation pieces back together.

    Every piece that starts with ``##`` is appended (without the prefix) to the
    word currently being built; any other piece closes that word and opens a
    new one. Empty words are never emitted.

    Example: ``['cin', '##éma'] -> ['cinéma']``

    Args:
        text: Raw text to tokenise.

    Returns:
        The list of re-assembled words, in order.
    """
    words = []
    fragments = []  # pieces of the word currently being assembled
    for piece in bert_wp.tokenize(text):
        if piece.startswith("##"):
            fragments.append(piece[2:])
            continue
        # A piece without the prefix starts a new word: emit the previous one.
        finished = "".join(fragments)
        if finished:
            words.append(finished)
        fragments = [piece]

    # Emit whatever is still pending after the last piece.
    trailing = "".join(fragments)
    if trailing:
        words.append(trailing)
    return words


# ==============================
# 2) I/O helpers
# ==============================
# Text variants tokenised for every entry: the original sentence followed by its
# three noisy versions. The order also fixes the per-tokenizer CSV column order.
LEVELS = ["original", "bruit_faible", "bruit_moyen", "bruit_fort"]

def safe_get(entry: Dict, key: str) -> str:
    """Return the text stored under ``key`` in ``entry``, or ``""`` if there is none.

    The stored value is returned exactly as provided (apostrophes, quotes and
    whitespace are left untouched). A missing key and an explicit ``None``
    (JSON ``null``) are both mapped to the empty string, so the result is never
    ``None`` and can be handed straight to the tokenizers.

    Args:
        entry: One record of the input dataset.
        key: Name of the field to read.

    Returns:
        The stored value, or ``""`` when the key is absent or its value is ``None``.
    """
    value = entry.get(key)
    # `entry.get(key, "")` would still return None for a key present with null.
    return "" if value is None else value

def list_to_space_string(tokens: List[str]) -> str:
    """Render a token list as one string, with a single space between tokens."""
    separator = " "
    return separator.join(tokens)


# ==============================
# 3) Traitement principal
# ==============================
def apply_tokenizers_on_json(input_json: str,
                             output_json: str = "dataset_bruit_tokenized.json",
                             output_csv: str = "dataset_bruit_tokenized.csv") -> None:
    """Run the three tokenizers on every entry of a JSON dataset and save the results.

    Each entry of the input file is read for the fields ``original``,
    ``reference_tokens``, ``bruit_faible``, ``bruit_moyen`` and ``bruit_fort``.
    Every text listed in ``LEVELS`` is tokenised with spaCy, Stanza and the
    re-assembled WordPiece tokenizer.

    Two files are written:

    * ``output_json``: one record per entry holding the input fields plus the
      keys ``spacy``, ``stanza`` and ``bert``, each mapping a level name to its
      token list.
    * ``output_csv``: one row per entry with the input texts, the reference
      tokens joined by spaces, and one ``<tokenizer>_<level>`` column per
      combination holding the space-joined tokens.

    Args:
        input_json: Path of the UTF-8 JSON file to read.
        output_json: Path of the JSON file to write.
        output_csv: Path of the CSV file to write.
    """
    with open(input_json, "r", encoding="utf-8") as source:
        entries = json.load(source)

    # Declaration order drives both the JSON key order and the CSV column order.
    tokenizers = (
        ("spacy", tokenize_spacy),
        ("stanza", tokenize_stanza),
        ("bert", tokenize_bert_wordpiece_as_words),
    )
    noisy_fields = ("bruit_faible", "bruit_moyen", "bruit_fort")

    records = []
    flat_rows = []

    for entry in entries:
        # Input fields are carried over unchanged.
        record = {
            "original": safe_get(entry, "original"),
            "reference_tokens": entry.get("reference_tokens", []),
        }
        for field in noisy_fields:
            record[field] = safe_get(entry, field)

        # tokens_by_tool[tool][level] -> token list for that text variant.
        tokens_by_tool = {tool: {} for tool, _ in tokenizers}
        for level in LEVELS:
            text = record[level]
            for tool, tokenize in tokenizers:
                tokens_by_tool[tool][level] = tokenize(text)

        reference = record["reference_tokens"]
        record.update(tokens_by_tool)
        records.append(record)

        # Flat CSV view: texts first, then the reference, then every tokenizer/level pair.
        row = {field: record[field] for field in ("original",) + noisy_fields}
        row["reference_tokens"] = list_to_space_string(reference) if reference else ""
        for tool, _ in tokenizers:
            per_level = tokens_by_tool[tool]
            for level in LEVELS:
                row[f"{tool}_{level}"] = list_to_space_string(per_level[level])
        flat_rows.append(row)

    with open(output_json, "w", encoding="utf-8") as target:
        json.dump(records, target, ensure_ascii=False, indent=2)
    print(f"✅ JSON écrit → {output_json}")

    pd.DataFrame(flat_rows).to_csv(output_csv, index=False, encoding="utf-8")
    print(f"✅ CSV écrit → {output_csv}")


# ==============================
# 4) Lancement
# ==============================
if __name__ == "__main__":
    # Input corpus; change the path if the file lives elsewhere.
    INPUT = "dataset_bruits.json"
    apply_tokenizers_on_json(
        input_json=INPUT,
        output_json="dataset_bruit_tokenized.json",
        output_csv="dataset_bruit_tokenized.csv",
    )


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: fr (French):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Done loading processors!


✅ JSON écrit → dataset_bruit_tokenized.json
✅ CSV écrit → dataset_bruit_tokenized.csv


In [None]:
!pip install spacy
!python -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.10.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.1 stanza-1.10.1
