In [7]:
import random
import json
import csv

# FONCTIONS DE BRUIT

def bruit_ocr(text):
    """Simulate OCR misreadings by swapping look-alike characters.

    Lowercase ``l`` and uppercase ``I`` become ``1``, both ``O`` and ``o``
    become ``0``, and the ligature ``œ`` is expanded to ``oe``.

    Args:
        text: The string to corrupt.

    Returns:
        A new string with the substitutions applied.
    """
    # A single translation pass: the "o" produced by expanding the ligature
    # is not turned into "0" afterwards.
    confusions = str.maketrans({
        "l": "1",
        "O": "0",
        "o": "0",
        "I": "1",
        "œ": "oe",
    })
    return text.translate(confusions)

def bruit_mots_colles(text):
    """Glue one randomly chosen word to the word that follows it.

    The text is split on whitespace and re-joined with single spaces, so
    runs of whitespace are normalised even when no merge takes place.
    Texts of two words or fewer are never merged.

    Args:
        text: The string to corrupt.

    Returns:
        The whitespace-normalised text, with two adjacent words fused when
        the text contains more than two words.
    """
    morceaux = text.split()
    if len(morceaux) <= 2:
        return " ".join(morceaux)

    # Pick the left-hand word of the pair; the last word has no successor.
    position = random.randint(0, len(morceaux) - 2)
    morceaux[position:position + 2] = [morceaux[position] + morceaux[position + 1]]
    return " ".join(morceaux)

def bruit_fraction(text):
    """Split one randomly chosen word in two by inserting a space.

    The text is split on whitespace and re-joined with single spaces. One
    word is drawn at random; if it is longer than four characters, a space
    is inserted at its midpoint (integer division), otherwise the text is
    returned unchanged apart from whitespace normalisation.

    Args:
        text: The string to corrupt.

    Returns:
        The whitespace-normalised text, possibly with one word broken in
        two. An empty or whitespace-only input yields an empty string.
    """
    mots = text.split()
    # Without any word there is nothing to draw from: random.randint(0, -1)
    # would raise ValueError.
    if not mots:
        return ""

    i = random.randint(0, len(mots) - 1)
    mot = mots[i]
    if len(mot) > 4:
        cut = len(mot) // 2
        mots[i] = mot[:cut] + " " + mot[cut:]
    return " ".join(mots)

def bruit_apostrophe(text):
    """Replace every apostrophe with the modifier-letter apostrophe.

    Both the ASCII apostrophe (') and the typographic apostrophe (’) end up
    as the same replacement character (ʼ).

    Args:
        text: The string to corrupt.

    Returns:
        A new string in which both apostrophe variants are replaced.
    """
    remplacement = "ʼ"
    table = str.maketrans({"'": remplacement, "’": remplacement})
    return text.translate(table)

def bruit_tiret(text):
    """Replace every ASCII hyphen with an em dash.

    Args:
        text: The string to corrupt.

    Returns:
        A new string in which each "-" has become "—".
    """
    table = {ord("-"): "—"}
    return text.translate(table)

def bruit_points(text):
    """Collapse each run of three ASCII dots into a single ellipsis character.

    Occurrences are matched left to right without overlapping, so four dots
    become an ellipsis followed by one dot.

    Args:
        text: The string to corrupt.

    Returns:
        A new string in which each "..." has become "…".
    """
    segments = text.split("...")
    return "…".join(segments)

def bruit_guillemets(text):
    """Swap ASCII quote characters for French guillemets.

    Every double quote becomes an opening guillemet and every ASCII
    apostrophe becomes a closing guillemet, regardless of position.

    Args:
        text: The string to corrupt.

    Returns:
        A new string with both substitutions applied.
    """
    table = str.maketrans({'"': '«', "'": "»"})
    return text.translate(table)

def bruit_note(text):
    """Append a random digit (1-9) to one randomly chosen word.

    The result looks like a footnote marker stuck to a word. The text is
    split on whitespace and re-joined with single spaces.

    Args:
        text: The string to corrupt.

    Returns:
        The whitespace-normalised text with one word suffixed by a digit.
        An empty or whitespace-only input yields an empty string.
    """
    mots = text.split()
    # Without any word there is nothing to mark: random.randint(0, -1)
    # would raise ValueError.
    if not mots:
        return ""

    i = random.randint(0, len(mots) - 1)
    mots[i] = mots[i] + str(random.randint(1, 9))
    return " ".join(mots)

# Registry of every available noise function; appliquer_bruit draws from it.
BRUITS = [
    bruit_ocr,
    bruit_mots_colles,
    bruit_fraction,
    bruit_apostrophe,
    bruit_tiret,
    bruit_points,
    bruit_guillemets,
    bruit_note,
]

def appliquer_bruit(text, niveau="faible"):
    """Corrupt ``text`` with randomly chosen noise functions from ``BRUITS``.

    Args:
        text: The string to corrupt.
        niveau: Noise intensity. "faible" applies one function, "moyen"
            applies two distinct functions in sequence and "fort" applies
            four distinct functions in sequence. Any other value leaves
            the text untouched.

    Returns:
        The corrupted text, or ``text`` itself for an unrecognised level.
    """
    if niveau == "faible":
        return random.choice(BRUITS)(text)

    if niveau == "moyen":
        combien = 2
    elif niveau == "fort":
        combien = 4
    else:
        return text

    # The sampled functions are distinct and chained in the sampled order.
    resultat = text
    for transformation in random.sample(BRUITS, combien):
        resultat = transformation(resultat)
    return resultat

# EXTRACTION DES PHRASES

def extraire_phrases_conllu(fichier, n=1000):
    """Read a CoNLL-U file and pick up to ``n`` sentences from it.

    A sentence starts at a ``# text = ...`` comment line. Its tokens are
    the second tab-separated column of every following line whose first
    column is a plain integer id. Sentences with an empty text or without
    any token are discarded.

    When the file holds more than ``n`` sentences, three non-overlapping
    slices are taken from the start, the middle and the end of the file,
    in file order. The first two have ``n // 3`` sentences each and the
    last one takes the remainder, so exactly ``n`` distinct sentences are
    returned. When the file holds ``n`` sentences or fewer, all of them are
    returned once.

    Args:
        fichier: Path of the CoNLL-U file, read as UTF-8.
        n: Maximum number of sentences to return.

    Returns:
        A list of ``(text, tokens)`` tuples, where ``tokens`` is a list of
        word-form strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    phrases = []
    current_tokens = []
    current_text = None

    with open(fichier, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line.startswith("# text ="):
                # A new sentence begins: store the one collected so far.
                if current_text and current_tokens:
                    phrases.append((current_text, current_tokens))
                current_text = line.split("=", 1)[1].strip()
                current_tokens = []
            elif line and not line.startswith("#"):
                parts = line.split("\t")
                # isdigit() already rejects ids such as "3-4" (multiword
                # range) and "5.1" (empty node), keeping plain word lines.
                if len(parts) > 1 and parts[0].isdigit():
                    current_tokens.append(parts[1])

    # The last sentence has no following "# text =" line to flush it.
    if current_text and current_tokens:
        phrases.append((current_text, current_tokens))

    if n <= 0:
        return []

    total = len(phrases)
    if total <= n:
        # Not enough material to sample from: slicing would only produce
        # overlapping ranges and therefore duplicated sentences.
        return phrases

    taille_debut = taille_milieu = n // 3
    taille_fin = n - 2 * (n // 3)  # remainder goes to the tail
    # Start the middle slice at the file midpoint, but clamp it so that it
    # never overlaps the head or the tail slice.
    depart_milieu = min(
        max(total // 2, taille_debut),
        total - taille_fin - taille_milieu,
    )
    return (
        phrases[:taille_debut]
        + phrases[depart_milieu:depart_milieu + taille_milieu]
        + phrases[total - taille_fin:]
    )

# dataset complet

def generer_datasets(fichier_conllu, sortie_json="dataset_bruite.json", sortie_csv="dataset_bruite.csv", n=1000):
    """Build the noisy dataset and save it as both JSON and CSV.

    Sentences come from ``extraire_phrases_conllu``. Each one is stored
    with its original text, its reference tokens and three corrupted
    versions produced by ``appliquer_bruit`` at the "faible", "moyen" and
    "fort" levels. In the CSV file the reference tokens are joined with
    single spaces.

    Args:
        fichier_conllu: Path of the input CoNLL-U file.
        sortie_json: Path of the JSON file to write.
        sortie_csv: Path of the CSV file to write.
        n: Number of sentences requested from the extractor.

    Returns:
        None. Two files are written and two confirmation lines are printed.
    """
    niveaux = ("faible", "moyen", "fort")
    colonnes_bruit = [f"bruit_{niveau}" for niveau in niveaux]

    dataset = []
    for phrase, tokens in extraire_phrases_conllu(fichier_conllu, n=n):
        ligne = {"original": phrase, "reference_tokens": tokens}
        # Noise levels are generated in increasing order for each sentence.
        for niveau, colonne in zip(niveaux, colonnes_bruit):
            ligne[colonne] = appliquer_bruit(phrase, niveau)
        dataset.append(ligne)

    # JSON export
    with open(sortie_json, "w", encoding="utf-8") as flux_json:
        flux_json.write(json.dumps(dataset, ensure_ascii=False, indent=2))

    # CSV export
    with open(sortie_csv, "w", encoding="utf-8", newline="") as flux_csv:
        redacteur = csv.writer(flux_csv)
        redacteur.writerow(["original", "reference_tokens"] + colonnes_bruit)
        redacteur.writerows(
            [ligne["original"], " ".join(ligne["reference_tokens"])]
            + [ligne[colonne] for colonne in colonnes_bruit]
            for ligne in dataset
        )

    print(f"✅ JSON généré : {sortie_json}")
    print(f"✅ CSV généré : {sortie_csv} avec {len(dataset)} phrases.")

# LAUNCH
# Runs the whole pipeline on the CoNLL-U file named below (a relative path,
# so it is resolved against the current working directory), requesting
# n=1000 sentences and writing to the default JSON and CSV output paths.

generer_datasets("fr_gsd-ud-train.conllu", n=1000)


✅ JSON généré : dataset_bruite.json
✅ CSV généré : dataset_bruite.csv avec 1000 phrases.
