### Étape 1 : Exploration, analyse et anonymisation des données


In [None]:
import pandas as pd
import random
import string

#  Load the dataset
df = pd.read_csv("/content/test.csv")  # adjust the path if needed

#  Build a fictitious e-mail address
def generate_email(name=None):
    """Return a synthetic address shaped like ``<local><number>@<domain>``.

    The local part is derived from ``name`` when it is a non-empty string
    (lower-cased, spaces and underscores replaced by dots); otherwise it is
    seven random lowercase ASCII letters. A random integer between 1 and 999
    is appended, and the domain is picked at random from a fixed list.
    """
    if not (name and isinstance(name, str)):
        # No usable name: fall back to a random 7-letter local part.
        local_part = "".join(random.choices(string.ascii_lowercase, k=7))
    else:
        local_part = name.lower().translate(str.maketrans({" ": ".", "_": "."}))
    number = random.randint(1, 999)
    domain = random.choice(("gmail.com", "yahoo.com", "outlook.com", "example.com"))
    return f"{local_part}{number}@{domain}"

#  Add the "email" column
# If the dataset stores names under another header, add it to the tuple below.
random.seed(42)  # fixed seed so the synthetic addresses are the same on every run
name_column = next((col for col in ("Name", "Nom") if col in df.columns), None)
if name_column is not None:
    df["email"] = df[name_column].apply(generate_email)
else:
    # No name column: every row gets a fully random address.
    df["email"] = [generate_email() for _ in range(len(df))]

#  Save the new dataset
df.to_csv("/content/test_with_emails.csv", index=False)

#  Show the first rows only, rather than the whole frame
df.head()


Unnamed: 0,id,comment_text,email
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,briieud260@outlook.com
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,uayuagg135@example.com
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",dpykzqe815@yahoo.com
3,00017563c3f7919a,":If you have a look back at the source, the in...",mkhiajh964@gmail.com
4,00017695ad8997eb,I don't anonymously edit articles at all.,nesetsj771@gmail.com
...,...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu...",eulnhbc686@example.com
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...,hbshpig357@outlook.com
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ...",nlfbsyq552@example.com
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the...",htfhakh996@example.com


In [None]:
import pandas as pd
import hashlib


# 1. Load the CSV file written by the e-mail generation step
file_path = "/content/test_with_emails.csv"
df = pd.read_csv(file_path)
print(" Fichier chargé avec succès !")

# 2. Replace an e-mail address by its SHA-256 digest
def anonymize_email(email):
    """Return the hexadecimal SHA-256 digest of ``email`` encoded as UTF-8.

    Any value that is not a ``str`` (for example a missing value) yields
    ``None``. The digest is unsalted, so equal inputs always give equal
    outputs.
    """
    if not isinstance(email, str):
        return None
    digest = hashlib.sha256()
    digest.update(email.encode("utf-8"))
    return digest.hexdigest()

# 3. Anonymise the "email" column and 4. save the result
output_path = "/content/test_with_anonymized_emails.csv"
if "email" in df.columns:
    df["email_anonymized"] = df["email"].apply(anonymize_email)
    # Drop the clear-text column so that only the digest is persisted.
    df = df.drop(columns=["email"])
    df.to_csv(output_path, index=False)
    print(f" Fichier anonymisé sauvegardé ici : {output_path}")
else:
    # Nothing was anonymised, so no file is written and no success message
    # is printed: a file at output_path would wrongly look anonymised.
    print("⚠️ La colonne 'email' n’existe pas dans le dataset.")

# 5. Preview of the first rows
df.head(100)


 Fichier chargé avec succès !
 Fichier anonymisé sauvegardé ici : /content/test_with_anonymized_emails.csv


Unnamed: 0,id,comment_text,email_anonymized
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,4df541f726a3f3b464ccccd9853e4e790773bc7c1e4ae4...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,b564081be97895257b258ed5bed7856d1d83587e0caf95...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",6452e8cfcd73b981258858d09aa868199501beebe9957d...
3,00017563c3f7919a,":If you have a look back at the source, the in...",37ee5e260e4f53470315073587b29cf7d69f967e81ee3a...
4,00017695ad8997eb,I don't anonymously edit articles at all.,9807c0e7428c0404b6045bbc857ae5dbce4321dde8f0ce...
...,...,...,...
95,0023f3f84f353bce,""" \n\n == Main towns that are not so main == \...",a94971bab415d6289fb507f74d71329872fff1f02fb466...
96,002586bdf3280356,""" \n\n my comments follow, bluewillow991967 -...",ed5925e1a400133cffc5b614dfd21e0b4e31a46f6e0368...
97,0025a91b6955f1a5,""" \n\n == Halliday == \n\n Good to see another...",1b25bb4b879f2d707543ad02796afe1a70f5bc65ba4f50...
98,0025c49d87d9a18f,""" \n ::: That Stephen Barrett is not Board Cer...",6138a46d3f6d6b83d96ec6318fbb04db359e76f07281fa...


## Charger le dataset

### Subtask:
Charger les 100 premières lignes du fichier `train.csv` dans un DataFrame pandas.


In [None]:
# Only the first 100 rows are used: nrows stops the parser there instead of
# reading the whole file and discarding the rest.
df_train = pd.read_csv("/content/train.csv", nrows=100)
df_train.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Load the spaCy model into memory so that it can be used.
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
import pandas as pd
import spacy
import re

# Load the spaCy language model (English)
nlp = spacy.load("en_core_web_sm")

# Load the dataset. Only the first 100 rows are used: nrows stops the parser
# there instead of reading the whole file and discarding the rest.
df_train = pd.read_csv("/content/train.csv", nrows=100)

# Named-entity labels that must be masked
TARGET_ENTITY_TYPES = {"PERSON", "GPE", "LOC"}

# Regex detectors, applied in this order. Every group is non-capturing so that
# a match always yields the full matched text, never just a keyword.
_PATTERN_DETECTORS = (
    # E-mail addresses (the TLD class is letters only; "|" is not part of it).
    ("EMAIL", re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")),
    # @username handles. E-mails are already masked when this runs, so the
    # "@domain" part of an address is not reported as a username.
    ("USERNAME", re.compile(r"@\w+")),
    # Ages such as "25 years old", "25 yrs", "25 y/o" or "aged 30".
    ("AGE", re.compile(r"\b(?:aged\s\d{1,3}|\d{1,3}\s?(?:years? old|yrs|y/o))\b")),
    # Simple street addresses: a number, some words, then a street keyword.
    (
        "ADDRESS",
        re.compile(
            r"\b\d{1,4}\s[\w\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln)\b"
        ),
    ),
)


def anonymize_text(text, nlp_model=None):
    """Mask personal data in ``text`` and list what was found.

    Args:
        text: The string to anonymise.
        nlp_model: Optional callable that takes the text and returns an object
            whose ``ents`` items expose ``text`` and ``label_``. When omitted,
            the module-level ``nlp`` pipeline is used.

    Returns:
        A ``(masked_text, personal_data)`` tuple. ``masked_text`` is ``text``
        with every detected item replaced by a ``[LABEL]`` placeholder.
        ``personal_data`` is a ``", "``-joined list of ``"<match> (<LABEL>)"``
        entries: named entities first, then e-mails, usernames, ages and
        addresses.
    """
    model = nlp if nlp_model is None else nlp_model
    doc = model(text)

    # 1. Named entities of the targeted types (reported first).
    ner_hits = [ent for ent in doc.ents if ent.label_ in TARGET_ENTITY_TYPES]
    entities = [f"{ent.text} ({ent.label_})" for ent in ner_hits]

    # 2. Regex detectors run on the progressively masked text, before the
    #    entity substitution, so an entity found inside an e-mail or an
    #    address cannot break the pattern and leave part of it unmasked.
    masked_text = text
    for label, pattern in _PATTERN_DETECTORS:
        entities.extend(
            f"{match.group(0)} ({label})" for match in pattern.finditer(masked_text)
        )
        masked_text = pattern.sub(f"[{label}]", masked_text)

    # 3. Mask every standalone mention of each entity. The look-arounds keep
    #    a short entity (e.g. "Al") from clobbering longer words ("Alice").
    #    Longest first, so "New York" is handled before "York".
    for ent in sorted(ner_hits, key=lambda hit: len(hit.text), reverse=True):
        mention = re.compile(rf"(?<!\w){re.escape(ent.text)}(?!\w)")
        masked_text = mention.sub(f"[{ent.label_}]", masked_text)

    return masked_text, ", ".join(entities)

# Run the anonymiser on every comment and store both results as new columns
df_train[["masked_comment_text", "personal_data"]] = pd.DataFrame(
    [anonymize_text(str(comment)) for comment in df_train["comment_text"]],
    index=df_train.index,
)

# Display the loaded rows
display(df_train.head(100))


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,masked_comment_text,personal_data
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,Explanation\nWhy the edits made under my usern...,New York Dolls (GPE)
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,D'aww! He matches this background colour I'm s...,
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"Hey man, I'm really not trying to edit war. It...",
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"""\nMore\nI can't make any real suggestions on ...",
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"You, sir, are my hero. Any chance you remember...",
...,...,...,...,...,...,...,...,...,...,...
95,003b9f448ee4a29d,"""\n\nThanks. I can see that violating clearly ...",0,0,0,0,0,0,"""\n\nThanks. I can see that violating clearly ...","James Petras' (PERSON), Carol Moore (PERSON)"
96,003bd094feef5263,"""\nHi\nThanks for our kind words. See you arou...",0,0,0,0,0,0,"""\nHi\nThanks for our kind words. See you arou...",
97,003caacc6ce6c9e9,Collusion in poker \n\nThis is regarded as mos...,0,0,0,0,0,0,Collusion in poker \n\nThis is regarded as mos...,
98,003d77a20601cec1,"Thanks much - however, if it's been resolved, ...",0,0,0,0,0,0,"Thanks much - however, if it's been resolved, ...",


## Étape 2 : Préparation et entraînement d’un modèle IA


Examinons les entités extraites pour identifier celles qui sont des informations personnelles et présentons les résultats.

# Nettoyage du texte

In [None]:
import re
import pandas as pd
import spacy

# The spaCy pipeline `nlp` loaded earlier in the notebook is reused here.
# To try the medium English model instead, download and load it first:
# !python -m spacy download en_core_web_md
# nlp = spacy.load("en_core_web_md")

# --- Text cleaning function ---
# The patterns are compiled once here instead of on every call.
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "]+",
    flags=re.UNICODE,
)
_PUNCTUATION_RE = re.compile(r'[\\\/\-\,"\(\)\[\]\{\}\.\'!?:;_]')
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text_en(text, nlp_model=None):
    """Normalise an English text and drop its stop words.

    Steps: lower-case, remove emojis in the listed ranges, replace the listed
    punctuation characters by spaces, collapse whitespace, tokenise, drop
    tokens flagged ``is_stop`` and join the rest with single spaces.

    Args:
        text: The string to clean.
        nlp_model: Optional object exposing ``make_doc(text)`` that returns an
            iterable of tokens with ``text`` and ``is_stop``. When omitted,
            the module-level ``nlp`` pipeline is used.

    Returns:
        The cleaned text as a single string.

    NOTE(review): brackets are stripped and the text is lower-cased, so a
    placeholder such as "[PERSON]" comes out as the plain word "person".
    Apostrophes become spaces, so "don't" is tokenised as "don" and "t".
    """
    model = nlp if nlp_model is None else nlp_model

    text = text.lower()
    text = _EMOJI_RE.sub("", text)
    text = _PUNCTUATION_RE.sub(" ", text)
    text = _WHITESPACE_RE.sub(" ", text).strip()

    # Only tokens and their stop-word flag are needed, so run the tokenizer
    # alone instead of the full pipeline.
    doc = model.make_doc(text)
    return " ".join(token.text for token in doc if not token.is_stop)

# --- Clean every masked comment (the column is overwritten) ---
df_train["masked_comment_text"] = (
    df_train["masked_comment_text"].astype(str).map(clean_text_en)
)

# --- Keep only the row index and the cleaned text ---
df_cleaned = df_train[["masked_comment_text"]].reset_index()

# --- Show the result ---
display(df_cleaned.head(10))

Unnamed: 0,index,masked_comment_text
0,0,explanation edits username work art reverted w...
1,1,d aww matches background colour m seemingly st...
2,2,hey man m trying edit war s guy constantly rem...
3,3,t real suggestions improvement wondered sectio...
4,4,sir hero chance remember page s
5,5,congratulations use tools · talk
6,6,cocksucker piss work
7,7,vandalism person article reverted don t banned
8,8,sorry word nonsense offensive m intending writ...
9,9,alignment subject contrary person


# Entraînement du modèle