## **Etape 1**

In [2]:
# === Bloc 0 — Config & lecture unique CSV ===
# Chemin et découpe globale (tu peux changer N_ROWS ici, pour toutes les étapes)
CSV_PATH = "/content/train.csv"
N_ROWS   = 3000  # <- règle ici le nombre de lignes utilisées dans tout le notebook

# Install (Colab) — versions raisonnables et compatibles
# !pip -q install -U spacy tqdm scikit-learn tensorflow==2.16.1

import pandas as pd
import re
import spacy
from tqdm import tqdm

# Modèle spaCy EN (souvent adapté aux datasets toxiques Kaggle)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    !python -m spacy download en_core_web_sm
    nlp = spacy.load("en_core_web_sm")

# Lecture unique du CSV
df_all = pd.read_csv(CSV_PATH, dtype=str, keep_default_na=False)
dfN = df_all.head(N_ROWS).copy()

print(f"Fichier chargé: {CSV_PATH}")
print(f"Lignes utilisées pour tout le notebook: {len(dfN)} (N_ROWS={N_ROWS})")
print("\nAperçu (id, comment_text) — 5 premières lignes :")
print(dfN.loc[:, ["id","comment_text"]].head(5).to_string(index=False))


Fichier chargé: /content/train.csv
Lignes utilisées pour tout le notebook: 3000 (N_ROWS=3000)

Aperçu (id, comment_text) — 5 premières lignes :
              id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       comment_text
0000997932d777bf                                                                                                                                                                                                    

In [3]:
# === Bloc 1 — Params & Regex (DCP) ===
USE_LABEL_TOKENS = True  # True => PERSON/EMAIL/... ; False => ****

REGEX_PATTERNS = {
    "EMAIL": re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"),
    "IP":    re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
    "URL":   re.compile(r"https?://\S+|www\.\S+"),
    "PHONE": re.compile(r"\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}\b"),

}


In [4]:
# === Bloc 2 — Fonction d’anonymisation (DCP uniquement) ===
# Ne masque via spaCy que PERSON (pas ORG/GPE...)

# Compléments regex utiles (adresse postale, CB, @handle)
ADDRESS_RE = re.compile(
    r"\b\d{1,5}\s+(?:rue|avenue|av\.?|bd|boulevard|impasse|allée|route|chemin|che\.?|place|pl\.?|quai|square|sq\.?|"
    r"street|st\.?|ave\.?|road|rd\.?|blvd\.?|lane|ln\.?|drive|dr\.?|court|ct\.?)\s+[A-Za-zÀ-ÖØ-öø-ÿ'’\-\. ]+\b",
    re.IGNORECASE
)
CREDIT_CARD_RE = re.compile(r"\b(?:\d[ -]*?){13,19}\b")
USERNAME_RE    = re.compile(r"(?<!\w)@[\w._\-]{2,32}")

PII_LABELS = ["EMAIL","IP","PHONE","ADDRESS","CREDIT_CARD","USERNAME","PERSON"]

def _mask(label: str) -> str:
    return label if USE_LABEL_TOKENS else "****"

def anonymize_text(text: str, counts: dict) -> str:
    if not text:
        return text
    tmp = text

    # 1) Regex DCP
    for label in ["EMAIL","IP","PHONE"]:
        rx = REGEX_PATTERNS[label]
        def repl(m, L=label):
            counts[L] = counts.get(L, 0) + 1
            return _mask(L)
        tmp = rx.sub(repl, tmp)

    # Adresse
    def addr_repl(m):
        counts["ADDRESS"] = counts.get("ADDRESS", 0) + 1
        return _mask("ADDRESS")
    tmp = ADDRESS_RE.sub(addr_repl, tmp)

    # CB avec filtre minimal
    def cc_repl(m):
        digits = re.sub(r"\D", "", m.group(0))
        if 13 <= len(digits) <= 19:
            counts["CREDIT_CARD"] = counts.get("CREDIT_CARD", 0) + 1
            return _mask("CREDIT_CARD")
        return m.group(0)
    tmp = CREDIT_CARD_RE.sub(cc_repl, tmp)

    # @username
    def user_repl(m):
        counts["USERNAME"] = counts.get("USERNAME", 0) + 1
        return _mask("USERNAME")
    tmp = USERNAME_RE.sub(user_repl, tmp)

    # 2) spaCy NER — PERSON uniquement
    doc = nlp(tmp)
    out, last = [], 0
    for ent in doc.ents:
        if ent.start_char > last:
            out.append(tmp[last:ent.start_char])
        if ent.label_ == "PERSON":
            counts["PERSON"] = counts.get("PERSON", 0) + 1
            out.append(_mask("PERSON"))
            last = ent.end_char
    out.append(tmp[last:])
    anonymized = "".join(out)

    # Nettoyage: éviter PERSON PERSON consécutifs
    anonymized = re.sub(r"(PERSON)(?:\s*,?\s*PERSON)+", "PERSON", anonymized)
    return anonymized


In [5]:
# === Bloc 3 — Application & stats DCP ===
assert "dfN" in globals(), "dfN absent. Exécute d'abord le Bloc 0 (lecture CSV)."

counts_global = {}
anon_list = []

for _, row in tqdm(dfN.iterrows(), total=len(dfN)):
    local = {}
    anon_list.append(anonymize_text(row.get("comment_text",""), local))
    for k, v in local.items():
        counts_global[k] = counts_global.get(k, 0) + v

dfN["comment_text_anonymized"] = anon_list

print("\nAperçu anonymisé (5 premières lignes) :")
print(dfN.loc[:, ["id","comment_text_anonymized"]].head(5).to_string(index=False))

print("\nComptage global DCP :")
for k in sorted([k for k in counts_global if k in PII_LABELS]):
    print(f"  - {k:>11}: {counts_global[k]}")


100%|██████████| 3000/3000 [01:05<00:00, 45.50it/s]


Aperçu anonymisé (5 premières lignes) :
              id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               




In [6]:
# === Bloc 4 — Comparaison (tronquée pour lisibilité) ===
def short(s, n=140):
    s = str(s).replace("\n", " ")
    return (s[:n] + "…") if len(s) > n else s

preview_cols = ["id", "comment_text", "comment_text_anonymized"]
preview = dfN.loc[:, preview_cols].copy()
preview["comment_text"] = preview["comment_text"].apply(lambda x: short(x, 120))
preview["comment_text_anonymized"] = preview["comment_text_anonymized"].apply(lambda x: short(x, 120))

print("Original ↔ Anonymisé (10 premières lignes) :")
print(preview.head(10).to_string(index=False))


Original ↔ Anonymisé (10 premières lignes) :
              id                                                                                                              comment_text                                                                                                   comment_text_anonymized
0000997932d777bf Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just clo… Explanation Why the edits made under my username Explanation Why the edits made under my username Hardcore Metallica Fan…
000103f0d9cfb60f          D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC) D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) D'aww! He matches this background col…
000113f07ec002fd Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talk… Hey man, I'm really not trying to

In [7]:
# === Bloc 5 — Registre (imprimé) ===
from datetime import datetime

print("\n=== REGISTRE (imprimé) — Anonymisation DCP uniquement ===")
print(f"Date UTC         : {datetime.utcnow().isoformat()}Z")
print(f"Fichier traité   : {CSV_PATH} (N_ROWS={len(dfN)})")
print("Modèle NER       : spaCy en_core_web_sm (PERSON uniquement)")
print("Catégories DCP   : PERSON, EMAIL, PHONE, ADDRESS, IP, USERNAME, CREDIT_CARD")
print(f"Sortie/format    : {'Labels (PERSON/EMAIL/...)' if USE_LABEL_TOKENS else '**** (masquage intégral)'}")

print("\nComptage DCP :")
for k in ["PERSON","EMAIL","PHONE","ADDRESS","IP","USERNAME","CREDIT_CARD"]:
    if k in counts_global:
        print(f"  - {k:>11}: {counts_global[k]}")
print("\nNote : vérifier manuellement un échantillon — les adresses postales sont difficiles à capter parfaitement.")



=== REGISTRE (imprimé) — Anonymisation DCP uniquement ===
Date UTC         : 2025-11-06T13:35:58.016468Z
Fichier traité   : /content/train.csv (N_ROWS=3000)
Modèle NER       : spaCy en_core_web_sm (PERSON uniquement)
Catégories DCP   : PERSON, EMAIL, PHONE, ADDRESS, IP, USERNAME, CREDIT_CARD
Sortie/format    : Labels (PERSON/EMAIL/...)

Comptage DCP :
  -      PERSON: 2254
  -       EMAIL: 9
  -       PHONE: 55
  -          IP: 206
  -    USERNAME: 3
  - CREDIT_CARD: 1

Note : vérifier manuellement un échantillon — les adresses postales sont difficiles à capter parfaitement.


  print(f"Date UTC         : {datetime.utcnow().isoformat()}Z")


## **Etape 2**

In [8]:
# === Bloc 1 — Nettoyage + Y multilabel ===
import numpy as np
from sklearn.model_selection import train_test_split

# Colonne texte : si anonymisé existe, on l'utilise
text_col = "comment_text_anonymized" if "comment_text_anonymized" in dfN.columns else "comment_text"

label_cols = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
for c in label_cols:
    dfN[c] = pd.to_numeric(dfN[c], errors="coerce").fillna(0).astype(int)

EMOJI_RE = re.compile(r"[\U00010000-\U0010ffff]", flags=re.UNICODE)
URL_RE   = re.compile(r"https?://\S+|www\.\S+")
def clean_text(s: str) -> str:
    s = str(s).lower()
    s = URL_RE.sub(" ", s)
    s = EMOJI_RE.sub(" ", s)
    s = re.sub(r"[^a-z0-9\s']", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

dfN["text_clean"] = dfN[text_col].apply(clean_text)
X = dfN["text_clean"].astype(str).tolist()
Y = dfN[label_cols].values

print("Aperçu (texte nettoyé) — 5 premières lignes :")
print(dfN.loc[:, ["id", text_col, "text_clean"]].head(5).to_string(index=False))


Aperçu (texte nettoyé) — 5 premières lignes :
              id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [9]:
# === Bloc 2 — Split train/val ===
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

print("Taille train/val :", len(X_train), "/", len(X_val))
print("\nDistribution des labels (somme 1/0) — TRAIN :")
print(pd.Series(Y_train.sum(axis=0), index=label_cols).to_string())
print("\nDistribution des labels (somme 1/0) — VAL :")
print(pd.Series(Y_val.sum(axis=0), index=label_cols).to_string())


Taille train/val : 2100 / 900

Distribution des labels (somme 1/0) — TRAIN :
toxic            198
severe_toxic      25
obscene          104
threat            11
insult           102
identity_hate     17

Distribution des labels (somme 1/0) — VAL :
toxic            109
severe_toxic      13
obscene           63
threat             3
insult            62
identity_hate     15


In [10]:
# === Bloc 3 — Modèle LSTM bi-directionnel (unique, pas de baseline) ===
import time
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
from sklearn.metrics import precision_score, recall_score, f1_score

MAX_VOCAB  = 8000
MAX_LEN    = 120
tokenizer  = Tokenizer(num_words=MAX_VOCAB, oov_token="<unk>")
tokenizer.fit_on_texts(X_train)

Xtr_seq = tokenizer.texts_to_sequences(X_train)
Xva_seq = tokenizer.texts_to_sequences(X_val)
Xtr_pad = pad_sequences(Xtr_seq, maxlen=MAX_LEN, padding="post", truncating="post")
Xva_pad = pad_sequences(Xva_seq, maxlen=MAX_LEN, padding="post", truncating="post")

model = models.Sequential([
    layers.Embedding(input_dim=MAX_VOCAB, output_dim=64),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dropout(0.2),
    layers.Dense(64, activation="relu"),
    layers.Dense(len(label_cols), activation="sigmoid")  # multilabel
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=[])

EPOCHS = 8
BATCH  = 8
t0 = time.perf_counter()
hist = model.fit(
    Xtr_pad, Y_train,
    validation_data=(Xva_pad, Y_val),
    epochs=EPOCHS, batch_size=BATCH, verbose=0
)
train_time_lstm = time.perf_counter() - t0

t1 = time.perf_counter()
Yp = model.predict(Xva_pad, verbose=0)
pred_time_lstm = time.perf_counter() - t1
Yhat = (Yp >= 0.5).astype(int)

prec_micro_l = precision_score(Y_val, Yhat, average="micro",  zero_division=0)
rec_micro_l  = recall_score(Y_val,  Yhat, average="micro",    zero_division=0)
f1_micro_l   = f1_score(Y_val,      Yhat, average="micro",    zero_division=0)
prec_macro_l = precision_score(Y_val, Yhat, average="macro",  zero_division=0)
rec_macro_l  = recall_score(Y_val,  Yhat, average="macro",    zero_division=0)
f1_macro_l   = f1_score(Y_val,      Yhat, average="macro",    zero_division=0)

print("=== Modèle (BiLSTM) ===")
print(f"Temps entraînement : {train_time_lstm:.3f} s | Temps prédiction : {pred_time_lstm:.3f} s")
print(f"Micro: P={prec_micro_l:.3f} R={rec_micro_l:.3f} F1={f1_micro_l:.3f}")
print(f"Macro: P={prec_macro_l:.3f} R={rec_macro_l:.3f} F1={f1_macro_l:.3f}")

print("\nHistorique entraînement (dernière epoch) :")
print({k: float(v[-1]) for k, v in hist.history.items()})


=== Modèle (BiLSTM) ===
Temps entraînement : 309.007 s | Temps prédiction : 1.489 s
Micro: P=0.671 R=0.392 F1=0.495
Macro: P=0.436 R=0.231 F1=0.290

Historique entraînement (dernière epoch) :
{'loss': 0.02974589169025421, 'val_loss': 0.2129724621772766}


## **Etape 3**

In [11]:
# === Bloc 1 — Export artefacts (.keras, tokenizer, labels, preprocess) ===
import os, json, pathlib

try:
    model, tokenizer, label_cols
except NameError:
    raise RuntimeError("Modèle/tokenizer/labels introuvables. Exécuter l'Étape 2 (Blocs 1–3).")

# Reprise du clean_text de l'étape 2 pour le module preprocess
EMOJI_RE = re.compile(r"[\U00010000-\U0010ffff]", flags=re.UNICODE)
URL_RE   = re.compile(r"https?://\S+|www\.\S+")
def clean_text(s: str) -> str:
    s = str(s).lower()
    s = URL_RE.sub(" ", s)
    s = EMOJI_RE.sub(" ", s)
    s = re.sub(r"[^a-z0-9\s']", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

EXPORT_DIR = "/content/lstm_service"
os.makedirs(EXPORT_DIR, exist_ok=True)

# 1) Modèle
model_path = os.path.join(EXPORT_DIR, "model.keras")
model.save(model_path)

# 2) Tokenizer
with open(os.path.join(EXPORT_DIR, "tokenizer.json"), "w", encoding="utf-8") as f:
    f.write(tokenizer.to_json())

# 3) Labels
with open(os.path.join(EXPORT_DIR, "labels.txt"), "w", encoding="utf-8") as f:
    for lab in label_cols:
        f.write(lab + "\n")

# 4) Module preprocess
with open(os.path.join(EXPORT_DIR, "preprocess.py"), "w", encoding="utf-8") as f:
    f.write('''import re
EMOJI_RE = re.compile(r"[\\U00010000-\\U0010ffff]", flags=re.UNICODE)
URL_RE   = re.compile(r"https?://\\S+|www\\.\\S+")
def clean_text(s: str) -> str:
    s = str(s).lower()
    s = URL_RE.sub(" ", s)
    s = EMOJI_RE.sub(" ", s)
    s = re.sub(r"[^a-z0-9\\s']", " ", s)
    s = re.sub(r"\\s+", " ", s).strip()
    return s
''')

print("Export terminé. Fichiers :")
print("\n".join([str(p) for p in pathlib.Path(EXPORT_DIR).glob('*')]))


Export terminé. Fichiers :
/content/lstm_service/preprocess.py
/content/lstm_service/model.keras
/content/lstm_service/labels.txt
/content/lstm_service/tokenizer.json


In [12]:
# === Bloc 2 — API FastAPI (pour .keras) ===
import os
from textwrap import dedent

app_code = dedent("""
import os, json
from typing import List, Dict
from fastapi import FastAPI
from pydantic import BaseModel
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from preprocess import clean_text

MAX_LEN = 120
MODEL_PATH = "model.keras"

with open("tokenizer.json", "r", encoding="utf-8") as f:
    tok_json = f.read()
tokenizer = tokenizer_from_json(tok_json)

with open("labels.txt", "r", encoding="utf-8") as f:
    LABELS = [line.strip() for line in f if line.strip()]

model = tf.keras.models.load_model(MODEL_PATH)

app = FastAPI(title="Toxic Comment LSTM API", version="1.0")

class PredictIn(BaseModel):
    texts: List[str]

class PredictOut(BaseModel):
    scores: List[Dict[str, float]]

@app.get("/health")
def health():
    return {"status": "ok", "labels": LABELS}

@app.post("/predict", response_model=PredictOut)
def predict(payload: PredictIn):
    cleaned = [clean_text(t) for t in payload.texts]
    seqs = tokenizer.texts_to_sequences(cleaned)
    pad = pad_sequences(seqs, maxlen=MAX_LEN, padding="post", truncating="post")
    preds = model.predict(pad, verbose=0)
    out = []
    for row in preds.tolist():
        out.append({LABELS[i]: float(row[i]) for i in range(len(LABELS))})
    return PredictOut(scores=out)
""")

with open(os.path.join(EXPORT_DIR, "app.py"), "w", encoding="utf-8") as f:
    f.write(app_code)

reqs = """fastapi==0.115.0
uvicorn[standard]==0.30.6
tensorflow==2.16.1
"""
with open(os.path.join(EXPORT_DIR, "requirements.txt"), "w", encoding="utf-8") as f:
    f.write(reqs)

print("Fichiers API écrits : app.py, requirements.txt")


Fichiers API écrits : app.py, requirements.txt


In [13]:
# === Bloc 3 — Test offline (.keras) ===
import json
import numpy as np, tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

with open(os.path.join(EXPORT_DIR, "tokenizer.json"), "r", encoding="utf-8") as f:
    tok_json = f.read()
tok2 = tokenizer_from_json(tok_json)

with open(os.path.join(EXPORT_DIR, "labels.txt"), "r", encoding="utf-8") as f:
    labels2 = [l.strip() for l in f if l.strip()]

model2 = tf.keras.models.load_model(os.path.join(EXPORT_DIR, "model.keras"))

# clean local aligné
EMOJI_RE = re.compile(r"[\U00010000-\U0010ffff]", flags=re.UNICODE)
URL_RE   = re.compile(r"https?://\S+|www\.\S+")
def clean_text_local(s):
    s = s.lower()
    s = URL_RE.sub(" ", s)
    s = EMOJI_RE.sub(" ", s)
    s = re.sub(r"[^a-z0-9\s']", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

samples = [
    "you are beauftiful",
    "you are horrible",
]
cleaned = [clean_text_local(t) for t in samples]
seqs = tok2.texts_to_sequences(cleaned)
pad = pad_sequences(seqs, maxlen=120, padding="post", truncating="post")
preds = model2.predict(pad, verbose=0)

print("Sortie prédite (scores par label) :")
for i, sc in enumerate(preds):
    print(f"- Ex{i+1}:")
    print({labels2[j]: float(sc[j]) for j in range(len(labels2))})


Sortie prédite (scores par label) :
- Ex1:
{'toxic': 0.7940958738327026, 'severe_toxic': 0.003181144827976823, 'obscene': 0.08463543653488159, 'threat': 0.012225757353007793, 'insult': 0.05774860829114914, 'identity_hate': 0.008261489681899548}
- Ex2:
{'toxic': 0.8216884732246399, 'severe_toxic': 0.003536294214427471, 'obscene': 0.09045542031526566, 'threat': 0.01316509023308754, 'insult': 0.0632026344537735, 'identity_hate': 0.009455006569623947}


In [14]:
# === Bloc 4 — Dockerfile (pour déploiement) ===
dockerfile = """
FROM python:3.10-slim

RUN apt-get update && apt-get install -y --no-install-recommends build-essential && rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

# Artefacts
COPY model.keras /app/model.keras
COPY tokenizer.json /app/tokenizer.json
COPY labels.txt /app/labels.txt
COPY preprocess.py /app/preprocess.py
COPY app.py /app/app.py

EXPOSE 8080
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080"]
"""
with open(os.path.join(EXPORT_DIR, "Dockerfile"), "w", encoding="utf-8") as f:
    f.write(dockerfile)

print("Dockerfile écrit :", os.path.join(EXPORT_DIR, "Dockerfile"))


Dockerfile écrit : /content/lstm_service/Dockerfile


## **GitHub**

In [15]:
# Colab: installer git & git-lfs
!apt-get -qq update
!apt-get -qq install -y git git-lfs
!git lfs install

# Config Git (mets ton nom et ton email GitHub)
!git config --global user.name "Willy772"
!git config --global user.email "tallawilly@icloud.com"


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Git LFS initialized.
