In [1]:
import pandas as pd
import numpy as np
import re
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Dropout, Embedding
from keras import optimizers
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# 1. Chargement des données
train_data = pd.read_csv('SQLIV3_cleaned2.csv')
test_data = pd.read_csv('sqliv2_utf8.csv')

# 2. Suppression des doublons (en gardant la première occurrence)
train_data.drop_duplicates(subset='Sentence', keep='first', inplace=True)
test_data.drop_duplicates(subset='Sentence', keep='first', inplace=True)

# 3. Nettoyage MINIMAL (on conserve les caractères spéciaux !)
def clean_text(text):
    text = str(text).strip()  # Conversion en string + suppression espaces inutiles
    return text

train_data['Sentence'] = train_data['Sentence'].apply(clean_text)
test_data['Sentence'] = test_data['Sentence'].apply(clean_text)

# 4. Tokenisation (on garde tous les caractères)
vocab_size = 15000  # Vocabulaire large pour les motifs SQL
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token="<OOV>",
    filters='',       # ← AUCUN filtre (conserve ', ", ;, -- etc.)
    lower=False       # ← Conserve la casse (important pour SQL)
)
tokenizer.fit_on_texts(train_data['Sentence'])

# 5. Padding adaptatif
max_len = int(np.percentile([len(x.split()) for x in train_data['Sentence']], 95))
X = tokenizer.texts_to_sequences(train_data['Sentence'])
X = pad_sequences(X, padding='post', maxlen=max_len)
y = train_data['Label'].astype('int')

# 6. Split train/validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Architecture du modèle
embedding_dim = 256  # Grande dimension pour les caractères spéciaux
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    SimpleRNN(256, return_sequences=True),  # Couche 1: capture les motifs locaux
    Dropout(0.3),
    SimpleRNN(128),                         # Couche 2: agrège les motifs
    Dense(64, activation='relu'),           # Couche dense intermédiaire
    Dense(1, activation='sigmoid')          # Sortie binaire
])

# 8. Optimisation
optimizer = optimizers.Adam(learning_rate=0.0001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# 9. Entraînement
history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=64,
    validation_data=(X_val, y_val),
)

# 10. Évaluation
X_test = tokenizer.texts_to_sequences(test_data['Sentence'])
X_test = pad_sequences(X_test, padding='post', maxlen=max_len)
test_loss, test_accuracy = model.evaluate(X_test, test_data['Label'].astype('int'))
print(f'\nTest Accuracy: {test_accuracy:.4f}')


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Test Accuracy: 0.9933


In [2]:
from sklearn.metrics import classification_report

# 1. Récupérer uniquement les lignes spam (Label = 1)
spam_only = test_data[test_data['Label'] == 1].copy()
print(f"\n📌 Nombre total de requêtes spam dans test_data : {len(spam_only)}")

# 2. Nettoyage si besoin
spam_only['Sentence'] = spam_only['Sentence'].apply(clean_text)

# 3. Tokenisation + Padding
X_spam = tokenizer.texts_to_sequences(spam_only['Sentence'])
X_spam = pad_sequences(X_spam, padding='post', maxlen=max_len)

# 4. Prédiction
spam_preds = model.predict(X_spam)
spam_preds_labels = (spam_preds > 0.5).astype(int)

# 5. Calcul du nombre de spams correctement détectés
true_positives = np.sum(spam_preds_labels == 1)
total_spams = len(spam_only)
detection_rate = (true_positives / total_spams) * 100

# 6. Affichage
print(f"✅ Spams correctement détectés : {true_positives}")
print(f"📊 Taux de détection : {detection_rate:.2f}%")



📌 Nombre total de requêtes spam dans test_data : 11424
✅ Spams correctement détectés : 11353
📊 Taux de détection : 99.38%
