In [None]:
import pandas as pd
import numpy as np
import re
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Embedding
from keras import optimizers
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# 1. Chargement des donn√©es
train_data = pd.read_csv('SQLIV3_cleaned2.csv')
test_data = pd.read_csv('sqliv2_utf8.csv')

# 2. Suppression des doublons (en gardant la premi√®re occurrence)
train_data.drop_duplicates(subset='Sentence', keep='first', inplace=True)
test_data.drop_duplicates(subset='Sentence', keep='first', inplace=True)

# 3. Affichage de la structure des datasets
print("Structure du train_data :")
print(train_data.info())
print("\nStatistiques :")
print(train_data.describe(include='all'))

print("\nStructure du test_data :")
print(test_data.info())

# 4. Nettoyage minimal (on garde les caract√®res sp√©ciaux)
def clean_text(text):
    return str(text).strip()

train_data['Sentence'] = train_data['Sentence'].apply(clean_text)
test_data['Sentence'] = test_data['Sentence'].apply(clean_text)

# 5. Tokenisation
vocab_size = 15000
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token="<OOV>",
    filters='',        # Garde les caract√®res sp√©ciaux
    lower=False
)
tokenizer.fit_on_texts(train_data['Sentence'])

# 6. S√©quences et padding
max_len = int(np.percentile([len(x.split()) for x in train_data['Sentence']], 95))
X = tokenizer.texts_to_sequences(train_data['Sentence'])
X = pad_sequences(X, padding='post', maxlen=max_len)
y = train_data['Label'].astype('int')

# 7. Split train/val
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 8. Architecture du mod√®le LSTM
embedding_dim = 256
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    LSTM(256, return_sequences=True),
    Dropout(0.3),
    LSTM(128),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# 9. Compilation du mod√®le
optimizer = optimizers.Adam(learning_rate=0.0001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])


# 11. Entra√Ænement
history = model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=64,
    validation_data=(X_val, y_val),
)

# 12. Pr√©paration des donn√©es de test
X_test = tokenizer.texts_to_sequences(test_data['Sentence'])
X_test = pad_sequences(X_test, padding='post', maxlen=max_len)
y_test = test_data['Label'].astype('int')

# 13. √âvaluation
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'\n‚úÖ Test Accuracy: {test_accuracy:.4f}')


Structure du train_data :
<class 'pandas.core.frame.DataFrame'>
Index: 30600 entries, 0 to 30613
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  30600 non-null  object
 1   Label     30600 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 717.2+ KB
None

Statistiques :
                                 Sentence         Label
count                               30600  30600.000000
unique                              30600           NaN
top     " or pg_sleep  (  __TIME__  )  --           NaN
freq                                    1           NaN
mean                                  NaN      0.370654
std                                   NaN      0.482988
min                                   NaN      0.000000
25%                                   NaN      0.000000
50%                                   NaN      0.000000
75%                                   NaN      1.000000
max                           

In [3]:
from sklearn.metrics import classification_report

# 1. R√©cup√©rer uniquement les lignes spam (Label = 1)
spam_only = test_data[test_data['Label'] == 1].copy()
print(f"\nüìå Nombre total de requ√™tes spam dans test_data : {len(spam_only)}")

# 2. Nettoyage si besoin
spam_only['Sentence'] = spam_only['Sentence'].apply(clean_text)

# 3. Tokenisation + Padding
X_spam = tokenizer.texts_to_sequences(spam_only['Sentence'])
X_spam = pad_sequences(X_spam, padding='post', maxlen=max_len)

# 4. Pr√©diction
spam_preds = model.predict(X_spam)
spam_preds_labels = (spam_preds > 0.5).astype(int)

# 5. Calcul du nombre de spams correctement d√©tect√©s
true_positives = np.sum(spam_preds_labels == 1)
total_spams = len(spam_only)
detection_rate = (true_positives / total_spams) * 100

# 6. Affichage
print(f"‚úÖ Spams correctement d√©tect√©s : {true_positives}")
print(f"üìä Taux de d√©tection : {detection_rate:.2f}%")



üìå Nombre total de requ√™tes spam dans test_data : 11424
‚úÖ Spams correctement d√©tect√©s : 11362
üìä Taux de d√©tection : 99.46%
