In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load the data (expected columns: 'text', 'label')
df = pd.read_csv("sms_agressifs_dataset.csv")

# Hyperparameters
max_words = 5000  # passed to the Tokenizer as num_words
max_len = 50      # every sequence is padded/truncated to this many tokens

# Decide train/test membership first (by row position) so the tokenizer
# vocabulary can be learned from the training rows only. Fitting it on the
# full dataset would let test-set words influence preprocessing.
row_positions = list(range(len(df)))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Tokenization: fit on the training text only, then encode every row.
# Words never seen during fitting are encoded with the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df["text"].iloc[train_pos])
sequences = tokenizer.texts_to_sequences(df["text"])
padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

# Split features and labels using the positions chosen above
X_train, X_test = padded[train_pos], padded[test_pos]
y_train, y_test = df["label"].iloc[train_pos], df["label"].iloc[test_pos]


In [2]:
import tensorflow as tf

# Binary classifier: Embedding -> bidirectional LSTM -> dense head with a
# sigmoid output.
# The input shape is declared with an explicit Input layer instead of the
# Embedding `input_length` argument, which is deprecated in Keras 3 (the Keras
# bundled with the TensorFlow 2.18.0 this notebook reports). Declaring the
# input also builds the model, so summary() can show shapes and parameters.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_len,), dtype="int32"),
    tf.keras.layers.Embedding(input_dim=max_words, output_dim=64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [3]:
# Train for 5 epochs with mini-batches of 16 and keep the per-epoch metrics.
# NOTE(review): validation_data is the test split, so the reported val_*
# metrics are not an independent hold-out estimate -- confirm this is intended.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=16,
    validation_data=(X_test, y_test),
)


Epoch 1/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 53ms/step - accuracy: 0.6570 - loss: 0.6370 - val_accuracy: 1.0000 - val_loss: 0.0538
Epoch 2/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 45ms/step - accuracy: 1.0000 - loss: 0.0285 - val_accuracy: 1.0000 - val_loss: 0.0022
Epoch 3/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 54ms/step - accuracy: 1.0000 - loss: 0.0011 - val_accuracy: 1.0000 - val_loss: 5.0798e-04
Epoch 4/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step - accuracy: 1.0000 - loss: 3.1379e-04 - val_accuracy: 1.0000 - val_loss: 2.5367e-04
Epoch 5/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 1.0000 - loss: 1.7204e-04 - val_accuracy: 1.0000 - val_loss: 1.4949e-04


In [6]:
def predire_sms(sms):
    """Classify a single SMS and print the verdict.

    Args:
        sms: Raw message text (one string).

    Prints "Agressif" when the model's sigmoid output is >= 0.5 and
    "Non agressif" otherwise. Returns nothing.

    Relies on the notebook-level `tokenizer`, `model` and `max_len`.
    """
    seq = tokenizer.texts_to_sequences([sms])
    # Mirror the training preprocessing exactly: training pads AND truncates
    # with 'post'. pad_sequences truncates with 'pre' by default, which would
    # keep the end of a long message here while training kept the start.
    pad = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    # predict() returns one row per input with a single sigmoid value
    proba = model.predict(pad)[0][0]
    print("Agressif 😠" if proba >= 0.5 else "Non agressif 😊")

predire_sms("Tu es une femme merveilleuse.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Non agressif 😊


In [5]:
# Report the Python and TensorFlow versions used by this kernel.
import platform
import tensorflow as tf

# platform.python_version() describes the interpreter running this kernel,
# whereas a `!python --version` shell escape reports whichever `python` is
# first on PATH. The printed format stays "Python X.Y.Z".
print("Python", platform.python_version())
print(tf.__version__)

Python 3.11.13
2.18.0


In [7]:
import pickle

# Persist the trained model to an HDF5 file.
# NOTE(review): recent Keras releases treat .h5 as a legacy format -- confirm
# whether consumers of this file can move to the native .keras format.
model.save("modele_lstm_sms.h5")

# Persist the fitted tokenizer so inference can reuse the same vocabulary.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)




In [8]:
from google.colab import files

# Download the saved model and tokenizer from the Colab runtime, in the same
# order as they were saved.
for artifact_name in ("modele_lstm_sms.h5", "tokenizer.pkl"):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>