In [57]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [58]:
# Load the raw SMS dataset; latin1 decoding is requested explicitly
# (presumably the file is not valid UTF-8 — TODO confirm).
df = pd.read_csv("spam.csv", encoding="latin1")
# Preview the first rows to see which columns carry the label and the message.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [59]:
# Column dtypes and non-null counts, to spot columns that are mostly empty.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [60]:
# Per-column summary (count / unique / top / freq).
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [61]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

np.int64(403)

In [62]:
# Missing-value count per column.
df.isnull().sum()

Unnamed: 0,0
v1,0
v2,0
Unnamed: 2,5522
Unnamed: 3,5560
Unnamed: 4,5566


## **Choose Columns**

In [63]:
# Keep only the label and message columns, under descriptive names.
# rename() ignores keys that are not present, so re-running this cell after the
# columns were already renamed is a no-op instead of a KeyError on 'v1'. It also
# returns a new frame rather than assigning `.columns` on a slice.
df = df.rename(columns={'v1': 'label', 'v2': 'text'})[['label', 'text']]

## **Preprocessing**

**Remove Duplicates**

In [64]:
# Drop repeated messages. Only 'text' is compared, so the label of a dropped
# row is not taken into account.
df = df.drop_duplicates(subset=['text'])

In [65]:
# Sanity check: no fully duplicated rows should remain.
df.duplicated().sum()

np.int64(0)

In [66]:
# Sanity check: neither remaining column should contain missing values.
df.isnull().sum()

Unnamed: 0,0
label,0
text,0


In [67]:
# Class balance after de-duplication.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4516
spam,653


## **Label Encoding**

In [68]:
# Map the string labels to integers: ham (not spam) -> 0, spam -> 1.
le = LabelEncoder()
le.fit(df['label'])
df['label'] = le.transform(df['label'])

df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## **Text Cleaning**

In [69]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus (its download log is printed under the cell).
nltk.download('stopwords')
# English stopwords held as a set for fast membership tests in clean_text().
stop_words = set(stopwords.words('english'))

def clean_text(text, stopword_set=None):
    """Normalize one raw message into a lowercase, stopword-free string.

    Steps: lowercase, strip ``<...>`` HTML-style tags, delete every character
    that is not ``a-z`` or whitespace, collapse whitespace, drop stopwords.
    Deleted characters are removed without inserting a space, so words joined
    only by punctuation or digits are fused (``"win!now"`` -> ``"winnow"``).

    Args:
        text: The message to clean. ``None`` and float NaN (pandas' missing
            value) yield ``''``; any other non-string is converted with
            ``str()`` before cleaning.
        stopword_set: Optional collection of lowercase words to remove. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The remaining tokens joined by single spaces (possibly ``''``).
    """
    # Missing values have no .lower(); treat them as an empty message instead
    # of raising AttributeError in the middle of a DataFrame.apply().
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if stopword_set is None:
        stopword_set = stop_words

    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)      # remove html tags
    text = re.sub(r'[^a-z\s]', '', text)   # keep only a-z and whitespace

    # split() with no argument already trims and collapses whitespace runs,
    # so no separate whitespace-normalisation pass is needed before it.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Clean every message; the raw text column is overwritten with the result.
# NOTE(review): de-duplication ran on the raw text, so distinct messages that
# clean to the same string can still end up in different splits — confirm this
# is acceptable.
df['text'] = df['text'].map(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **Split Dataset**

In [70]:
from sklearn.model_selection import train_test_split

# Features are the cleaned messages (forced to str); targets are the 0/1 labels.
X, y = df['text'].astype(str).values, df['label'].values

# First cut: 80% train, 20% held out, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Second cut: divide the held-out 20% evenly into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    stratify=y_temp,
    test_size=0.5,
    random_state=42,
)

print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

Train: 4135, Validation: 517, Test: 517


## **Tokenization**

In [71]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 5000

# The vocabulary is fitted on the training split only; the validation and test
# splits are encoded with that same fitted tokenizer.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# Encode each split as lists of integer word indices.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_val, X_test)
)

## **Padding**

In [72]:
max_len = 100

# Every split is padded / truncated at the end ('post') to exactly max_len tokens.
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    for seqs in (X_train_seq, X_val_seq, X_test_seq)
)


## **Build RNN Model**

In [73]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# ---------- Hyperparameters ----------
# The first two are derived from the tokenizer / padding settings so that the
# embedding table and the input length cannot drift from what preprocessing
# actually produced (they were previously repeated as separate literals).
MAX_NUM_WORDS = vocab_size   # vocabulary size, same cap the Tokenizer uses
MAX_SEQ_LEN = max_len        # sequence length after padding
EMBEDDING_DIM = 100          # embedding dimensions
LSTM_UNITS = 64
BATCH_SIZE = 64
EPOCHS = 7
SEED = 42                    # same value as random_state in the data split
# ------------------------------------

# Seed Python, NumPy and TensorFlow before any weights are created so that
# initialisation, dropout and batch shuffling repeat from run to run.
set_random_seed(SEED)

# ---------- 4) Build the RNN model (Bidirectional LSTM) ----------
model = Sequential([
    # An explicit Input layer replaces Embedding's `input_length` argument
    # (deprecated in Keras 3) and gives the model a known input shape up front.
    Input(shape=(MAX_SEQ_LEN,)),
    Embedding(input_dim=MAX_NUM_WORDS, output_dim=EMBEDDING_DIM),
    Bidirectional(LSTM(LSTM_UNITS, return_sequences=False)),
    Dropout(0.4),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')   # binary classification
])

model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()




**Training Callbacks**

In [74]:
# Write the model to disk each time validation accuracy reaches a new best.
checkpoint = ModelCheckpoint(
    filepath="best_sms_lstm.h5",
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)
# Watch validation loss with a patience of 4 epochs.
earlystop = EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True,
)

## **Train Model**

In [75]:
history = model.fit(
    x=X_train_pad,                        # padded training sequences
    y=y_train,
    validation_data=(X_val_pad, y_val),   # padded validation sequences
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[checkpoint, earlystop],
    verbose=2,
)

Epoch 1/7

Epoch 1: val_accuracy improved from -inf to 0.97099, saving model to best_sms_lstm.h5




65/65 - 20s - 301ms/step - accuracy: 0.9016 - loss: 0.2903 - val_accuracy: 0.9710 - val_loss: 0.1131
Epoch 2/7

Epoch 2: val_accuracy improved from 0.97099 to 0.97872, saving model to best_sms_lstm.h5




65/65 - 15s - 230ms/step - accuracy: 0.9819 - loss: 0.0694 - val_accuracy: 0.9787 - val_loss: 0.0612
Epoch 3/7

Epoch 3: val_accuracy did not improve from 0.97872
65/65 - 27s - 417ms/step - accuracy: 0.9927 - loss: 0.0292 - val_accuracy: 0.9787 - val_loss: 0.0683
Epoch 4/7

Epoch 4: val_accuracy improved from 0.97872 to 0.98066, saving model to best_sms_lstm.h5




65/65 - 23s - 354ms/step - accuracy: 0.9961 - loss: 0.0166 - val_accuracy: 0.9807 - val_loss: 0.0611
Epoch 5/7

Epoch 5: val_accuracy improved from 0.98066 to 0.98259, saving model to best_sms_lstm.h5




65/65 - 33s - 508ms/step - accuracy: 0.9978 - loss: 0.0113 - val_accuracy: 0.9826 - val_loss: 0.0587
Epoch 6/7

Epoch 6: val_accuracy improved from 0.98259 to 0.98453, saving model to best_sms_lstm.h5




65/65 - 16s - 242ms/step - accuracy: 0.9990 - loss: 0.0051 - val_accuracy: 0.9845 - val_loss: 0.0662
Epoch 7/7

Epoch 7: val_accuracy did not improve from 0.98453
65/65 - 19s - 285ms/step - accuracy: 0.9978 - loss: 0.0058 - val_accuracy: 0.9826 - val_loss: 0.0773


## **Model Evaluation on Test set**

In [76]:
# ---------- Predict on the test set ----------
# NOTE(review): `model` carries whatever weights fit() left in memory; the
# checkpoint file written during training is not reloaded in the visible
# cells — confirm that is intended.
y_pred_prob = model.predict(X_test_pad)          # probabilities between 0 and 1
y_pred = (y_pred_prob > 0.5).astype(int)         # threshold into binary labels (0=ham, 1=spam)


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 97ms/step


In [77]:
from sklearn.metrics import classification_report, confusion_matrix

# Compute both summaries first, then print them under their headings.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

print("\nClassification Report:")
print(report)



Confusion Matrix:
[[451   1]
 [  5  60]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       452
           1       0.98      0.92      0.95        65

    accuracy                           0.99       517
   macro avg       0.99      0.96      0.97       517
weighted avg       0.99      0.99      0.99       517



In [78]:
# Evaluate the model on the test set (silent mode) and report its accuracy.
test_loss, test_acc = model.evaluate(
    x=X_test_pad,
    y=y_test,
    verbose=0,
)
print(f"Test Accuracy: {test_acc:.4f}")

Test Accuracy: 0.9884


In [80]:
# Run one unseen message through the same pipeline used for training:
# clean -> tokenize -> pad -> predict.
new_text = "Free entry in 2 a wkly comp to win FA Cup!"
new_text_clean = clean_text(new_text)

new_seq = tokenizer.texts_to_sequences([new_text_clean])
new_pad = pad_sequences(
    new_seq,
    maxlen=MAX_SEQ_LEN,
    truncating='post',
    padding='post',
)

# One probability for the single input; values above 0.5 are labelled spam.
pred_prob = model.predict(new_pad)
pred_label = (pred_prob > 0.5).astype(int)

print(f"Predicted Probability: {pred_prob[0][0]:.4f}")
print(f"Predicted Label: {'spam' if pred_label[0][0]==1 else 'ham'}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
Predicted Probability: 0.9951
Predicted Label: spam
