In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc

In [None]:
# Load the SMS dataset: the file is tab-separated and has no header row.
data=pd.read_csv("SMSSpamCollection.csv" , sep="\t" , header=None)

In [None]:
# Name the two columns, then preview the first five rows.
data.columns=["label" , "body_text"]
data.head(5)

In [None]:
# Dimensions of the frame as (rows, columns).
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics of the columns.
data.describe()

In [None]:
# Number of messages per label.
data.label.value_counts()

In [None]:
# Pie chart of the share of each label (normalize=True yields proportions).
data.label.value_counts(normalize=True).plot.pie()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Whitespace-separated word count of each message.
data['msg_len'] = data['body_text'].map(lambda message: len(message.split()))

# Overlaid length histograms (with KDE curves), one colour per label.
plt.figure(figsize=(10,5))
sns.histplot(data=data, x='msg_len', hue='label', bins=50, kde=True)
plt.xlabel("Message Length (words)")
plt.ylabel("Frequency")
plt.title("Distribution of Message Length (Spam vs Ham)")
plt.show()


3. Data Preparation-Pre-processing text data

2. تنظيف النصوص (Text Cleaning)

تحويل الحروف لصيغة موحدة (Lowercasing).

إزالة الأرقام (لو مش مهمة).

إزالة الرموز الخاصة (punctuations, emojis لو مش محتاجها).

إزالة المسافات الزائدة.

🔹 3. إزالة الكلمات عديمة القيمة (Stopwords Removal)

كلمات زي "the, is, in, من، على" غالبًا مش بتضيف معنى كبير.

🔹 4. تجزئة النص (Tokenization)

تقسم النص إلى كلمات أو جُمل.

مثال: "I love NLP" → ["I", "love", "NLP"].

🔹 5. تحويل الكلمات لصيغها الجذرية أو الأساسية

Stemming: إرجاع الكلمة لجذرها (e.g., "playing" → "play").

Lemmatization: إرجاع الكلمة لصيغتها الصحيحة (e.g., "better" → "good").

🔹 6. تحويل النص لتمثيل عددي (Feature Extraction)

عشان الموديل يفهم، النص لازم يتحول لأرقام:

Bag of Words (BoW)

TF-IDF

Word Embeddings (Word2Vec, GloVe, FastText)

Transformers Embeddings (BERT, GPT …)

🔹 7. التعامل مع النصوص الغير متوازنة أو طويلة

Padding/Truncating: عشان النصوص يكون لها نفس الطول.

Oversampling/Undersampling: لو عندك كلاس موزون بشكل سيء.

🔹 8. التقسيم (Train-Test Split)

تقسم البيانات لـ:

تدريب (Train)

تحقق (Validation)

اختبار (Test)

In [None]:
import string
# Show the characters that the standard library classifies as punctuation.
string.punctuation

In [None]:

def remove_punct(text):
    """Return `text` with every character listed in `string.punctuation` removed."""
    kept_chars = (ch for ch in text if ch not in string.punctuation)
    return "".join(kept_chars)

# Punctuation-free copy of every message (case is left untouched here).
data['body_text_nopunc'] = data['body_text'].apply(remove_punct)

data.head()

In [None]:

# Rebuild the punctuation-free column, this time lower-casing each message
# before stripping punctuation. `remove_punct` is already defined in the
# previous cell, so it is reused here instead of being declared a second time
# (a duplicate definition would silently shadow the first one).
data['body_text_nopunc'] = data['body_text'].apply(lambda message: remove_punct(message.lower()))

data.head()

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
import re
# Tokenization in this cell is delegated to NLTK's `word_tokenize`;
# the cell itself does not split on a `\W+` regular expression.

def tokenize(text):
    """Return the list of tokens that `word_tokenize` produces for `text`."""
    return word_tokenize(text)


data['body_text_tokenized'] = data['body_text_nopunc'].apply(tokenize)

data.head()

3.4 Remove stopwords
Stopwords are common words that are present in the text but generally do not contribute to the meaning of a sentence. They hold almost no importance for the purposes of information retrieval and natural language processing. They can safely be ignored without sacrificing the meaning of the sentence. For example – ‘the’ and ‘a’.

Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query.

NLTK ships its stop-word lists as a separate corpus that has to be downloaded once (`nltk.download('stopwords')`). It provides stop words for more than 16 languages. After downloading, the list for a language can be loaded and used to filter those words out of the tokens.

import nltk
from nltk.corpus import stopwords
set(stopwords.words('english'))

In [None]:
import nltk
from nltk.corpus import stopwords

# Load the English stop-word list. NLTK raises LookupError when the corpus has
# not been downloaded yet, so fetch it once and retry; this keeps the notebook
# runnable on a fresh environment.
try:
    stopwords_En = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords_En = stopwords.words('english')
stopwords_En

In [None]:
def remove_stopwords(tokenized_list, stop_words=None):
    """Return the tokens of `tokenized_list` that are not stop words.

    Parameters
    ----------
    tokenized_list : iterable of str
        Tokens of one message, in order.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level `stopwords_En` list is
        used, which matches the previous behaviour.

    Returns
    -------
    list of str
        The remaining tokens in their original order (duplicates kept).
    """
    if stop_words is None:
        stop_words = stopwords_En
    # A set gives constant-time membership tests instead of scanning the list
    # once per token.
    stop_set = set(stop_words)
    return [word for word in tokenized_list if word not in stop_set]

# Token lists with the stop words filtered out.
data['body_text_nostop'] = data['body_text_tokenized'].apply(remove_stopwords)

data.head()

🔹 1. Stemming

هو عملية قص أو تقطيع الكلمة عشان نوصل لجذرها (Root)، لكن بشكل بسيط وسريع من غير ما يهتم بالمعنى اللغوي.

النتيجة ساعات بتكون كلمة غير موجودة فعليًا في القاموس.

✅ مثال:

"Studies" → "Studi"

"Studying" → "Study" أو أحيانًا "Studi"

"Better" → "Better" (ممكن يسيبها زي ما هي لأن الـ stemmer مش فاهم المعنى)

📌 أداة مشهورة: Porter Stemmer أو Snowball Stemmer في مكتبة NLTK.

ميزة: أسرع.

عيب: ممكن تطلع نتائج مش دقيقة أو كلمات مالهاش معنى.

🔹 2. Lemmatization

بتعتمد على القاموس (Dictionary) + القواعد اللغوية (Morphology).

بتديك الجذر الصحيح للكلمة اللي موجود فعلًا في القاموس.

بتاخد في اعتبارها نوع الكلمة (POS Tag: verb, noun, adjective, ...).

✅ مثال:

"Studies" → "Study" (اسم أو فعل)

"Studying" → "Study"

"Better" → "Good" (لأنها فاهمة إن "better" هي صيغة تفضيل من "good")

📌 أداة مشهورة: WordNet Lemmatizer في NLTK أو spacy Lemmatizer.

ميزة: أدق.

عيب: أبطأ من stemming لأنها محتاجة قاعدة بيانات لغوية.

🔑 الخلاصة:

لو عايز سرعة (زي في search engines) → استخدم Stemming.

لو عايز دقة ومعنى صحيح (زي NLP applications أو chatbots) → استخدم Lemmatization.

In [None]:
import nltk
# Single Porter stemmer instance shared by all calls below.
ps = nltk.PorterStemmer()


def stemming(tokenized_text):
    """Return a list holding `ps.stem(token)` for every token, in order."""
    return list(map(ps.stem, tokenized_text))


data['body_text_stemmed'] = data['body_text_nostop'].apply(stemming)

data.head()

In [None]:
import nltk

# WordNet-based lemmatizer instance.
wn = nltk.WordNetLemmatizer()
# Porter stemmer instance.
ps = nltk.PorterStemmer()

In [None]:
def lemmatizing(tokenized_text):
    """Return a list holding `wn.lemmatize(token)` for every token, in order.

    No part-of-speech argument is passed to `lemmatize`.
    """
    return list(map(wn.lemmatize, tokenized_text))


data['body_text_lemmatized'] = data['body_text_nostop'].apply(lemmatizing)

data.head(5)

Lemmatizer ⇒ أدق في المعنى

In [None]:

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

def clean_text_lemma(text):
    """Normalize one message for vectorization (lemmatizing variant).

    Steps, in order: drop every character found in `string.punctuation` and
    lower-case the rest; tokenize with `word_tokenize`; discard tokens present
    in `stopwords_En`; lemmatize the remaining tokens with `lemmatizer`.

    Returns the surviving lemmas joined by single spaces.
    """
    no_punct = "".join([char.lower() for char in text if char not in string.punctuation])
    tokens = word_tokenize(no_punct)

    # Set membership is constant-time; the original scanned the stop-word list
    # once for every token.
    stop_set = set(stopwords_En)
    return " ".join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_set])


# Keep only the two source columns. The explicit .copy() makes the result an
# independent frame, so adding a column below cannot be treated by pandas as a
# write into a slice of the previous frame (SettingWithCopyWarning on pandas
# versions that emit it).
data = data[['label', 'body_text']].copy()
data['cleaned_text'] = data['body_text'].apply(clean_text_lemma)
data

In [None]:

### Function that removes punctuation, tokenizes, removes stopwords and stems (stemming is the faster option)

def clean_text(text):
    """Normalize one message for vectorization (stemming variant).

    Steps, in order: drop every character found in `string.punctuation` and
    lower-case the rest; tokenize with `word_tokenize`; discard tokens present
    in `stopwords_En`; stem the remaining tokens with `ps`.

    Returns the surviving stems joined by single spaces.
    """
    # The loop runs over characters of the message, not over words.
    no_punct = "".join([char.lower() for char in text if char not in string.punctuation])
    tokens = word_tokenize(no_punct)

    # Set membership is constant-time; the original scanned the stop-word list
    # once for every token.
    stop_set = set(stopwords_En)
    return " ".join([ps.stem(word) for word in tokens if word not in stop_set])


# Keep only the two source columns. The explicit .copy() makes the result an
# independent frame, so adding a column below cannot be treated by pandas as a
# write into a slice of the previous frame (SettingWithCopyWarning on pandas
# versions that emit it).
data = data[['label', 'body_text']].copy()
data['cleaned_text'] = data['body_text'].apply(clean_text)

In [None]:
# Display the frame, including the cleaned_text column.
data

🔑 الخلاصة:

لو مشروع بسيط (تصنيف Spam/Not Spam مثلاً) → استخدم TF-IDF.

لو مشروع عايز معنى أعمق (Sentiment Analysis, Chatbot, Translation) → استخدم Word Embeddings.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# Bigram-only TF-IDF representation of the cleaned messages.
tfidf = TfidfVectorizer( ngram_range=(2,2))

# Fit once: the result is a SciPy sparse matrix with one row per message and
# one column per bigram in the learned vocabulary.
features_tfidf = tfidf.fit_transform(data['cleaned_text'])
print(features_tfidf.shape)
print('Sparse Matrix :\n', features_tfidf)

# A sparse matrix carries no column labels, so nothing is assigned to a
# `.columns` attribute; column j corresponds to tfidf.get_feature_names_out()[j].
features_tfidf

In [None]:
import numpy as np

import numpy as np
import pandas as pd

# Select the 30 features with the largest total TF-IDF weight so that the
# "Top 30" in the plot title is accurate. A plain `[:, :30]` slice would take
# the first 30 vocabulary columns instead, regardless of their weight.
TOP_K = 30
feature_names = tfidf.get_feature_names_out()
# Column sums of the sparse matrix, flattened to a 1-D array.
total_weight = np.asarray(features_tfidf.sum(axis=0)).ravel()
# Stable sort on the negated weights: descending order, ties keep column order.
top_idx = np.argsort(-total_weight, kind="stable")[:TOP_K]

sample_features = features_tfidf[:, top_idx].toarray()

features_df = pd.DataFrame(sample_features, columns=feature_names[top_idx])


# Rows of the transposed frame are features, so `corr` is feature-by-feature.
corr = np.corrcoef(features_df.T)

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12,8))
sns.heatmap(corr, cmap="coolwarm", 
            xticklabels=features_df.columns, 
            yticklabels=features_df.columns)
plt.title("Correlation Heatmap of Top 30 TF-IDF Features")
plt.show()


In [None]:
# TF-IDF over 1- to 3-grams, keeping at most 5000 features.
tfidf = TfidfVectorizer(ngram_range=(1,3), max_features=5000)
# X: document-term matrix of the cleaned text; y: the label column.
X = tfidf.fit_transform(data['cleaned_text'])
y = data['label']

In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned text *before* vectorizing, so the TF-IDF vocabulary and
# IDF weights are learned from training messages only (no test-set leakage).
y = data['label']
text_train, text_test, y_train, y_test = train_test_split(
    data['cleaned_text'], y, test_size=0.2, random_state=42, stratify=y
)

# Re-fit `tfidf` (the vectorizer configured in the previous cell) on the
# training text and reuse it for the test text. The original code replaced X
# with `features_tfidf`, a matrix built by a different, bigram-only vectorizer,
# so the fitted models did not match the `tfidf` object that is saved with
# joblib at the end of the notebook.
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Candidate classifiers, keyed by the name used in reports and plots.
models = dict([
    ("Naive Bayes", MultinomialNB()),
    ("Logistic Regression", LogisticRegression(max_iter=200)),
    ("SVM", LinearSVC()),
])

# name -> accuracy on the test split
results = {}

for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    results[name] = acc

    print(f"\n{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))


In [None]:
# Accuracy comparison: one bar per model in `results`.
model_names = list(results.keys())
accuracies = list(results.values())

plt.figure(figsize=(6,4))
sns.barplot(x=model_names, y=accuracies, palette="viridis")
plt.title("Model Comparison")
plt.ylabel("Accuracy")
plt.show()


In [None]:
# Fit a Naive Bayes model and predict on the test split.
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Fix the row/column order explicitly so that it is guaranteed to match the
# "Ham"/"Spam" tick labels below, instead of relying on the order inferred
# from the data.
class_order = ["ham", "spam"]
cm = confusion_matrix(y_test, y_pred_nb, labels=class_order)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Ham", "Spam"], yticklabels=["Ham", "Spam"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - Naive Bayes")
plt.show()

In [None]:
# One long string of cleaned text per class.
is_spam = data['label']=='spam'
is_ham = data['label']=='ham'
spam_words = " ".join(data[is_spam]['cleaned_text'])
ham_words = " ".join(data[is_ham]['cleaned_text'])

spam_wc = WordCloud(width=800, height=400, background_color='black', colormap='Reds').generate(spam_words)
ham_wc = WordCloud(width=800, height=400, background_color='white', colormap='Blues').generate(ham_words)

# Side-by-side panels: spam on the left, ham on the right.
plt.figure(figsize=(12,5))
panels = [(spam_wc, "Spam WordCloud"), (ham_wc, "Ham WordCloud")]
for position, (cloud, heading) in enumerate(panels, start=1):
    plt.subplot(1,2,position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(heading)
plt.show()


In [None]:
# Second column of predict_proba, used as the score for the class mapped to 1.
y_pred_prob = nb.predict_proba(X_test)[:,1]

# roc_curve is given 0/1 targets: ham -> 0, spam -> 1.
y_true_binary = y_test.map({'ham':0,'spam':1})
fpr, tpr, thresholds = roc_curve(y_true_binary, y_pred_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0,1],[0,1], color='navy', lw=2, linestyle="--")
plt.title("ROC Curve - Naive Bayes")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

# Fit a linear SVM on the training split.
sv = LinearSVC()
sv.fit(X_train, y_train)

# decision_function scores are the ranking signal for the ROC curve.
y_scores = sv.decision_function(X_test)

# roc_curve is given 0/1 targets: ham -> 0, spam -> 1.
y_true_binary = y_test.map({'ham':0,'spam':1})
fpr, tpr, thresholds = roc_curve(y_true_binary, y_scores)
roc_auc = auc(fpr, tpr)

import matplotlib.pyplot as plt
plt.plot(fpr, tpr, label=f"SVM (AUC = {roc_auc:.2f})")
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0,1],[0,1],'k--')
plt.title("ROC Curve - SVM")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Fit a logistic-regression model on the training split.
sv1 = LogisticRegression(max_iter=200)
sv1.fit(X_train, y_train)


# decision_function scores are the ranking signal for the ROC curve.
y_scores = sv1.decision_function(X_test)

# roc_curve is given 0/1 targets: ham -> 0, spam -> 1.
fpr, tpr, thresholds = roc_curve(y_test.map({'ham':0,'spam':1}), y_scores)
roc_auc = auc(fpr, tpr)

import matplotlib.pyplot as plt
# The legend entry previously said "SVM" (copied from the SVM cell); this
# curve belongs to the logistic-regression model.
plt.plot(fpr, tpr, label=f"Logistic Regression (AUC = {roc_auc:.2f})")
plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - logisticRegression")
plt.legend()
plt.show()


In [None]:
import joblib
# Train a fresh Naive Bayes model on the training split and write it to disk.
nb = MultinomialNB()
nb.fit(X_train, y_train)
joblib.dump(nb, "spam_model.pkl")
# NOTE(review): `tfidf` must be the same vectorizer that produced X_train,
# otherwise the saved model and vectorizer disagree on the feature space — verify.
joblib.dump(tfidf, "tfidf_vectorizer.pkl")


