In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc

In [None]:
# Load the tab-separated SMS file. There is no header row, so pandas numbers the columns.
# NOTE(review): pandas' default quote handling is in effect here; if any message contains
# an unbalanced double quote, neighbouring lines can be merged into one row — confirm the
# row count against the raw file.
data = pd.read_csv(
    "SMSSpamCollection.csv",
    sep="\t",
    header=None,
)

In [None]:
# Give the two columns readable names, then preview the first five rows.
data = data.set_axis(["label", "body_text"], axis=1)
data.head()

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
data.shape

In [None]:
# Column dtypes, non-null counts and memory usage of the frame.
data.info()

In [None]:
# Summary statistics for the frame's columns.
data.describe()

In [None]:
# Number of messages per label value (shows how balanced the classes are).
data.label.value_counts()

In [None]:
# Pie chart of the relative frequency of each label value.
label_share = data["label"].value_counts(normalize=True)
label_share.plot.pie()
plt.show()

In [None]:
# Word count per message: split the raw text on whitespace and count the pieces.
data['msg_len'] = [len(message.split()) for message in data['body_text']]

# Overlaid histograms (with KDE curves) of message length, one colour per label.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=data, x='msg_len', hue='label', bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Message Length (Spam vs Ham)")
ax.set_xlabel("Message Length (words)")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below (this only needs to succeed once per environment).
nltk.download('punkt')
# Newer NLTK releases load the tokenizer data used by word_tokenize from 'punkt_tab'
# rather than 'punkt'; without it word_tokenize raises LookupError. On older releases
# that do not know this package, the call only reports that it was not found.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Shared helpers used by preprocess_text.
ps = nltk.PorterStemmer()
stopwords_En = set(stopwords.words("english"))  # set => O(1) membership tests per token
lemmatizer = WordNetLemmatizer()
def preprocess_text(text, method="lemma"):
    """Normalise one message into a space-separated string of cleaned tokens.

    Steps, in order: drop every character found in ``string.punctuation``,
    lowercase what remains, tokenize, remove English stop words, then reduce
    each token with the Porter stemmer (``method="stem"``) or with the WordNet
    lemmatizer (any other value of ``method``).

    Because punctuation is stripped before the stop-word filter runs,
    contractions reach the filter without their apostrophe.

    Args:
        text: Raw message text.
        method: ``"stem"`` to stem; anything else lemmatizes.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Lowercase character by character while skipping punctuation.
    kept_chars = (ch.lower() for ch in text if ch not in string.punctuation)
    words = [w for w in word_tokenize("".join(kept_chars)) if w not in stopwords_En]

    # Pick the reducer once instead of branching around two comprehensions.
    reduce_word = ps.stem if method == "stem" else lemmatizer.lemmatize
    return " ".join(reduce_word(w) for w in words)
# Clean every message with preprocess_text's default mode and store the result
# next to the raw text; the frame is then displayed as the cell output.
data['cleaned_text'] = data['body_text'].apply(preprocess_text)
data

🔑 الخلاصة:

لو مشروع بسيط (تصنيف Spam/Not Spam مثلاً) → استخدم TF-IDF.

لو مشروع عايز معنى أعمق (Sentiment Analysis, Chatbot, Translation) → استخدم Word Embeddings.

In [None]:
# Bigram-only TF-IDF representation of the cleaned messages.
tfidf = TfidfVectorizer(ngram_range=(2, 2))

# Fit once; the result is a sparse matrix with one row per message.
features_tfidf = tfidf.fit_transform(data['cleaned_text'])

print(features_tfidf.shape)
print('Sparse Matrix :\n', features_tfidf)

# A sparse matrix carries no column labels, so none are attached to it here:
# column j corresponds to tfidf.get_feature_names_out()[j].
features_tfidf

In [None]:
import numpy as np

# Rank the features by their total TF-IDF weight over all messages and keep the 30
# heaviest, so the plot shows the "top" features named in its title rather than
# simply the first 30 columns of the vocabulary.
feature_names = tfidf.get_feature_names_out()
total_weight = np.asarray(features_tfidf.sum(axis=0)).ravel()  # one total per column
top_idx = np.argsort(total_weight)[::-1][:30]
top_names = feature_names[top_idx]

# Only these 30 columns are densified; the full matrix stays sparse.
sample_features = features_tfidf[:, top_idx].toarray()
features_df = pd.DataFrame(sample_features, columns=top_names)

# Pairwise correlation between the selected features (transpose: corrcoef treats rows as variables).
corr = np.corrcoef(features_df.T)

plt.figure(figsize=(12,8))
sns.heatmap(corr, cmap="coolwarm", xticklabels=top_names, yticklabels=top_names)
plt.title("Correlation Heatmap of Top 30 TF-IDF Features")
plt.show()


In [None]:
# Vectorizer used for modelling: unigrams through trigrams, capped at 5000 features.
# This rebinds `tfidf`, replacing whatever vectorizer the name held before.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=5000,
)
X = tfidf.fit_transform(data['cleaned_text'])  # feature matrix
y = data['label']                              # target labels

In [None]:
# Build X with the vectorizer currently bound to `tfidf`, so the models are trained on
# exactly the feature space of the vectorizer that is saved next to the model later on.
# Do not use `features_tfidf` here: it was produced by the earlier bigram-only
# vectorizer, which `tfidf` no longer refers to.
# NOTE(review): the vectorizer was fitted on all messages before this split, so its
# IDF statistics include the test rows.
X = tfidf.transform(data['cleaned_text'])
y = data['label']

# 80/20 split, stratified on the label so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# Candidate classifiers, keyed by the display name used in the reports and plots.
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=200),
    "SVM": LinearSVC(),
}

# Test-set accuracy per model name, filled in by the loop below.
results = {}

for name, model in models.items():
    # Train on the training split, then score on the held-out split.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"\n{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))


In [None]:
# Accuracy Comparison
model_names = list(results.keys())
accuracies = list(results.values())

fig, ax = plt.subplots(figsize=(6, 4))
# One viridis colour per bar, handed straight to matplotlib. Passing `palette` to
# sns.barplot without `hue` is deprecated in recent seaborn releases, and this form
# behaves the same across seaborn versions.
ax.bar(model_names, accuracies, color=sns.color_palette("viridis", len(model_names)))
ax.set_ylabel("Accuracy")
ax.set_title("Model Comparison")
plt.show()


In [None]:
# Fit a fresh Naive Bayes model and keep its predictions on the test split.
nb = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Rows are the actual labels, columns the predicted ones.
cm = confusion_matrix(y_test, y_pred_nb)
class_names = ["Ham", "Spam"]
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - Naive Bayes")
plt.show()

In [None]:
# Concatenate all cleaned messages of each class into one long string.
spam_words = " ".join(data.loc[data['label'] == 'spam', 'cleaned_text'])
ham_words = " ".join(data.loc[data['label'] == 'ham', 'cleaned_text'])

# One word cloud per class, with contrasting colour schemes.
spam_wc = WordCloud(width=800, height=400, background_color='black', colormap='Reds').generate(spam_words)
ham_wc = WordCloud(width=800, height=400, background_color='white', colormap='Blues').generate(ham_words)

# Draw the two clouds side by side: spam on the left, ham on the right.
plt.figure(figsize=(12,5))
panels = [(spam_wc, "Spam WordCloud"), (ham_wc, "Ham WordCloud")]
for position, (cloud, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(panel_title)
plt.show()


In [None]:
# Use the second probability column as the score for the positive class.
# NOTE(review): presumably this column is 'spam' — confirm against nb.classes_.
y_pred_prob = nb.predict_proba(X_test)[:, 1]

# roc_curve needs numeric targets: ham -> 0, spam -> 1.
y_test_binary = y_test.map({'ham':0,'spam':1})
fpr, tpr, thresholds = roc_curve(y_test_binary, y_pred_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
ax.plot([0,1],[0,1], color='navy', lw=2, linestyle="--")  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Naive Bayes")
ax.legend(loc="lower right")
plt.show()

In [None]:
# Linear SVM: it exposes no predict_proba, so the signed decision_function values
# are used as ranking scores for the ROC curve.
sv = LinearSVC().fit(X_train, y_train)
y_scores = sv.decision_function(X_test)

# roc_curve needs numeric targets: ham -> 0, spam -> 1.
y_test_binary = y_test.map({'ham':0,'spam':1})
fpr, tpr, thresholds = roc_curve(y_test_binary, y_scores)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label=f"SVM (AUC = {roc_auc:.2f})")
plt.plot([0,1],[0,1],'k--')  # diagonal reference line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - SVM")
plt.legend()
plt.show()


In [None]:
# Logistic Regression ROC curve. (`sv1` is kept as the variable name so any later
# reference to it keeps working, even though the model is not an SVM.)
sv1 = LogisticRegression(max_iter=200)
sv1.fit(X_train, y_train)

# Signed distances to the decision boundary serve as ranking scores.
y_scores = sv1.decision_function(X_test)

# roc_curve needs numeric targets: ham -> 0, spam -> 1.
fpr, tpr, thresholds = roc_curve(y_test.map({'ham':0,'spam':1}), y_scores)
roc_auc = auc(fpr, tpr)

# The legend entry names the model actually plotted (it used to say "SVM").
plt.plot(fpr, tpr, label=f"Logistic Regression (AUC = {roc_auc:.2f})")
plt.plot([0,1],[0,1],'k--')  # diagonal reference line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - logisticRegression")
plt.legend()
plt.show()


In [None]:
import joblib

# Train the model that gets persisted.
nb = MultinomialNB()
nb.fit(X_train, y_train)

# The two files are only usable together if the saved vectorizer produces exactly the
# columns the model was trained on. Comparing feature counts is a cheap check that
# catches a model trained on the output of a different vectorizer; refuse to write a
# mismatched pair instead of failing later at prediction time.
n_model_features = nb.n_features_in_
n_vectorizer_features = len(tfidf.vocabulary_)
if n_model_features != n_vectorizer_features:
    raise ValueError(
        f"Model expects {n_model_features} features but the vectorizer produces "
        f"{n_vectorizer_features}; X_train was not built with this `tfidf`."
    )

joblib.dump(nb, "spam_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")


