In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # model evaluation metrics
import nltk  # natural language toolkit
from nltk.corpus import stopwords #provide set of common stopwords
from nltk.stem import WordNetLemmatizer #using the WordNet lexical database to determine the lemma of a word
import re  # regular experession, for search & pattern matching, manupulation of strings

In [2]:
# Download NLTK resources
nltk.download('stopwords') # Downloads the "stopwords" dataset from NLTK for use in text preprocessing
nltk.download('wordnet') # Downloads the "wordnet" dataset from NLTK for use in lemmatization and lexical analysis.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# Load the dataset (Update the file path to the actual Kaggle dataset file)
file_path = "spam.csv"  # Replace with the actual dataset file
data = pd.read_csv(file_path, encoding='latin-1') # Reads a CSV file into a pandas DataFrame, using 'latin-1' encoding to interprete special characters correctly.


In [5]:
# Inspect the dataset
print(data.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [6]:
# Rename columns for clarity
data = data.rename(columns={"v1": "label", "v2": "message"})
data = data[["label", "message"]]

In [7]:
# Encode the labels (ham = 0, spam = 1)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [8]:
# Text preprocessing function
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = text.lower()  # Convert to lowercase
    text = text.split()  # Tokenize- break down text in to smaller unit-tokens- these can be words/characters/phrases depending on use case.
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text if word not in stopwords.words('english')]
    return ' '.join(text)

In [9]:
# Apply preprocessing
import nltk
nltk.download('omw-1.4') # Downloads extra data for WordNet to support multiple languages in lemmatization.(Open Multilingual WordNet version 1.4)
data['message'] = data['message'].apply(preprocess_text)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [10]:

X = data['message']  # Split the dataset into training and testing sets
y = data['label']

vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42, stratify=y)   # Splitting data into train and test sets

In [11]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [12]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


nb = MultinomialNB()                                  # Multinomial Naive Bayes
nb.fit(X_train_resampled, y_train_resampled)
y_pred_nb = nb.predict(X_test)

acc_nb = accuracy_score(y_test, y_pred_nb)        # Metrics
report_nb = classification_report(y_test, y_pred_nb)
cm_nb = confusion_matrix(y_test, y_pred_nb)

print("Multinomial Naive Bayes:")
print(f"Accuracy: {acc_nb}")
print(f"Classification Report:\n{report_nb}")
print(f"Confusion Matrix:\n{cm_nb}")


Multinomial Naive Bayes:
Accuracy: 0.9730941704035875
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       966
           1       0.87      0.93      0.90       149

    accuracy                           0.97      1115
   macro avg       0.93      0.96      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix:
[[946  20]
 [ 10 139]]


In [13]:
from sklearn.linear_model import LogisticRegression


lr = LogisticRegression(max_iter=1000)            # Logistic Regression
lr.fit(X_train_resampled, y_train_resampled)
y_pred_lr = lr.predict(X_test)


acc_lr = accuracy_score(y_test, y_pred_lr)              # Metrics
report_lr = classification_report(y_test, y_pred_lr)
cm_lr = confusion_matrix(y_test, y_pred_lr)

print("Logistic Regression:")
print(f"Accuracy: {acc_lr}")
print(f"Classification Report:\n{report_lr}")
print(f"Confusion Matrix:\n{cm_lr}")


Logistic Regression:
Accuracy: 0.9147982062780269
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.92      0.95       966
           1       0.62      0.91      0.74       149

    accuracy                           0.91      1115
   macro avg       0.80      0.91      0.85      1115
weighted avg       0.94      0.91      0.92      1115

Confusion Matrix:
[[884  82]
 [ 13 136]]


In [14]:
from sklearn.svm import SVC


svm = SVC(probability=True)                            # Support Vector Machine
svm.fit(X_train_resampled, y_train_resampled)
y_pred_svm = svm.predict(X_test)


acc_svm = accuracy_score(y_test, y_pred_svm)                   # Metrics
report_svm = classification_report(y_test, y_pred_svm)
cm_svm = confusion_matrix(y_test, y_pred_svm)

print("Support Vector Machine:")
print(f"Accuracy: {acc_svm}")
print(f"Classification Report:\n{report_svm}")
print(f"Confusion Matrix:\n{cm_svm}")


Support Vector Machine:
Accuracy: 0.8753363228699551
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.90      0.93       966
           1       0.53      0.70      0.60       149

    accuracy                           0.88      1115
   macro avg       0.74      0.80      0.76      1115
weighted avg       0.89      0.88      0.88      1115

Confusion Matrix:
[[872  94]
 [ 45 104]]


In [15]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(n_estimators=100, random_state=42)  # Random Forest
rf.fit(X_train_resampled, y_train_resampled)
y_pred_rf = rf.predict(X_test)


acc_rf = accuracy_score(y_test, y_pred_rf)                      # Metrics
report_rf = classification_report(y_test, y_pred_rf)
cm_rf = confusion_matrix(y_test, y_pred_rf)

print("Random Forest:")
print(f"Accuracy: {acc_rf}")
print(f"Classification Report:\n{report_rf}")
print(f"Confusion Matrix:\n{cm_rf}")


Random Forest:
Accuracy: 0.8896860986547085
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.88      0.93       966
           1       0.55      0.93      0.69       149

    accuracy                           0.89      1115
   macro avg       0.77      0.91      0.81      1115
weighted avg       0.93      0.89      0.90      1115

Confusion Matrix:
[[854 112]
 [ 11 138]]


In [16]:
from sklearn.tree import DecisionTreeClassifier


dt = DecisionTreeClassifier(random_state=42)  # Decision Tree
dt.fit(X_train_resampled, y_train_resampled)
y_pred_dt = dt.predict(X_test)


acc_dt = accuracy_score(y_test, y_pred_dt)               # Metrics
report_dt = classification_report(y_test, y_pred_dt)
cm_dt = confusion_matrix(y_test, y_pred_dt)

print("Decision Tree:")
print(f"Accuracy: {acc_dt}")
print(f"Classification Report:\n{report_dt}")
print(f"Confusion Matrix:\n{cm_dt}")



Decision Tree:
Accuracy: 0.852914798206278
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.84      0.91       966
           1       0.47      0.91      0.62       149

    accuracy                           0.85      1115
   macro avg       0.73      0.88      0.77      1115
weighted avg       0.92      0.85      0.87      1115

Confusion Matrix:
[[816 150]
 [ 14 135]]


In [17]:

results = {                                                  # Store results for each model based on the metrics
    'Multinomial Naive Bayes': {
        'accuracy': 0.9730941704035875,
        'precision_spam': 0.87,
        'recall_spam': 0.93,
        'f1_score_spam': 0.90,
    },
    'Logistic Regression': {
        'accuracy': 0.9147982062780269,
        'precision_spam': 0.62,
        'recall_spam': 0.91,
        'f1_score_spam': 0.74,
    },
    'Support Vector Machine': {
        'accuracy': 0.8753363228699551,
        'precision_spam': 0.53,
        'recall_spam': 0.70,
        'f1_score_spam': 0.60,
    },
    'Random Forest': {
        'accuracy': 0.8896860986547085,
        'precision_spam': 0.55,
        'recall_spam': 0.93,
        'f1_score_spam': 0.69,
    },
    'Decision Tree': {
        'accuracy': 0.852914798206278,
        'precision_spam': 0.47,
        'recall_spam': 0.91,
        'f1_score_spam': 0.62,
    },
    'Old Model': {
        'accuracy': 0.9659192825112107,
        'precision_spam': 1.00,
        'recall_spam': 0.75,
        'f1_score_spam': 0.85,
    }
}



In [18]:

def select_best_model(results):  # Define a custom function to select the best model based on priority of metrics

    metrics_priority = ['recall_spam', 'precision_spam', 'f1_score_spam', 'accuracy'] # Set priority for metrics (you can adjust this based on your preference)
    best_model = None   # Initialize variables to track the best model and its score
    best_score = -float('inf')

    for model_name, metrics in results.items():
        score = 0

        for metric in metrics_priority:  # Calculate the score based on prioritized metrics
            score += metrics[metric]


        if score > best_score:
            best_score = score
            best_model = model_name  # Update the best model if the current model has a higher score

    return best_model


best_model = select_best_model(results) # Get the best model based on evaluation metrics


print(f"The best model is: {best_model}") # Print the name of the best model

The best model is: Multinomial Naive Bayes


In [19]:
# Display the metrics for the best model
print("\nMetrics for the best model:")
print(f"Accuracy: {results[best_model]['accuracy']}")
print(f"Precision (Spam): {results[best_model]['precision_spam']}")
print(f"Recall (Spam): {results[best_model]['recall_spam']}")
print(f"F1-Score (Spam): {results[best_model]['f1_score_spam']}")


Metrics for the best model:
Accuracy: 0.9730941704035875
Precision (Spam): 0.87
Recall (Spam): 0.93
F1-Score (Spam): 0.9


In [20]:

def predict_message(message):  # Function to preprocess the new message and predict if it is spam or ham

    preprocessed_message = preprocess_text(message)  # Preprocess the message (remove non-word characters, lowercase, remove stopwords, and lemmatize)


    message_vectorized = vectorizer.transform([preprocessed_message])  # Vectorize the preprocessed message using the trained vectorizer


    prediction = nb.predict(message_vectorized)  # Predict the label using the trained Naive Bayes model


    return "Spam" if prediction == 1 else "Ham"  # Return the prediction (0 = Ham, 1 = Spam)


message1 = "Congratulations! You have won a $1000 gift card. Please reply with your details."   # Example 1
print(f"Message: {message1} - Prediction: {predict_message(message1)}")
message2 = "let's discuss over Teams about project."                               # Example 2
print(f"Message: {message2} - Prediction: {predict_message(message2)}")


Message: Congratulations! You have won a $1000 gift card. Please reply with your details. - Prediction: Spam
Message: let's discuss over Teams about project. - Prediction: Ham
