In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Load the labelled messages, then report the frame's size and class balance.
df = pd.read_csv('messages.csv')
print(df.shape)
category_counts = df['Category'].value_counts()
print(category_counts)
df.head()

(5572, 2)
Category
ham     4825
spam     747
Name: count, dtype: int64


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Improved clean_text function
_ENGLISH_STOP_WORDS = None  # lazily-filled cache of NLTK's English stop words


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        try:
            words = stopwords.words('english')
        except LookupError:
            # The corpus is not installed (e.g. a fresh environment):
            # download it once and retry instead of failing the whole run.
            nltk.download('stopwords', quiet=True)
            words = stopwords.words('english')
        _ENGLISH_STOP_WORDS = frozenset(words)
    return _ENGLISH_STOP_WORDS


def clean_text(text, stop_words=None):
    """Normalise one message for bag-of-words style feature extraction.

    Steps, in order: lowercase, drop every character that is neither a word
    character nor whitespace, drop digit runs, collapse whitespace, and
    remove stop words.

    Args:
        text: The raw message as a ``str``.
        stop_words: Optional container of lowercase words to drop. When
            omitted, NLTK's English stop-word list is used; it is loaded once
            and cached rather than rebuilt on every call, which matters when
            this function is applied to every row of a DataFrame.

    Returns:
        The cleaned message: the remaining words joined by single spaces
        (an empty string if nothing is left).

    Notes:
        ``\\w`` matches the underscore, so underscores survive the
        punctuation filter.
        NOTE(review): punctuation is stripped before stop-word filtering, so
        an apostrophe-bearing word such as "don't" becomes "dont" and is only
        removed if "dont" itself is in ``stop_words`` - confirm this is the
        intended behaviour.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers (optional, helps focus on words)
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace and tabs
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove stop words
    kept = [word for word in text.split() if word not in stop_words]
    return ' '.join(kept)


# Apply cleaning
# Replace the raw text with its cleaned form; later cells read df['Message'].
cleaned_messages = df['Message'].map(clean_text)
df['Message'] = cleaned_messages
df.head()


Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [4]:

# Convert labels: ham=0, spam=1
# Values other than 'ham'/'spam' are passed through unchanged, so re-running
# this cell once the column is already numeric keeps the 0/1 labels instead of
# turning every row into NaN (which a plain dict-based .map() would do).
label_codes = {'ham': 0, 'spam': 1}
df['Category'] = df['Category'].map(lambda label: label_codes.get(label, label))

# Hold out 20% of the messages for testing; the fixed seed makes the split repeatable.
X = df['Message']
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features over 1- to 3-word n-grams, capped at 1000 features.
# The vectorizer is fitted on the training split only and then reused,
# unchanged, to transform the test split.
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 3))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)

# Score it on the held-out split
predictions1 = nb.predict(X_test_vec)
accuracy = accuracy_score(y_test, predictions1)
nb_report = classification_report(
    y_test,
    predictions1,
    target_names=['ham', 'spam'],
)

print("Naive-Bayes Multinomial")
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(nb_report)



Naive-Bayes Multinomial
Accuracy: 97.76%

Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.98      0.85      0.91       149

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import time


# Hyperparameter tuning of Logistic Regression with GridSearchCV.
# The grid holds a single candidate for C, so the search amounts to a 5-fold
# cross-validated fit at that value, scored with macro-averaged F1.
param_grid = {'C': [30]}
lr = GridSearchCV(
    estimator=LogisticRegression(class_weight='balanced', max_iter=1000),
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
)

# The timer below spans both the grid-search fit and the test-set prediction.
t = time.time()
lr.fit(X_train_vec, y_train)
print(f"Best C: {lr.best_params_['C']}")

# Evaluate model
predictions2 = lr.predict(X_test_vec)
print("time:", time.time() - t)

accuracy = accuracy_score(y_test, predictions2)
lr_report = classification_report(
    y_test,
    predictions2,
    target_names=['ham', 'spam'],
)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(lr_report)

Best C: 30
time: 0.18467307090759277
Accuracy: 97.58%

Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.98      0.99       966
        spam       0.90      0.93      0.91       149

    accuracy                           0.98      1115
   macro avg       0.94      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [6]:
# Conservative ensemble: a message is labelled spam (1) only when BOTH models
# predict spam; if either model says ham (0), the final label is ham.
#
# predictions1 holds the Naive Bayes predictions and predictions2 the Logistic
# Regression predictions, so the loop variables are named in that order. They
# are also deliberately different from `nb` and `lr`: using those names as
# loop targets would overwrite the fitted models with plain integers and break
# any later `nb.predict(...)` / `lr.predict(...)` call.
final_preds = [
    1 if (nb_pred != 0 and lr_pred != 0) else 0
    for nb_pred, lr_pred in zip(predictions1, predictions2)
]

print("\nClassification Report:")
print(classification_report(y_test, final_preds, target_names=['ham', 'spam']))


Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [20]:
# Show the working-tree state of the git repository this notebook lives in.
!git status


On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [21]:
# Register the GitHub remote as `origin` and rename the current branch to `main`.
# NOTE(review): the recorded output shows `git remote add` failing because
# `origin` already exists, so this cell is not re-runnable as written.
!git remote add origin https://github.com/abrajput22/spam-message-classification.git
!git branch -M main


error: remote origin already exists.


In [28]:
# List the configured remotes together with their fetch and push URLs.
!git remote -v


origin	https://github.com/abrajput22/spam-message-classification.git (fetch)
origin	https://github.com/abrajput22/spam-message-classification.git (push)
