<a href="https://colab.research.google.com/github/amanpawade/Spam_SMS/blob/main/SMS_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount Google Drive (the dataset is read from `/content/drive/MyDrive/Spam_SMS_Detection/`)


In [None]:
# Colab-only step: attach Google Drive at /content/drive so that files stored
# there (the dataset path used later lives under /content/drive/MyDrive) are
# visible to this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


 Load and preprocess data


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib


def load_data(filepath, encoding='latin-1'):
    """Read the SMS dataset from ``filepath`` and return a two-column frame.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file to read. (Previously this argument was
        ignored and a hard-coded Drive path was always read.)
    encoding : str, default 'latin-1'
        Text encoding handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` (0 for 'ham', 1 for 'spam') and ``message``.
        Label values other than 'ham'/'spam' are mapped to NaN by
        ``Series.map``.

    Raises
    ------
    ValueError
        If the CSV has neither the ('label', 'message') nor the ('v1', 'v2')
        column pair.
    """
    data = pd.read_csv(filepath, encoding=encoding)

    # Echo the header so a schema mismatch is easy to spot in the cell output.
    print("\nColumns in CSV:", data.columns)

    if 'label' in data.columns and 'message' in data.columns:
        # Explicit copy: the label column is reassigned below, and writing
        # into a slice of the raw frame can raise SettingWithCopy warnings.
        data = data[['label', 'message']].copy()
    elif 'v1' in data.columns and 'v2' in data.columns:
        # Alternate header naming: v1 holds the class, v2 the message text.
        data = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})
    else:
        raise ValueError("Unexpected column names. Check the CSV file.")

    # Encode the textual class as an integer target.
    data['label'] = data['label'].map({'ham': 0, 'spam': 1})

    return data

Text vectorization

In [24]:
def vectorize_text(X_train, X_test):
    """Turn raw message text into TF-IDF feature matrices.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``, so the test split does not influence the vocabulary.

    Returns
    -------
    tuple
        ``(train_features, test_features, fitted_vectorizer)``.
    """
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=3000,
        stop_words='english',
    )
    train_features = tfidf.fit_transform(X_train)
    test_features = tfidf.transform(X_test)
    return train_features, test_features, tfidf

Train and evaluate model

In [25]:
def train_and_evaluate(X_train, y_train, X_test, y_test):
    """Fit each candidate classifier, print its metrics, and return the best.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to each model's ``fit``.
    X_test, y_test
        Held-out features and labels used for the printed metrics and for
        picking the winner.

    Returns
    -------
    The fitted model with the highest accuracy on ``(X_test, y_test)``.
    On a tie the model evaluated first is kept.

    Notes
    -----
    The winner is chosen on the same split whose accuracy is reported, so
    that number is an optimistic estimate of the chosen model's accuracy.
    """
    models = {
        "Naive Bayes": MultinomialNB(),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
    }

    best_model = None
    # Start below any attainable accuracy. With the previous initial value of
    # 0 and a strict ">" comparison, a run in which every model scored 0.0
    # returned None, which the caller would then try to persist.
    best_accuracy = float("-inf")

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        print(f"\n{name} Model Performance:")
        print(f"Accuracy: {accuracy:.4f}")
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))

        # Strict comparison keeps the earlier model when accuracies tie.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model

    return best_model

Saving and loading model

In [26]:
def save_model(model, vectorizer, model_path="spam_model.pkl", vec_path="vectorizer.pkl"):
    """Persist the classifier and its vectorizer with joblib.

    The model is written to ``model_path`` first, then the vectorizer to
    ``vec_path``; a confirmation line is printed afterwards.
    """
    artifacts = ((model, model_path), (vectorizer, vec_path))
    for obj, destination in artifacts:
        joblib.dump(obj, destination)
    print("\nModel and Vectorizer saved successfully!")

def load_model(model_path="spam_model.pkl", vec_path="vectorizer.pkl"):
    """Load a previously saved classifier and vectorizer.

    Returns
    -------
    tuple
        ``(model, vectorizer)`` read from ``model_path`` and ``vec_path``.

    Notes
    -----
    ``joblib.load`` is pickle-based and can execute arbitrary code while
    loading; only point this at files you trust.
    """
    return joblib.load(model_path), joblib.load(vec_path)

 Predict function

In [27]:
def predict_sms(message, model, vectorizer):
    """Classify one SMS string.

    The message is wrapped in a one-element list, transformed by
    ``vectorizer``, and passed to ``model.predict``.

    Returns
    -------
    str
        ``"Spam"`` when the first predicted label equals 1, otherwise
        ``"Not Spam"``.
    """
    features = vectorizer.transform([message])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Not Spam"

Main Execution

In [28]:
# End-to-end pipeline: load -> split -> vectorize -> train/evaluate -> persist -> reload.
# The names bound here (notably `model` and `vectorizer`) are module-level and
# are used by the interactive classification cell further down.
if __name__ == "__main__":

    # CSV location on the mounted Drive.
    file_path = "/content/drive/MyDrive/Spam_SMS_Detection/spam.csv"

    # DataFrame with a numeric 'label' column and a 'message' text column.
    data = load_data(file_path)

    # Hold out 20% of the rows for evaluation; random_state fixes the split.
    X_train, X_test, y_train, y_test = train_test_split(data['message'], data['label'], test_size=0.2, random_state=42)

    # TF-IDF features; the vectorizer is fitted on the training text only.
    X_train_tfidf, X_test_tfidf, vectorizer = vectorize_text(X_train, X_test)

    # Prints per-model metrics and returns the most accurate fitted model.
    best_model = train_and_evaluate(X_train_tfidf, y_train, X_test_tfidf, y_test)

    # Write both artifacts to the default .pkl paths in the working directory.
    save_model(best_model, vectorizer)

    # Reload from disk so later cells use exactly what was persisted.
    model, vectorizer = load_model()




Columns in CSV: Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

Naive Bayes Model Performance:
Accuracy: 0.9731
[[965   0]
 [ 30 120]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.80      0.89       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115


Random Forest Model Performance:
Accuracy: 0.9776
[[960   5]
 [ 20 130]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.96      0.87      0.91       150

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Model and Vectorizer saved successfully!


Classify user-entered SMS messages as spam or not spam (type 'exit' to stop)

In [29]:
# Interactive prompt: classify each entered message until the user types
# 'exit' (case-insensitive, surrounding whitespace ignored).
while True:
    user_message = input("\nEnter an SMS to classify (or type 'exit' to quit): ").strip()

    if user_message.lower() != 'exit':
        prediction = predict_sms(user_message, model, vectorizer)
        print(f"Prediction: {prediction}")
        continue

    print("Exiting...")
    break


Enter an SMS to classify (or type 'exit' to quit): Hi 
Prediction: Not Spam

Enter an SMS to classify (or type 'exit' to quit): Congratulations! You've won a $1000 gift card. Click here to claim your prize:
Prediction: Spam

Enter an SMS to classify (or type 'exit' to quit): exit
Exiting...
