In [1]:
# ---------------------------------------------------------------
# Logistic Regression Spam Classifier
# Dataset: SPAM text message 20170820 - Data.csv
# ---------------------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer

# ---------------------------------------------------------------
# 1. LOAD DATASET
# ---------------------------------------------------------------
# Location of the CSV file uploaded to this environment.
DATA_PATH = "/content/sample_data/Spam_Message.csv"

# Read the CSV as latin1 text and keep only the first two columns;
# any further (unnamed) columns in the file are discarded.
df = pd.read_csv(DATA_PATH, encoding="latin1").iloc[:, :2]

# Give the two remaining columns stable names used by the rest of the script.
df.columns = ["label", "message"]

# ---------------------------------------------------------------
# 2. BASIC PREPROCESSING
# (No NLP or sentiment analysis libraries used)
# ---------------------------------------------------------------

# Convert spam/ham labels to numeric (spam -> 1, ham -> 0).
# Labels are normalized (trimmed, lower-cased) first so that variants such
# as "Spam" or " ham " still map instead of silently becoming NaN.
df["label"] = (
    df["label"].astype(str).str.strip().str.lower().map({"spam": 1, "ham": 0})
)

# Rows with an unrecognized label or a missing message cannot be used for
# training. Drop them up front and report how many were removed, rather
# than failing further down with an unrelated-looking error.
valid_rows = df["label"].notna() & df["message"].notna()
if not valid_rows.all():
    print("Dropping rows with missing label/message:", int((~valid_rows).sum()))
    df = df[valid_rows].copy()
    # map() produced floats because of the NaNs; restore integer labels.
    df["label"] = df["label"].astype("int64")

y = df["label"]

# ---------------------------------------------------------------
# 3. TRAIN-TEST SPLIT
# ---------------------------------------------------------------
# Split the RAW text before vectorizing. Fitting the vectorizer on the
# full dataset would let test-set vocabulary leak into the features and
# make the evaluation optimistic.
messages_train, messages_test, y_train, y_test = train_test_split(
    df["message"], y, test_size=0.20, random_state=42, stratify=y
)

# --- Vectorize text using basic Bag-of-Words (allowed, not NLP) ---
# The vocabulary is learned from the training messages only; the test
# messages are merely transformed with that vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(messages_train)
X_test = vectorizer.transform(messages_test)

# Full-dataset feature matrix, kept under its original name for any later
# code that still refers to X. It uses the training vocabulary.
X = vectorizer.transform(df["message"])

print("Training samples :", X_train.shape[0])
print("Testing samples  :", X_test.shape[0])

# ---------------------------------------------------------------
# 4. TRAIN LOGISTIC REGRESSION MODEL
# ---------------------------------------------------------------
# fit() returns the fitted estimator, so construction and training are
# chained. max_iter=2000 caps the number of solver iterations.
# NOTE(review): the recorded run below shows lower recall on the spam
# class than on ham; class_weight="balanced" may be worth evaluating.
model = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# ---------------------------------------------------------------
# 5. EVALUATION
# ---------------------------------------------------------------
# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

# Headline accuracy, framed by separator lines.
print("\n------------------------------")
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("------------------------------")

# The remaining reports share one shape: a heading, then the metric
# computed from (y_test, y_pred).
for heading, metric in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, y_pred))

# ---------------------------------------------------------------
# 6. TEST ON SAMPLE INPUT
# ---------------------------------------------------------------
# Hand-written messages used as a quick sanity check of the trained model.
sample_messages = [
    "Congratulations! You won a free voucher",
    "Let's meet for lunch tomorrow",
    "URGENT! Your account has been suspended",
]

# Reuse the already-fitted vectorizer (transform only, no refit) so the
# samples are encoded with the same vocabulary the model was trained on.
sample_features = vectorizer.transform(sample_messages)
sample_pred = model.predict(sample_features)

print("\nSample Predictions:")
for idx, msg in enumerate(sample_messages):
    # Label 1 means spam; anything else is reported as not spam.
    verdict = "SPAM" if sample_pred[idx] == 1 else "NOT SPAM"
    print(f"Message: {msg} --> {verdict}")


Training samples : 4457
Testing samples  : 1115

------------------------------
Model Accuracy: 0.9802690582959641
------------------------------

Confusion Matrix:
[[966   0]
 [ 22 127]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Sample Predictions:
Message: Congratulations! You won a free voucher --> NOT SPAM
Message: Let's meet for lunch tomorrow --> NOT SPAM
Message: URGENT! Your account has been suspended --> NOT SPAM
