In [25]:
# Cell 1: Imports (nothing is downloaded here; Cell 2 reads the dataset from a local CSV file)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib
import warnings
warnings.filterwarnings("ignore")

print("Libraries loaded!")

Libraries loaded!


In [26]:
# Cell 2: Load the label and message columns from combined_data.csv

# Path of the input file, relative to the notebook's working directory.
DATA_PATH = "combined_data.csv"

df = pd.read_csv(
    DATA_PATH,
    encoding="latin-1",          # encoding the original author found necessary for this file
    usecols=[0, 1],              # keep only column 0 (label) and column 1 (message text)
    names=['label', 'text'],     # replace the file's own header names
    header=0,                    # row 0 is the original header, so it is discarded
    engine="python"
)

# Quick clean-up.
# Missing texts must be dropped BEFORE the cast: astype(str) turns NaN into the
# literal string "nan", which would pass the length filter and make a later
# dropna a no-op.
df = (
    df.dropna(subset=['text'])
      .assign(text=lambda frame: frame['text'].astype(str).str.strip())
      .loc[lambda frame: frame['text'].str.len() > 0]
)

print(f"LOADED {len(df)} messages successfully!")
print("\nFirst 5 rows:")
print(df.head())
print("\nLabel distribution before mapping:")
print(df['label'].value_counts())

LOADED 5572 messages successfully!

First 5 rows:
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Label distribution before mapping:
label
ham     4825
spam     747
Name: count, dtype: int64


In [27]:
# Cell 3: Auto-detect the text/label columns, clean the texts and encode labels as 0/1

# Column names used by common spam datasets
text_cols   = ['text', 'message', 'Message', 'v2', 'email', 'sms', 'Email Text', 'message_body']
label_cols  = ['label', 'category', 'Category', 'v1', 'spam', 'type', 'Label']


def _find_column(columns, candidates, exclude=None):
    """Return the column that best matches `candidates`, or None if nothing matches.

    Matching is case-insensitive and ignores surrounding whitespace. An exact
    name match is preferred; only when no column matches exactly is a substring
    match accepted. `exclude` names a column that must not be returned, so one
    column can never be picked for two different roles.
    """
    wanted = [name.lower() for name in candidates]
    # str() keeps this working when column names are not strings (e.g. integers).
    normalised = [(col, str(col).strip().lower()) for col in columns if col != exclude]

    for col, lowered in normalised:
        if lowered in wanted:
            return col
    for col, lowered in normalised:
        if any(name in lowered for name in wanted):
            return col
    return None


text_col  = _find_column(df.columns, text_cols)
label_col = _find_column(df.columns, label_cols, exclude=text_col)

if text_col is None or label_col is None:
    print("Could not auto-detect columns!")
    print("Available columns:", df.columns.tolist())
    print("Please rename them manually to 'text' and 'label'")
else:
    print(f"Detected → Text column: '{text_col}' | Label column: '{label_col}'")

    # Select the two columns first and rename afterwards, so an unrelated
    # column that already happens to be called 'text' or 'label' cannot
    # produce duplicate column names.
    # Missing texts are dropped before astype(str); otherwise NaN becomes the
    # string "nan" and is kept as if it were a real message.
    df = (
        df[[text_col, label_col]]
        .rename(columns={text_col: 'text', label_col: 'label'})
        .dropna(subset=['text'])
        .assign(text=lambda frame: frame['text'].astype(str).str.strip())
        .loc[lambda frame: frame['text'].str.len() > 0]
    )

    # Normalise label spellings, then map them to 1 (spam) / 0 (ham).
    # '1.0' / '0.0' cover numeric labels that pandas loaded as floats.
    label_map = {
        'spam': 1, 'ham': 0,
        '1': 1, '0': 0,
        '1.0': 1, '0.0': 0,
        'yes': 1, 'no': 0,
        'true': 1, 'false': 0
    }
    label_text = df['label'].astype(str).str.lower().str.strip()
    label_codes = label_text.map(label_map)

    # Labels outside the map become NaN and their rows are dropped; report it
    # instead of shrinking the dataset silently.
    unmapped_count = int(label_codes.isna().sum())
    if unmapped_count:
        examples = label_text[label_codes.isna()].unique()[:5].tolist()
        print(f"Dropping {unmapped_count} rows with unrecognised labels, e.g. {examples}")

    df = (
        df.assign(label=label_codes)
          .dropna(subset=['label'])
          .astype({'label': int})
    )

    print(f"\nCLEANING DONE!")
    print(f"Final dataset shape : {df.shape}")
    print(f"Spam  messages      : {df['label'].sum()}")
    print(f"Ham   messages      : {len(df) - df['label'].sum()}")
    print("\nFirst 5 rows:")
    print(df.head())

Detected → Text column: 'text' | Label column: 'label'

CLEANING DONE!
Final dataset shape : (5572, 2)
Spam  messages      : 747
Ham   messages      : 4825

First 5 rows:
                                                text  label
0  Go until jurong point, crazy.. Available only ...      0
1                      Ok lar... Joking wif u oni...      0
2  Free entry in 2 a wkly comp to win FA Cup fina...      1
3  U dun say so early hor... U c already then say...      0
4  Nah I don't think he goes to usf, he lives aro...      0


In [28]:
# Cell 4: Fit a TF-IDF + logistic-regression pipeline and score it on a 20% hold-out

X, y = df['text'], df['label']

# Stratify on the label so both splits keep the same spam/ham ratio;
# the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Features: lower-cased unigrams and bigrams with English stop words removed,
# limited to at most 50,000 vocabulary terms.
tfidf_step = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    max_features=50000,
)
clf_step = LogisticRegression(C=1.0, max_iter=1000)

# NOTE(review): the recorded run below reports spam recall of 0.71 on an
# imbalanced dataset — evaluating class_weight='balanced' may be worthwhile.
model = Pipeline(steps=[('tfidf', tfidf_step), ('clf', clf_step)])

print("Training model...")
model.fit(X_train, y_train)

# Score the held-out messages
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

accuracy_line = f"\nAccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)"
print(accuracy_line)
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred, target_names=['Ham', 'Spam'])
print(report_text)

Training model...

Accuracy: 0.9605 (96.05%)

Classification Report:
              precision    recall  f1-score   support

         Ham       0.96      1.00      0.98       966
        Spam       0.99      0.71      0.83       149

    accuracy                           0.96      1115
   macro avg       0.97      0.86      0.90      1115
weighted avg       0.96      0.96      0.96      1115



In [29]:
# Cell 5: Persist the fitted pipeline, then sanity-check it on a few hand-written messages

# joblib files are pickle-based: only load this file back from a trusted location.
MODEL_PATH = 'spam_classifier_model.pkl'
joblib.dump(model, MODEL_PATH)
print(f"Model saved as '{MODEL_PATH}'")

# Hand-written examples covering both obvious spam and ordinary messages
test_messages = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts!",
    "Hey, are you free tonight for dinner?",
    "Congratulations! You've won $1000. Click here to claim",
    "Can we reschedule the meeting to 3pm?",
    "URGENT: Your account has been compromised. Login now"
]

# Score every message in one batch per call, rather than running the whole
# pipeline twice (predict + predict_proba) for each individual message.
predicted_labels = model.predict(test_messages)
# "Confidence" is the probability the model assigns to its predicted class,
# i.e. the largest entry of each predict_proba row.
confidences = model.predict_proba(test_messages).max(axis=1)

print("\nLive Testing:")
for msg, pred, prob in zip(test_messages, predicted_labels, confidences):
    result = "SPAM" if pred == 1 else "NOT SPAM"
    print(f"→ '{msg}'")
    print(f"   {result} ({prob:.1%} confidence)\n")

Model saved as 'spam_classifier_model.pkl'

Live Testing:
→ 'Free entry in 2 a wkly comp to win FA Cup final tkts!'
   NOT SPAM (63.1% confidence)

→ 'Hey, are you free tonight for dinner?'
   NOT SPAM (89.4% confidence)

→ 'Congratulations! You've won $1000. Click here to claim'
   SPAM (51.5% confidence)

→ 'Can we reschedule the meeting to 3pm?'
   NOT SPAM (94.3% confidence)

→ 'URGENT: Your account has been compromised. Login now'
   NOT SPAM (73.5% confidence)

