In [8]:
# =========================================
# Spam Detection: Combine CSVs & Preprocess
# =========================================

# Step 1: Import libraries
import pandas as pd
import glob
from zipfile import ZipFile
import string

# =========================================
# Step 2: Extract ZIP file
# =========================================
zip_path = "/content/archive (2).zip"  # path to your uploaded zip
extract_folder = "/content/spam_data"  # folder to extract CSVs

with ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

print("ZIP extracted successfully!")

# =========================================
# Step 3: Read and combine all CSVs
# =========================================
all_files = glob.glob(extract_folder + "/*.csv")
df_list = []

for file in all_files:
    df = pd.read_csv(file, encoding='latin-1')

    # Standardize text column
    if 'text_combined' in df.columns:
        df['text'] = df['text_combined']
    elif 'subject' in df.columns and 'body' in df.columns:
        df['text'] = df['subject'].fillna('') + ' ' + df['body'].fillna('')
    elif 'body' in df.columns:
        df['text'] = df['body']
    else:
        continue  # skip if no usable text column

    # Standardize label column
    if 'label' in df.columns:
        # Convert all labels to string, strip spaces, lowercase
        df['label'] = df['label'].astype(str).str.strip().str.lower()
        # Map numeric or other variants to 'spam'/'ham'
        df['label'] = df['label'].replace({
            '1': 'spam', '0': 'ham',
            's': 'spam', 'h': 'ham'
        })
    else:
        continue  # skip if no label

    # Keep only rows with 'spam' or 'ham'
    df = df[df['label'].isin(['spam', 'ham'])]
    df = df[['label', 'text']]  # keep only relevant columns
    df_list.append(df)

# Combine all CSVs
data = pd.concat(df_list, ignore_index=True)
print("Combined dataset shape after filtering:", data.shape)

# =========================================
# Step 4: Drop missing text
# =========================================
data = data.dropna(subset=['text']).copy()
print("After dropping missing text:", data.shape)

# =========================================
# Step 5: Text preprocessing
# =========================================
def preprocess_text(text):
    text = str(text).lower()  # lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    return text

data['text'] = data['text'].apply(preprocess_text)

# =========================================
# Step 6: Map labels to numeric for ML
# =========================================
data['label'] = data['label'].map({'ham':0, 'spam':1})

# =========================================
# Step 7: Check sample
# =========================================
print("Sample preprocessed data:")
print(data.head())
print("\nTotal messages in dataset:", len(data))
print("Label distribution:\n", data['label'].value_counts())


ZIP extracted successfully!
Combined dataset shape after filtering: (164972, 2)
After dropping missing text: (164972, 2)
Sample preprocessed data:
   label                                               text
0      1  dont delete this message  folder internal data...
1      1  verify your account business with  \t\t\t\t\t\...
2      1  helpdesk mailbox alert your two incoming mails...
3      1  itservice help desk password will expire in 3 ...
4      1  final usaa reminder  update your account now t...

Total messages in dataset: 164972
Label distribution:
 label
1    85782
0    79190
Name: count, dtype: int64


In [9]:
# =========================================
# Step 8: Split dataset into train/test
# =========================================
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))

# =========================================
# Step 9: TF-IDF Vectorization
# =========================================
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)  # top 5000 words
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# =========================================
# Step 10: Train Multinomial Naive Bayes
# =========================================
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# =========================================
# Step 11: Evaluate Model
# =========================================
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = nb_model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# =========================================
# Step 12: Predict New Messages
# =========================================
new_messages = [
    "Congratulations! You've won a free ticket. Call now!",
    "Hey, are we meeting today for lunch?"
]

# Preprocess
new_messages_clean = [text.lower().translate(str.maketrans('', '', string.punctuation)) for text in new_messages]
new_messages_tfidf = vectorizer.transform(new_messages_clean)
predictions = nb_model.predict(new_messages_tfidf)

for msg, pred in zip(new_messages, predictions):
    print(f"\nMessage: {msg}")
    print("Predicted:", "Spam" if pred==1 else "Not Spam")


Training samples: 131977
Testing samples: 32995
Accuracy: 0.9613577814820428

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96     15835
           1       0.98      0.95      0.96     17160

    accuracy                           0.96     32995
   macro avg       0.96      0.96      0.96     32995
weighted avg       0.96      0.96      0.96     32995

Confusion Matrix:
 [[15472   363]
 [  912 16248]]

Message: Congratulations! You've won a free ticket. Call now!
Predicted: Spam

Message: Hey, are we meeting today for lunch?
Predicted: Not Spam
