# Spam Email Classifier

A machine learning model to classify emails as spam or not spam using the UCI Spambase dataset.

In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

## Load Data

In [None]:
# Read the raw Spambase file. header=None makes pandas number the columns
# 0..N-1 instead of treating the first data row as column names.
df = pd.read_csv(filepath_or_buffer='../data/spambase.data', header=None)

# Preview the first five rows (shown as the cell output in a notebook).
df.head(n=5)

## Preprocess Data

In [None]:
# Features are every column except the last; the last column is the label.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Simulate 'email text' by joining each row's feature values as strings
text_data = X.astype(str).agg(' '.join, axis=1)

# Split the raw text BEFORE fitting the vectorizer. Fitting TF-IDF on the
# full dataset lets the IDF weights and the max_df filter see the test rows,
# which leaks test-set statistics into training and inflates the evaluation.
# With the same random_state and row count, the same rows end up in each split.
text_train, text_test, y_train, y_test = train_test_split(
    text_data, y, test_size=0.2, random_state=42
)

# NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of
# two or more word characters and treats '.' as a separator, so a value such
# as '0.64' contributes just the token '64' and '0.0' contributes nothing.
# Confirm this lossy "numbers as text" encoding is really what is wanted.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)

# Learn vocabulary/IDF from the training rows only, then reuse them for test.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Kept so any later cell that expects the full TF-IDF matrix still finds it.
X_tfidf = vectorizer.transform(text_data)

## Train Model

In [None]:
# Multinomial Naive Bayes with its default settings, fitted on the
# TF-IDF training matrix and the matching training labels.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

## Evaluate Model

In [None]:
# Predict labels for the held-out rows, then print the per-class
# precision / recall / F1 summary against the true labels.
y_pred = model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

## Save Model

In [None]:
import os

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (a fresh checkout may not have it).
os.makedirs('../models', exist_ok=True)

# Persist both artifacts: the classifier is only usable together with the
# exact vectorizer that produced its training features.
joblib.dump(model, '../models/spam_classifier_model.pkl')
joblib.dump(vectorizer, '../models/tfidf_vectorizer.pkl')

## Predict Example

In [None]:
# Replace with 57 real feature values as strings.
# A list of bare '0' strings does not work as a placeholder: the vectorizer's
# default token_pattern drops single-character tokens, so such a sample turns
# into an all-zero vector and the "prediction" only reflects the class prior.
# Use a real row instead, converted exactly like the training text. The row
# is kept as a one-row DataFrame (iloc[[0]]) while casting so each column
# keeps its own dtype and therefore its own string form (e.g. '0.0' vs '278').
dummy_email = X.iloc[[0]].astype(str).iloc[0].tolist()
sample = ' '.join(dummy_email)
vector = vectorizer.transform([sample])
# Label 1 is reported as spam, anything else as ham.
print('Prediction:', 'Spam' if model.predict(vector)[0] == 1 else 'Ham')