In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline

# Expects a CSV file with a 'text' column (the email body) and a 'spam' column holding the
# spam/ham label (the sample run in this notebook uses numeric labels 0 and 1).
def read_email_data(file_path):
    """Load the email dataset from a CSV source.

    Args:
        file_path: Path (or any other source accepted by ``pandas.read_csv``)
            of the CSV file to load.

    Returns:
        The parsed contents as a ``pandas.DataFrame``. No columns are
        validated or transformed here.
    """
    return pd.read_csv(file_path)

def train_spam_filter(data_frame):
    """Fit a bag-of-words Naive Bayes classifier and print hold-out metrics.

    Args:
        data_frame: DataFrame with a 'text' column (documents) and a 'spam'
            column (labels).

    Returns:
        The fitted pipeline (``CountVectorizer`` followed by
        ``MultinomialNB``), trained on 80% of the rows.

    Side effects:
        Prints accuracy, the classification report and the confusion matrix
        computed on the remaining 20% of the rows.
    """
    texts = data_frame['text']
    labels = data_frame['spam']

    # Hold out 20% for evaluation; the fixed seed keeps the split reproducible.
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Token counts feed directly into the multinomial Naive Bayes model.
    classifier = make_pipeline(CountVectorizer(), MultinomialNB())
    classifier.fit(train_texts, train_labels)

    predicted = classifier.predict(test_texts)

    print(f'Accuracy: {accuracy_score(test_labels, predicted):.2%}')

    # Each remaining metric is printed as a heading followed by its value.
    reports = (
        ("\nClassification Report:", classification_report),
        ("\nConfusion Matrix:", confusion_matrix),
    )
    for heading, metric in reports:
        print(heading)
        print(metric(test_labels, predicted))

    return classifier

def predict_single_email(model, email_text):
    """Classify one email and return its predicted label.

    Args:
        model: Object exposing ``predict(list_of_texts)``.
        email_text: The text of the email to classify.

    Returns:
        The first element of what ``model.predict`` returns for a
        one-element batch containing ``email_text``.
    """
    # The model works on batches, so wrap the text in a list and unwrap the result.
    return model.predict([email_text])[0]

def _is_spam_label(label):
    """Return True when a predicted label denotes spam.

    Accepts both the numeric encoding (1) and the string encoding ('spam'),
    so the check works whichever way the 'spam' column was labelled.
    """
    return str(label).strip().lower() in {'1', 'spam'}


def main():
    """Train the spam filter on 'emails.csv' and classify one example email.

    Reads the dataset, trains and evaluates the model (metrics are printed by
    ``train_spam_filter``), then prints whether a sample message is spam or ham.
    """
    file_path = 'emails.csv'

    # Read the email data
    email_data = read_email_data(file_path)

    # Train the spam filter
    spam_filter_model = train_spam_filter(email_data)

    # Example of predicting a single email
    example_email_text = "Congratulations! You've won a free vacation. Click here to claim your prize."
    prediction = predict_single_email(spam_filter_model, example_email_text)

    # The model returns whatever label type the 'spam' column holds. Comparing
    # directly against the string 'spam' never matches numeric 0/1 labels and
    # would report every email as ham, so normalise the label first.
    if _is_spam_label(prediction):
        print("This email is classified as spam.")
    else:
        print("This email is classified as ham.")

# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


Accuracy: 98.78%

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       856
           1       0.98      0.97      0.98       290

    accuracy                           0.99      1146
   macro avg       0.98      0.98      0.98      1146
weighted avg       0.99      0.99      0.99      1146


Confusion Matrix:
[[850   6]
 [  8 282]]
This email is classified as ham.
