<a href="https://colab.research.google.com/github/arunak451/project/blob/main/email_prediction_using_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import string
import re

# Fetch the NLTK stopword corpus only when it is not already installed, so that
# re-running the notebook (Restart & Run All) does not attempt a download each time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

2. Load and Explore the Dataset

In [None]:
# Load the dataset (replace DATA_PATH with your dataset path)
# Dataset format should have at least two columns: 'text' and 'label' (0=ham, 1=spam)
DATA_PATH = 'spam_emails.csv'  # Example dataset
df = pd.read_csv(DATA_PATH)

# If using the common spam dataset with 'v1' and 'v2' columns:
# df = pd.read_csv('spam.csv', encoding='latin-1')
# df = df[['v1', 'v2']]
# df.columns = ['label', 'text']

# Fail early with a clear message instead of a KeyError further down the notebook.
missing_columns = {'text', 'label'} - set(df.columns)
if missing_columns:
    raise ValueError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

# The prediction code later compares labels against the integer 1, so purely
# textual 'ham'/'spam' labels (as in the v1/v2 dataset above) are converted to
# 0/1 here. Any other labelling scheme is left untouched.
if not pd.api.types.is_numeric_dtype(df['label']):
    normalized_labels = df['label'].astype(str).str.strip().str.lower()
    if set(normalized_labels) <= {'ham', 'spam'}:
        df['label'] = normalized_labels.map({'ham': 0, 'spam': 1})

print(df.head())
print("\nDataset Info:")
# info() writes its report itself and returns None; wrapping it in print()
# would append a stray "None" line to the output.
df.info()
print("\nLabel Distribution:")
print(df['label'].value_counts())

# Visualize the distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='label', data=df)
plt.title('Distribution of Spam vs Ham Emails')
plt.show()

3. Text Preprocessing

In [None]:
# Built once and shared by every call: the stopword set, the stemmer and the
# punctuation-deletion table never change, so there is no reason to rebuild
# them for each email.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one email into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, delete ASCII punctuation, delete digit runs,
    split on whitespace, drop English stopwords, Porter-stem what is left.

    Args:
        text: Raw email text. Missing values (None / NaN) are treated as an
            empty string and other non-string values are converted with
            str(), so a dataset with empty cells does not raise.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing survives).
    """
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    # Punctuation is deleted (not replaced by a space), so "don't" becomes "dont".
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize, drop stopwords, then stem
    words = [_STEMMER.stem(word) for word in text.split() if word not in _STOP_WORDS]

    return ' '.join(words)

# Store a cleaned copy of every email alongside the raw text
df['processed_text'] = df['text'].apply(preprocess_text)

# Sanity check: show the first email before and after cleaning
for heading, column in (("\nOriginal Text:", 'text'), ("\nProcessed Text:", 'processed_text')):
    print(heading)
    print(df[column].iloc[0])

4. Feature Extraction (TF-IDF)

In [None]:
# Labels for every email, in the same row order as df
y = df['label'].values

# Split the *text* before vectorizing so the vectorizer never sees held-out
# emails. Fitting TF-IDF on the full dataset first leaks the test set's
# vocabulary and document frequencies into training.
# test_size and random_state are unchanged from the original split.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_text'], y, test_size=0.2, random_state=42
)

# Convert text to numerical features using TF-IDF, learning the vocabulary from
# the training split only. The matrices stay sparse: the scikit-learn
# classifiers used in this notebook accept sparse input, and a dense
# (n_emails, 5000) array would mostly store zeros.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Features for the whole dataset, kept so that any code referring to X still works
X = tfidf_vectorizer.transform(df['processed_text'])

print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

5. Model Training and Evaluation
5.1 Naive Bayes Classifier


In [None]:
# Fit a multinomial Naive Bayes model and predict the held-out emails
nb_classifier = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb_classifier.predict(X_test)

# Headline accuracy followed by the per-class precision/recall table
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("\nNaive Bayes Classifier:")
print(f"Accuracy: {nb_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_nb))

# Heatmap of actual (rows) against predicted (columns) labels
cm_nb = confusion_matrix(y_test, y_pred_nb)
fig_nb, ax_nb = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_nb, annot=True, fmt='d', cmap='Blues', ax=ax_nb)
ax_nb.set_title('Naive Bayes Confusion Matrix')
ax_nb.set_xlabel('Predicted')
ax_nb.set_ylabel('Actual')
plt.show()

5.2 Support Vector Machine (SVM)

In [None]:
# SVM Classifier
# probability=True is needed because predict_spam and the Flask endpoint call
# predict_proba on the saved model. scikit-learn estimates those probabilities
# with an internal cross-validation that shuffles the data, so random_state is
# pinned to make the fitted probabilities reproducible between runs.
# NOTE: with this calibration, predict_proba can occasionally disagree with
# predict for samples close to the decision boundary.
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
svm_classifier.fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)

# Evaluation
print("\nSVM Classifier:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))

# Confusion Matrix (rows: actual label, columns: predicted label)
cm_svm = confusion_matrix(y_test, y_pred_svm)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Greens')
plt.title('SVM Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

5.3 Random Forest Classifier

In [None]:
# Fit a 100-tree random forest (seeded for repeatability) and predict the held-out emails
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)

# Headline accuracy followed by the per-class precision/recall table
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("\nRandom Forest Classifier:")
print(f"Accuracy: {rf_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

# Heatmap of actual (rows) against predicted (columns) labels
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig_rf, ax_rf = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Oranges', ax=ax_rf)
ax_rf.set_title('Random Forest Confusion Matrix')
ax_rf.set_xlabel('Predicted')
ax_rf.set_ylabel('Actual')
plt.show()

6. Model Comparison and Selection

In [None]:
# Compare model accuracies
models = ['Naive Bayes', 'SVM', 'Random Forest']
accuracies = [accuracy_score(y_test, y_pred_nb),
              accuracy_score(y_test, y_pred_svm),
              accuracy_score(y_test, y_pred_rf)]

plt.figure(figsize=(8, 6))
sns.barplot(x=models, y=accuracies)
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
# Zoom in on the 0.8-1.0 band when every model falls inside it; otherwise lower
# the floor so that a weaker model's bar is not clipped out of the plot.
plt.ylim(min(0.8, max(0.0, min(accuracies) - 0.05)), 1.0)
plt.show()

# Select the model that actually scored the highest accuracy instead of
# hard-coding one. All three candidates provide predict_proba (the SVM because
# it was created with probability=True), which the prediction code relies on.
# Ties go to the earliest entry in `models`.
# NOTE: choosing on the test set makes the winning score slightly optimistic;
# a separate validation split would give an unbiased estimate.
fitted_models = [nb_classifier, svm_classifier, rf_classifier]
best_index = int(np.argmax(accuracies))
best_model = fitted_models[best_index]
print(f"Best model: {models[best_index]} (accuracy: {accuracies[best_index]:.4f})")

7. Save the Model for Future Use

In [None]:
import joblib
import pickle

# Save the model and vectorizer
joblib.dump(best_model, 'spam_classifier_model.pkl')
# A context manager guarantees the file is flushed and closed before any later
# cell tries to read it back; a bare open() leaves that to garbage collection.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

print("Model and vectorizer saved successfully!")

8. Create a Prediction Function

In [None]:
def predict_spam(email_text, model=None, vectorizer=None):
    """Classify one email and report the classifier's confidence.

    Args:
        email_text: Raw email text to classify.
        model: Optional fitted classifier providing predict, predict_proba and
            classes_. When omitted it is loaded from 'spam_classifier_model.pkl'.
        vectorizer: Optional fitted vectorizer. When omitted it is loaded from
            'tfidf_vectorizer.pkl'. Passing both objects avoids re-reading the
            files on every call.

    Returns:
        "SPAM (Confidence: NN.NN%)" when the predicted label equals 1,
        otherwise "NOT SPAM (Confidence: NN.NN%)". The percentage is the
        predicted class's probability.
    """
    # Load the model and vectorizer only when the caller did not supply them.
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook.
    if model is None:
        model = joblib.load('spam_classifier_model.pkl')
    if vectorizer is None:
        with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
            vectorizer = pickle.load(vectorizer_file)

    # Preprocess the text
    processed_text = preprocess_text(email_text)

    # Vectorize the text
    text_vector = vectorizer.transform([processed_text]).toarray()

    # Make prediction
    prediction = model.predict(text_vector)
    probability = model.predict_proba(text_vector)

    # predict_proba columns are ordered like model.classes_, so look the column
    # up instead of assuming the label value equals the column index.
    predicted_label = prediction[0]
    class_index = list(model.classes_).index(predicted_label)
    confidence = probability[0][class_index] * 100

    # Return result
    if predicted_label == 1:
        return f"SPAM (Confidence: {confidence:.2f}%)"
    else:
        return f"NOT SPAM (Confidence: {confidence:.2f}%)"

# Try the classifier on one promotional-sounding message and one ordinary message
test_email = "Congratulations! You've won a $1000 Walmart gift card. Click here to claim your prize now!"
test_email2 = "Hi John, just checking in about our meeting tomorrow at 2pm. Please let me know if that still works for you."

for banner, sample in (("\nTest Email Prediction:", test_email),
                       ("\nTest Email 2 Prediction:", test_email2)):
    print(banner)
    print(predict_spam(sample))

9. Optional: Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Example for SVM tuning
# gamma only influences the RBF kernel. Crossing it with the linear kernel, as a
# single dict would, fits every linear candidate twice with identical results,
# so each kernel gets its own sub-grid (12 candidates instead of 16).
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto']},
]

# 5-fold cross-validation on the training split, using all available cores
grid_search = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

print("\nBest Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Use the best estimator (refitted on the whole training split by GridSearchCV)
best_svm = grid_search.best_estimator_
y_pred_best = best_svm.predict(X_test)
print("\nTuned SVM Accuracy:", accuracy_score(y_test, y_pred_best))

10. Deployment Considerations
To deploy this model, you could:

- Create a Flask/Django web application
- Build a Chrome extension for email clients
- Integrate with email servers directly
- Create an API endpoint for email services to query (a minimal Flask example follows)

In [None]:
from flask import Flask, request, jsonify

app = Flask(__name__)

# Load model and vectorizer at startup
# NOTE: unpickling can execute arbitrary code; only load artifacts that were
# produced by this notebook.
model = joblib.load('spam_classifier_model.pkl')
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the email text sent in a JSON POST body.

    Expects a JSON object with a string field 'text'. Responds with a JSON
    object holding 'prediction' (the predicted label as an int), 'confidence'
    (probability of the predicted class) and 'is_spam' (bool). Returns
    HTTP 400 with an 'error' message when the body is not valid JSON or has
    no string 'text' field.
    """
    # silent=True yields None for a missing or malformed body, so all bad input
    # is answered with the same explicit 400 instead of an unhandled exception.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not isinstance(data.get('text'), str):
        return jsonify({'error': "Request body must be a JSON object with a string 'text' field."}), 400
    email_text = data['text']

    processed_text = preprocess_text(email_text)
    text_vector = vectorizer.transform([processed_text]).toarray()
    prediction = model.predict(text_vector)
    probability = model.predict_proba(text_vector)

    # predict_proba columns are ordered like model.classes_, so look the column
    # up instead of using the label value itself as an index.
    predicted_label = prediction[0]
    class_index = list(model.classes_).index(predicted_label)

    result = {
        'prediction': int(predicted_label),
        'confidence': float(probability[0][class_index]),
        'is_spam': bool(predicted_label)
    }

    return jsonify(result)

if __name__ == '__main__':
    # debug=True turns on the interactive debugger, which lets anyone who can
    # reach the server run arbitrary Python; keep it off outside local debugging.
    app.run(debug=False)