In [None]:
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv('spam_ham_enron.csv')  # Replace with your actual file

# Extract features and labels
X = data.iloc[:, 2]  # Column C has the email text
y = data.iloc[:, 3]  # Column D has the labels (0 = ham, 1 = spam)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred = model.predict(X_test_tfidf)
y_pred_proba = model.predict_proba(X_test_tfidf)[:, 1]  # Probability of being spam

accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.4f}')
print(classification_report(y_test, y_pred))

# Function to predict and show confidence
def predict_email(email_text):
    email_tfidf = vectorizer.transform([email_text])
    prediction = model.predict(email_tfidf)[0]
    confidence = model.predict_proba(email_tfidf)[0, 1]  # Spam probability
    label = "Spam" if prediction == 1 else "Ham"
    confidence_percentage = confidence * 100 if prediction == 1 else (1 - confidence) * 100
    return label, confidence_percentage

# Example usage
email = "Congratulations! You've won a free iPhone. Click here to claim now."
label, confidence = predict_email(email)
print(f'Prediction: {label}, Confidence: {confidence:.2f}%')
"""

In [None]:
'''
email_text = "Dear Students, This is your confirmation that you re all set for the Spring 2025 Internship Fair on Wednesday, February 12th at the Kimmel Center! Your confirmed time slot is: 1:40 PM - 2:55 PM Reminders: Line-up begins 30 minutes before your slot. You will be given a colored wristband when you arrive to check in at the fair. You ll have 75 minutes to make a great impression. Plan your time wisely to maximize your interactions across both floors. Avoid coats, backpacks or large bags since there will be no coat check.  Quick Tips:Perfect your pitch: State your name clearly and have a concise pitch ready about your skills and goals. Discuss how your expertise aligns with the company's needs. Rather than preparing full 60-90 second pitches, plan on a brief introduction, a short conversation that may occur in a group of students. Be Prepared: Make sure you ve done advance research to determine which companies at the fair will be hiring for roles you re qualified for.   Don t throw your resume at recruiters: Politely offer your resume if it naturally arises in the conversation.  Ask thoughtful questions: Show you ve done your research ahead of time and avoid questions that cover basic company information. Avoid dominating the conversation: Keep conversations brief and allow others to interact with the recruiter. Thank the recruiter: Express appreciation for their time and ask for contact information for follow-up. "
label, confidence = predict_email(email_text)
print(f'Prediction: {label}, Confidence: {confidence:.2f}%')
'''

In [None]:
"""
email_text = "We have detected unusual activity on your PayPal account. As a security measure, we have temporarily limited your account access. To restore full access, please verify your account by clicking the link below: 🔗 Click Here to Secure Your Account If you do not complete the verification within 24 hours, your account will be permanently suspended. Thank you for choosing PayPal. "
label, confidence = predict_email(email_text)
print(f'Prediction: {label}, Confidence: {confidence:.2f}%')
"""

In [None]:
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def load_and_prepare_data(filepath):
    # Load the dataset
    data = pd.read_csv(filepath, dtype={1: str}, low_memory=False)

    # Extract features and labels
    X = data['Email Text'].astype(str).fillna('')
    y = data['Email Type']

    # Convert labels to binary
    y = y.str.lower()
    y = y.map(lambda x: 0 if 'safe' in str(x) else (1 if 'phish' in str(x) else np.nan))

    # Remove invalid rows
    valid_rows = (X.str.len() > 0) & (y.notna())
    X = X[valid_rows]
    y = y[valid_rows]

    if len(y) == 0:
        raise ValueError("Dataset became empty after cleaning. Check your data format and labels.")

    return X, y

def train_phishing_model(X, y):
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert text data into numerical features
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Train model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_tfidf, y_train)

    # Evaluate
    y_pred = model.predict(X_test_tfidf)
    print(f'Model Accuracy: {accuracy_score(y_test, y_pred):.4f}')
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return model, vectorizer

def predict_email(email_text, model, vectorizer):
    # Predict whether an email is phishing or safe.
    if not email_text.strip():
        return "Invalid Input", 0.0

    email_tfidf = vectorizer.transform([email_text])
    prediction = model.predict(email_tfidf)[0]
    confidence = model.predict_proba(email_tfidf)[0, 1]

    label = "Phishing" if prediction == 1 else "Safe"
    confidence_percentage = confidence * 100 if prediction == 1 else (1 - confidence) * 100

    return label, confidence_percentage

if __name__ == "__main__":
    try:
        # Load and prepare data
        X, y = load_and_prepare_data('PhishingDataset.csv')  # Replace with your file path

        # Train model
        model, vectorizer = train_phishing_model(X, y)

        # Example prediction
        test_email = "Your account has been compromised. Click the link below to reset your password immediately."
        label, confidence = predict_email(test_email, model, vectorizer)
        print(f'\nTest Prediction:\nEmail: {test_email}\nPrediction: {label} Confidence: {confidence:.2f}%')

    except Exception as e:
        print(f"Error: {str(e)}")
        """

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# First, train both models
def train_spam_model(spam_data_path):
    """Train and evaluate the spam classifier from a CSV file.

    The email text is read from the third column and the label from the
    fourth column (0 = ham, 1 = spam), both selected by position. Rows
    without a label are dropped and a missing email body is treated as an
    empty document. Accuracy and a classification report for a 20% hold-out
    split are printed.

    Args:
        spam_data_path: Path of the CSV file to load.

    Returns:
        A ``(spam_model, spam_vectorizer)`` tuple: the fitted
        LogisticRegression and the TfidfVectorizer it was trained with.
    """
    data = pd.read_csv(spam_data_path)

    # A row without a label cannot be used for supervised training.
    data = data[data.iloc[:, 3].notna()]

    # TfidfVectorizer refuses NaN documents, so a missing body becomes ''.
    X_spam = data.iloc[:, 2].fillna('').astype(str)  # Email text
    y_spam = data.iloc[:, 3]  # Labels (0 = ham, 1 = spam)

    # Fixed seed keeps the train/test partition reproducible between runs.
    X_train_spam, X_test_spam, y_train_spam, y_test_spam = train_test_split(
        X_spam, y_spam, test_size=0.2, random_state=42
    )

    # Fit the vocabulary on the training split only; the test split is
    # merely transformed so it cannot leak into the features.
    spam_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X_train_spam_tfidf = spam_vectorizer.fit_transform(X_train_spam)
    X_test_spam_tfidf = spam_vectorizer.transform(X_test_spam)

    spam_model = LogisticRegression(max_iter=1000)
    spam_model.fit(X_train_spam_tfidf, y_train_spam)

    # Report hold-out performance.
    y_pred_spam = spam_model.predict(X_test_spam_tfidf)
    print("\nSpam Model Performance:")
    print(f'Accuracy: {accuracy_score(y_test_spam, y_pred_spam):.4f}')
    print(classification_report(y_test_spam, y_pred_spam))

    return spam_model, spam_vectorizer

def train_phishing_model(phishing_data_path):
    """Train and evaluate the phishing classifier from a CSV file.

    Reads the 'Email Text' and 'Email Type' columns. A label containing
    "safe" maps to 0 and one containing "phish" maps to 1 (case-insensitive);
    rows with any other label, or with no email text, are discarded.
    Accuracy and a classification report for a 20% hold-out split are
    printed.

    Args:
        phishing_data_path: Path of the CSV file to load.

    Returns:
        A ``(phish_model, phish_vectorizer)`` tuple: the fitted
        LogisticRegression and the TfidfVectorizer it was trained with.

    Raises:
        ValueError: If no usable rows remain after cleaning.
    """
    data = pd.read_csv(phishing_data_path, dtype={1: str}, low_memory=False)

    # Fill missing bodies BEFORE casting to str: casting first can turn NaN
    # into the literal text 'nan', which then slips past the empty-text
    # filter below and is trained on as if it were a real email.
    X_phish = data['Email Text'].fillna('').astype(str)

    def _to_binary(raw_label):
        # 0 = safe, 1 = phishing, NaN = unrecognised label (dropped below).
        text = str(raw_label).lower()
        if 'safe' in text:
            return 0
        if 'phish' in text:
            return 1
        return np.nan

    y_phish = data['Email Type'].map(_to_binary)

    # Keep only rows that have both some text and a recognised label.
    valid_rows = (X_phish.str.len() > 0) & (y_phish.notna())
    X_phish = X_phish[valid_rows]
    # The NaN placeholder forces a float dtype; restore integer class labels.
    y_phish = y_phish[valid_rows].astype(int)

    if len(y_phish) == 0:
        raise ValueError("Dataset became empty after cleaning. Check your data format and labels.")

    # Fixed seed keeps the train/test partition reproducible between runs.
    X_train_phish, X_test_phish, y_train_phish, y_test_phish = train_test_split(
        X_phish, y_phish, test_size=0.2, random_state=42
    )

    # Fit the vocabulary on the training split only.
    phish_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X_train_phish_tfidf = phish_vectorizer.fit_transform(X_train_phish)
    X_test_phish_tfidf = phish_vectorizer.transform(X_test_phish)

    phish_model = LogisticRegression(max_iter=1000)
    phish_model.fit(X_train_phish_tfidf, y_train_phish)

    # Report hold-out performance.
    y_pred_phish = phish_model.predict(X_test_phish_tfidf)
    print("\nPhishing Model Performance:")
    print(f'Accuracy: {accuracy_score(y_test_phish, y_pred_phish):.4f}')
    print(classification_report(y_test_phish, y_pred_phish))

    return phish_model, phish_vectorizer

def analyze_email(email_text, spam_model, spam_vectorizer, phish_model, phish_vectorizer):
    """Run one email through the spam and the phishing classifier.

    Each model is used with the vectorizer passed alongside it. The reported
    confidence is the probability of the predicted class as a percentage,
    where column 1 of ``predict_proba`` is read as the positive class.

    Returns:
        A dict with "spam_analysis" and "phishing_analysis" entries (each a
        dict holding "label" and "confidence") and a "threat_level" that is
        "High" when both models flag the email, "Medium" when exactly one
        does, and "Low" otherwise.
    """

    def _evaluate(model, vectorizer, positive_label, negative_label):
        # Vectorize the single email, then ask for both the hard prediction
        # and the positive-class probability.
        features = vectorizer.transform([email_text])
        flagged = model.predict(features)[0] == 1
        positive_proba = model.predict_proba(features)[0, 1]
        if flagged:
            return flagged, positive_label, positive_proba * 100
        # For a negative verdict, report the probability of the negative class.
        return flagged, negative_label, (1 - positive_proba) * 100

    is_spam, spam_label, spam_confidence = _evaluate(
        spam_model, spam_vectorizer, "Spam", "Not Spam"
    )
    is_phish, phish_label, phish_confidence = _evaluate(
        phish_model, phish_vectorizer, "Phishing", "Safe"
    )

    # Number of detectors that fired (0, 1 or 2) indexes the threat scale.
    detections = int(is_spam) + int(is_phish)
    threat_level = ("Low", "Medium", "High")[detections]

    spam_analysis = {"label": spam_label, "confidence": spam_confidence}
    phishing_analysis = {"label": phish_label, "confidence": phish_confidence}
    return {
        "spam_analysis": spam_analysis,
        "phishing_analysis": phishing_analysis,
        "threat_level": threat_level,
    }

# Main execution
if __name__ == "__main__":
    # Fit both classifiers; the resulting names stay at module level.
    spam_model, spam_vectorizer = train_spam_model('spam_ham_enron.csv')
    phish_model, phish_vectorizer = train_phishing_model('PhishingDataset.csv')

    # Sample message mixing account-compromise and prize-giveaway wording.
    test_email = """
    URGENT: Your account has been compromised!
    Click here immediately to verify your information: http://suspicious-link.com
    Win a free iPhone today!
    """

    result = analyze_email(
        test_email, spam_model, spam_vectorizer, phish_model, phish_vectorizer
    )

    # Emit the whole report in one call, one item per line.
    print(
        "\nEmail Analysis Results:",
        "-" * 50,
        "Spam Check: {label} (Confidence: {confidence:.2f}%)".format(
            **result['spam_analysis']
        ),
        "Phishing Check: {label} (Confidence: {confidence:.2f}%)".format(
            **result['phishing_analysis']
        ),
        f"Overall Threat Level: {result['threat_level']}",
        sep="\n",
    )


Spam Model Performance:
Accuracy: 0.9865
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       742
           1       0.97      0.98      0.98       293

    accuracy                           0.99      1035
   macro avg       0.98      0.98      0.98      1035
weighted avg       0.99      0.99      0.99      1035


Phishing Model Performance:
Accuracy: 0.9683
              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97      2245
         1.0       0.95      0.97      0.96      1473

    accuracy                           0.97      3718
   macro avg       0.96      0.97      0.97      3718
weighted avg       0.97      0.97      0.97      3718


Email Analysis Results:
--------------------------------------------------
Spam Check: Spam (Confidence: 86.75%)
Phishing Check: Phishing (Confidence: 98.48%)
Overall Threat Level: High


In [None]:
# Sample message: an internship-fair confirmation sent to students
test_email = """Dear Students, This is your confirmation that you re all set for the Spring 2025 Internship Fair on Wednesday, February 12th at the Kimmel Center! Your confirmed time slot is: 1:40 PM - 2:55 PM Reminders: Line-up begins 30 minutes before your slot. You will be given a colored wristband when you arrive to check in at the fair. You ll have 75 minutes to make a great impression. Plan your time wisely to maximize your interactions across both floors. Avoid coats, backpacks or large bags since there will be no coat check.  Quick Tips:Perfect your pitch: State your name clearly and have a concise pitch ready about your skills and goals. Discuss how your expertise aligns with the company's needs. Rather than preparing full 60-90 second pitches, plan on a brief introduction, a short conversation that may occur in a group of students. Be Prepared: Make sure you ve done advance research to determine which companies at the fair will be hiring for roles you re qualified for.   Don t throw your resume at recruiters: Politely offer your resume if it naturally arises in the conversation.  Ask thoughtful questions: Show you ve done your research ahead of time and avoid questions that cover basic company information. Avoid dominating the conversation: Keep conversations brief and allow others to interact with the recruiter. Thank the recruiter: Express appreciation for their time and ask for contact information for follow-up."""

# The models and vectorizers are not defined in this cell; presumably they
# are the ones left in the session by the training cell.
result = analyze_email(
    test_email, spam_model, spam_vectorizer, phish_model, phish_vectorizer
)

# Emit the whole report in one call, one item per line
print(
    "\nEmail Analysis Results:",
    "-" * 50,
    "Spam Check: {label} (Confidence: {confidence:.2f}%)".format(
        **result['spam_analysis']
    ),
    "Phishing Check: {label} (Confidence: {confidence:.2f}%)".format(
        **result['phishing_analysis']
    ),
    f"Overall Threat Level: {result['threat_level']}",
    sep="\n",
)


Email Analysis Results:
--------------------------------------------------
Spam Check: Not Spam (Confidence: 58.08%)
Phishing Check: Safe (Confidence: 78.34%)
Overall Threat Level: Low


In [None]:
# Sample message: an account-verification notice that names PayPal
test_email = """
We have detected unusual activity on your PayPal account. As a security measure, we have temporarily limited your account access. To restore full access, please verify your account by clicking the link below: 🔗 Click Here to Secure Your Account If you do not complete the verification within 24 hours, your account will be permanently suspended. Thank you for choosing PayPal.
"""

# The models and vectorizers are not defined in this cell; presumably they
# are the ones left in the session by the training cell.
result = analyze_email(
    test_email, spam_model, spam_vectorizer, phish_model, phish_vectorizer
)

# Emit the whole report in one call, one item per line
print(
    "\nEmail Analysis Results:",
    "-" * 50,
    "Spam Check: {label} (Confidence: {confidence:.2f}%)".format(
        **result['spam_analysis']
    ),
    "Phishing Check: {label} (Confidence: {confidence:.2f}%)".format(
        **result['phishing_analysis']
    ),
    f"Overall Threat Level: {result['threat_level']}",
    sep="\n",
)


Email Analysis Results:
--------------------------------------------------
Spam Check: Spam (Confidence: 72.01%)
Phishing Check: Phishing (Confidence: 93.99%)
Overall Threat Level: High


In [None]:
from flask import Flask, request, jsonify
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk

# Download stopwords
nltk.download("stopwords")
from nltk.corpus import stopwords

app = Flask(__name__)

# Module-level slots for the fitted models and their vectorizers. They start
# out empty and are filled in by train_spam_model() / train_phishing_model().
spam_model = None
spam_vectorizer = None
phish_model = None
phish_vectorizer = None

def train_spam_model():
    """Fit the spam classifier and publish it through module globals.

    Loads "spam_ham_enron.csv", taking the email text from the third column
    and the label from the fourth (by position). Rows without a label are
    dropped and a missing email body is treated as an empty document. The
    model is fitted on the 80% training part of a seeded split; the fitted
    objects are stored in ``spam_model`` and ``spam_vectorizer``.
    """
    global spam_model, spam_vectorizer
    data = pd.read_csv("spam_ham_enron.csv")

    # A row without a label cannot be used for supervised training.
    data = data[data.iloc[:, 3].notna()]

    # TfidfVectorizer refuses NaN documents, so a missing body becomes "".
    X_spam = data.iloc[:, 2].fillna("").astype(str)
    y_spam = data.iloc[:, 3]

    # Only the training part is needed here; the held-out part is discarded.
    X_train_spam, _, y_train_spam, _ = train_test_split(
        X_spam, y_spam, test_size=0.2, random_state=42
    )

    spam_vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    X_train_spam_tfidf = spam_vectorizer.fit_transform(X_train_spam)

    spam_model = LogisticRegression(max_iter=1000)
    spam_model.fit(X_train_spam_tfidf, y_train_spam)

def train_phishing_model():
    """Fit the phishing classifier and publish it through module globals.

    Loads "PhishingDataset.csv" and reads its "Email Text" and "Email Type"
    columns. A label containing "safe" maps to 0 and one containing "phish"
    maps to 1 (case-insensitive). Rows with any other label, or with no
    email text, are dropped rather than being counted as phishing. The model
    is fitted on the 80% training part of a seeded split; the fitted objects
    are stored in ``phish_model`` and ``phish_vectorizer``.

    Raises:
        ValueError: If no usable rows remain after cleaning.
    """
    global phish_model, phish_vectorizer
    data = pd.read_csv("PhishingDataset.csv", dtype={1: str}, low_memory=False)

    # Fill missing bodies BEFORE casting to str: casting first can turn NaN
    # into the literal text "nan", which would then be trained on as if it
    # were a real email.
    X_phish = data["Email Text"].fillna("").astype(str)

    def _to_binary(raw_label):
        # 0 = safe, 1 = phishing, NaN = unrecognised label (dropped below).
        text = str(raw_label).lower()
        if "safe" in text:
            return 0
        if "phish" in text:
            return 1
        return np.nan

    y_phish = data["Email Type"].map(_to_binary)

    # Keep only rows that have both some text and a recognised label.
    valid_rows = (X_phish.str.len() > 0) & y_phish.notna()
    X_phish = X_phish[valid_rows]
    # The NaN placeholder forces a float dtype; restore integer class labels.
    y_phish = y_phish[valid_rows].astype(int)

    if len(y_phish) == 0:
        raise ValueError("Dataset became empty after cleaning. Check your data format and labels.")

    # Only the training part is needed here; the held-out part is discarded.
    X_train_phish, _, y_train_phish, _ = train_test_split(
        X_phish, y_phish, test_size=0.2, random_state=42
    )

    phish_vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    X_train_phish_tfidf = phish_vectorizer.fit_transform(X_train_phish)

    phish_model = LogisticRegression(max_iter=1000)
    phish_model.fit(X_train_phish_tfidf, y_train_phish)

@app.route("/analyze", methods=["POST"])
def analyze_email():
    """Classify the email text posted to /analyze as spam and/or phishing.

    Expects a JSON object with an "emailText" string.

    Returns:
        200 with {"spam_analysis": ..., "phishing_analysis": ...} on success;
        400 with an "error" message when the body is not a JSON object or
        carries no usable email text;
        503 with an "error" message when the models have not been trained.
    """
    # silent=True yields None for a missing, malformed or non-JSON body
    # instead of raising, so every bad request takes the same 400 path.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        # Valid JSON that is not an object (list, number, ...) has no .get().
        payload = {}
    email_text = payload.get("emailText", "")

    # The vectorizer needs a real string; blank text carries no signal.
    if not isinstance(email_text, str) or not email_text.strip():
        return jsonify({"error": "No email text provided"}), 400

    # The model globals stay None until the train_* functions have run,
    # e.g. when the app is served without going through the __main__ block.
    trained = (spam_model, spam_vectorizer, phish_model, phish_vectorizer)
    if any(component is None for component in trained):
        return jsonify({"error": "Models are not trained yet"}), 503

    # Spam Detection
    spam_tfidf = spam_vectorizer.transform([email_text])
    spam_pred = spam_model.predict(spam_tfidf)[0]
    spam_label = "Spam" if spam_pred == 1 else "Not Spam"

    # Phishing Detection
    phish_tfidf = phish_vectorizer.transform([email_text])
    phish_pred = phish_model.predict(phish_tfidf)[0]
    phish_label = "Phishing" if phish_pred == 1 else "Safe"

    return jsonify({
        "spam_analysis": spam_label,
        "phishing_analysis": phish_label
    })

if __name__ == "__main__":
    # Both models must be fitted before the first request is served.
    train_spam_model()
    train_phishing_model()
    # debug=True also switches on the auto-reloader by default, which tries
    # to restart the process; in this notebook session that ended in
    # "SystemExit: 1" right after "Restarting with watchdog". Keep debug
    # mode but serve from the current process.
    app.run(debug=True, use_reloader=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\serdu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
