In [2]:
# **Phishing Email Detection Using Machine Learning**

# Import Libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download NLTK data files
nltk.download('stopwords')
nltk.download('punkt')

# Load the Dataset
# Download the dataset
!wget -q https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv

# Read the dataset
df = pd.read_csv('sms.tsv', sep='\t', header=None, names=['label', 'text'])

# Data Preprocessing
# Map 'ham' to 0 and 'spam' to 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Initialize the Porter Stemmer
ps = PorterStemmer()

# Preprocess the text data
corpus = []
for message in df['text']:
    # Remove non-alphabetic characters and convert to lowercase
    review = re.sub('[^a-zA-Z]', ' ', message).lower()
    # Tokenize
    review = nltk.word_tokenize(review)
    # Remove stopwords and apply stemming
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    # Join words back into a single string
    review = ' '.join(review)
    corpus.append(review)

# Feature Extraction
# Convert text data into numerical features
tfidf = TfidfVectorizer(max_features=2500)

X = tfidf.fit_transform(corpus).toarray()
y = df['label'].values

print(X)
print(y)

# Split the Dataset
X_train, X_test, y_train, y_test, text_train, text_test = train_test_split(
    X, y, df['text'], test_size=0.20, random_state=0)

# Train the Model
# Initialize the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the Model
# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%\n")

# Display classification report
print("Classification Report:\n")
print(classification_report(y_test, y_pred))

# Display confusion matrix
print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

# Display Sample Predictions
print("\nSample Predictions:\n")

# Create a DataFrame with the actual and predicted labels
results_df = pd.DataFrame({
    'Message': text_test,
    'Actual Label': y_test,
    'Predicted Label': y_pred
})

# Map labels back to 'ham' and 'spam'
label_mapping = {0: 'ham', 1: 'spam'}
results_df['Actual Label'] = results_df['Actual Label'].map(label_mapping)
results_df['Predicted Label'] = results_df['Predicted Label'].map(label_mapping)

# Display a few sample messages with their actual and predicted labels
sample_results = results_df.sample(9, random_state=1)
print(sample_results[['Message', 'Actual Label', 'Predicted Label']].to_string(index=False))

# Behavioral Analysis Simulation
# Simulated user behavior data
np.random.seed(0)  # For reproducibility
user_data = pd.DataFrame({
    'user_id': np.arange(1, 101),
    'clicks': np.random.poisson(5, 100),
    'suspicious_downloads': np.random.binomial(1, 0.05, 100),
    'unusual_time_activity': np.random.binomial(1, 0.1, 100)
})

# Identify users with potential phishing interaction
user_data['potential_phishing'] = user_data.apply(
    lambda x: 1 if x['clicks'] > 10 or x['suspicious_downloads'] == 1 or x['unusual_time_activity'] == 1 else 0,
    axis=1
)

# Display users flagged for potential phishing
flagged_users = user_data[user_data['potential_phishing'] == 1]
print("\nFlagged Users for Potential Phishing Attempts:\n")
print(flagged_users.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[0 0 1 ... 0 0 0]
Accuracy: 96.50%

Classification Report:

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       955
           1       0.98      0.77      0.86       160

    accuracy                           0.97      1115
   macro avg       0.97      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion Matrix:

[[953   2]
 [ 37 123]]

Sample Predictions:

                                                                                                                                                      Message Actual Label Predicted Label
               URGENT! This is the 2nd attempt to contact U!U have WON £1000CALL 09071512432 b4 300603t&csBCM4235WC1N3XX.callcost150ppmmobilesvary. max£7. 50         spam            spam
   Sad story of a M