In [None]:
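# **Phishing Email Detection Using Neural Networks**
# Note: the classifier below is trained on an SMS ham/spam dataset (sms.tsv), so the reported metrics describe SMS spam detection.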

# Import Libraries
import pandas as pd
import numpy as np
import nltk
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download NLTK data files
nltk.download('stopwords')
nltk.download('punkt')

# Load the Dataset
# Download the dataset
!wget -q https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv

# Read the dataset
df = pd.read_csv('sms.tsv', sep='\t', header=None, names=['label', 'text'])

# Data Preprocessing
# Map 'ham' to 0 and 'spam' to 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Clean and preprocess the text data
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the corpus only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        # Memoise on the function object so later calls skip the corpus lookup
        _english_stopwords._cache = cached
    return cached


def preprocess_text(text):
    """Normalise a raw message into a space-joined string of non-stopword tokens.

    Args:
        text: The raw message string.

    Returns:
        The lowercase tokens that are not English stopwords, joined by single
        spaces (an empty string if no token survives).
    """
    # Replace every non-alphabetic character with a space and convert to lowercase
    text = re.sub('[^a-zA-Z]', ' ', text).lower()
    # Tokenize
    words = nltk.word_tokenize(text)
    # Remove stopwords. The set is fetched once per call (and loaded once overall)
    # instead of calling stopwords.words() and scanning a list for every token.
    stop_set = _english_stopwords()
    words = [word for word in words if word not in stop_set]
    # Join words back into a single string
    return ' '.join(words)

df['clean_text'] = df['text'].apply(preprocess_text)

y = df['label'].values

# Split the Dataset
# Split BEFORE fitting the tokenizer so the vocabulary (word -> index ranking)
# is learned from training messages only; fitting on the full corpus would leak
# information about the test messages into the features.
# With the same random_state and number of rows, the row partition matches the
# one obtained by splitting the padded matrix directly.
clean_train, clean_test, y_train, y_test, text_train, text_test = train_test_split(
    df['clean_text'], y, df['text'], test_size=0.20, random_state=0)

# Prepare the data for the neural network
# Vocabulary is capped via num_words=5000; sequences are padded/truncated to 100 ids
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(clean_train)

X_train = pad_sequences(tokenizer.texts_to_sequences(clean_train), maxlen=100)
X_test = pad_sequences(tokenizer.texts_to_sequences(clean_test), maxlen=100)

# Padded matrix for the whole corpus (encoded with the train-fitted vocabulary),
# kept so the name `X` remains available as before.
X = pad_sequences(tokenizer.texts_to_sequences(df['clean_text']), maxlen=100)

# Build the Neural Network Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable on a fresh kernel run
# (exact repeatability may still depend on hardware / library versions).
tf.keras.utils.set_random_seed(0)

model = Sequential()
# input_dim=5000 matches the tokenizer's num_words. `input_length` is omitted:
# it is deprecated in Keras 3 and the sequence length is taken from the data.
model.add(Embedding(input_dim=5000, output_dim=64))
model.add(LSTM(64, dropout=0.5, recurrent_dropout=0.5))
# Single sigmoid unit: output is the predicted probability of class 1
model.add(Dense(1, activation='sigmoid'))

# Compile the Model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the Model
# validation_split=0.1 holds out 10% of the training data for per-epoch validation
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)

# Evaluate the Model
# Score the held-out messages, threshold the predicted probabilities at 0.5
# and flatten the result to a 1-D vector of 0/1 class ids
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).reshape(-1).astype(int)

# Overall accuracy, reported as a percentage
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy * 100:.2f}%\n")

# Per-class precision / recall / F1 (heading, blank line, then the report)
print("Classification Report:\n", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix (heading, blank line, then the matrix)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), sep="\n")

# Display Sample Predictions
print("\nSample Predictions:\n")

# Lookup that turns the numeric class ids back into 'ham' / 'spam'
label_mapping = {0: 'ham', 1: 'spam'}

# One row per test message: the raw text, the true label and the predicted label
results_df = pd.DataFrame({
    'Message': text_test,
    'Actual Label': y_test,
    'Predicted Label': y_pred
})

# Replace the numeric ids in both label columns with their string names
for label_column in ('Actual Label', 'Predicted Label'):
    results_df[label_column] = results_df[label_column].map(label_mapping)

# Show a fixed random sample of nine messages with actual vs. predicted labels
sample_results = results_df.sample(n=9, random_state=1)
shown_columns = ['Message', 'Actual Label', 'Predicted Label']
print(sample_results[shown_columns].to_string(index=False))

# Behavioral Analysis Simulation
# Simulated user behavior data (purely synthetic; not derived from the model above)
N_SIMULATED_USERS = 100

np.random.seed(0)  # For reproducibility
# The three random draws happen in this order, which fixes the generated values
user_data = pd.DataFrame({
    'user_id': np.arange(1, N_SIMULATED_USERS + 1),
    'clicks': np.random.poisson(5, N_SIMULATED_USERS),
    'suspicious_downloads': np.random.binomial(1, 0.05, N_SIMULATED_USERS),
    'unusual_time_activity': np.random.binomial(1, 0.1, N_SIMULATED_USERS)
})

# Identify users with potential phishing interaction: flag a user (1) when any
# of the three indicators fires, otherwise 0. Computed column-wise rather than
# with a row-by-row apply; the resulting values are the same.
user_data['potential_phishing'] = (
    (user_data['clicks'] > 10)
    | (user_data['suspicious_downloads'] == 1)
    | (user_data['unusual_time_activity'] == 1)
).astype('int64')

# Display users flagged for potential phishing
flagged_users = user_data[user_data['potential_phishing'] == 1]
print("\nFlagged Users for Potential Phishing Attempts:\n")
print(flagged_users.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/5




[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 92ms/step - accuracy: 0.8409 - loss: 0.4576 - val_accuracy: 0.9417 - val_loss: 0.1948
Epoch 2/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 72ms/step - accuracy: 0.9552 - loss: 0.1524 - val_accuracy: 0.9753 - val_loss: 0.0834
Epoch 3/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 99ms/step - accuracy: 0.9853 - loss: 0.0576 - val_accuracy: 0.9865 - val_loss: 0.0591
Epoch 4/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 73ms/step - accuracy: 0.9923 - loss: 0.0333 - val_accuracy: 0.9888 - val_loss: 0.0566
Epoch 5/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 100ms/step - accuracy: 0.9925 - loss: 0.0308 - val_accuracy: 0.9865 - val_loss: 0.0607
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step

Accuracy: 98.74%

Classification Report:

              precision    recall  f1-score   support

           0       0.99      1.00    