In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
df= pd.read_csv("Phishing_Email.csv")
df = df.dropna()
print(df.isna().sum())

Unnamed: 0    0
Email Text    0
Email Type    0
dtype: int64


In [8]:
email_type_counts = df['Email Type'].value_counts()
print(email_type_counts)

Email Type
Safe Email        11322
Phishing Email     7312
Name: count, dtype: int64


In [9]:
Safe_Email = df[df["Email Type"]== "Safe Email"]
Phishing_Email = df[df["Email Type"]== "Phishing Email"]
Safe_Email = Safe_Email.sample(Phishing_Email.shape[0])

In [10]:
Safe_Email.shape,Phishing_Email.shape

((7312, 3), (7312, 3))

In [12]:
Data= pd.concat([Safe_Email, Phishing_Email], ignore_index = True)
Data.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,1466,marketing support presentation ( mike mcconnel...,Safe Email
1,7999,reduplicative constructions and polarity morav...,Safe Email
2,5281,"\n----- Original Message -----\nFrom: ""John Ha...",Safe Email
3,5729,"conf on maritime terminology dear colleague , ...",Safe Email
4,5976,"URL: http://www.newsisfree.com/click/-1,839012...",Safe Email


In [13]:
Data.tail()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
14619,18635,congratulations you have won ! ! ! pls contact...,Phishing Email
14620,18638,empty,Phishing Email
14621,18639,strong buy alert : monthly newsletter topstock...,Phishing Email
14622,18646,date a lonely housewife always wanted to date ...,Phishing Email
14623,18650,empty,Phishing Email


In [14]:
X = Data["Email Text"].values
y = Data["Email Type"].values

In [23]:
def transform_email_labels(emails):
  """
  Transforms a numpy array of email labels ("Phishing Email" or "Safe Email")
  into a numpy array with 1 for phishing and 0 for safe.

  Args:
      emails: A numpy array of strings representing email labels.

  Returns:
      A numpy array of integers with 1 for phishing and 0 for safe.
  """
  # Create a dictionary mapping labels to their corresponding values (1 for phishing, 0 for safe)
  label_map = {"Phishing Email": 1, "Safe Email": 0}
  
  # Use vectorized string comparison with np.vectorize
  return np.vectorize(lambda x: label_map.get(x))(emails)

transformed_y = transform_email_labels(y)
print(y, transformed_y)

['Safe Email' 'Safe Email' 'Safe Email' ... 'Phishing Email'
 'Phishing Email' 'Phishing Email'] [0 0 0 ... 1 1 1]


In [24]:
# Preprocess the data
tokenizer = Tokenizer(num_words=15000)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
data = pad_sequences(sequences, maxlen=200)

In [30]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(data, transformed_y, test_size = 0.2, random_state=42)

In [32]:
# Build the model
model = models.Sequential()
model.add(layers.Embedding(15000, 128, input_length=200))
model.add(layers.LSTM(64))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)


Epoch 1/10


2024-03-29 17:12:00.639118: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-03-29 17:12:00.639789: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-03-29 17:12:00.640589: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x3166f3e10>

In [47]:
import pickle

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [48]:
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')

 3/92 [..............................] - ETA: 2s - loss: 0.0908 - accuracy: 0.9792 

2024-03-29 23:16:34.965201: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-03-29 23:16:34.965641: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-03-29 23:16:34.966220: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Test accuracy: 0.9582905769348145


In [34]:
model.save('nn_phishing_model.h5')  # Saves the model to a file

In [37]:
from tensorflow.keras.models import load_model

# Load the model from the file
model = load_model('nn_phishing_model.h5')

2024-03-29 22:16:56.101549: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-03-29 22:16:56.101965: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-03-29 22:16:56.102442: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [43]:
test = '''
Dear Customer,

We have detected unusual activity on your account and suspect an unauthorized transaction attempt. As a precaution, we have temporarily suspended your account access.

To verify your identity and reactivate your account, please click on the link below and follow the instructions:

Verify My Account

Failure to complete the verification within 24 hours will result in permanent account suspension.

Thank you for your prompt attention to this matter.

Best regards,
Customer Support Team
'''
# Assuming 'tokenizer' is the Tokenizer instance used during training
# If the tokenizer was not saved, you need to recreate and fit it on the same corpus as before
sequences = tokenizer.texts_to_sequences(test)
some_input_data = pad_sequences(sequences, maxlen=2000)
prediction = model.predict(some_input_data)
print(prediction)

2024-03-29 22:25:43.844811: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-03-29 22:25:43.845227: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-03-29 22:25:43.846249: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

[[0.7230367 ]
 [0.8319383 ]
 [0.8798721 ]
 [0.806934  ]
 [0.7427742 ]
 [0.7230367 ]
 [0.6033756 ]
 [0.66768205]
 [0.7976829 ]
 [0.94923335]
 [0.6276753 ]
 [0.5816568 ]
 [0.8798721 ]
 [0.7427742 ]
 [0.7230367 ]
 [0.7230367 ]
 [0.7230367 ]
 [0.7075385 ]
 [0.8798721 ]
 [0.7230367 ]
 [0.5599422 ]
 [0.806934  ]
 [0.9202582 ]
 [0.8798721 ]
 [0.7230367 ]
 [0.8319383 ]
 [0.8798721 ]
 [0.94923335]
 [0.8798721 ]
 [0.6033756 ]
 [0.94923335]
 [0.8798721 ]
 [0.8319383 ]
 [0.7230367 ]
 [0.66768205]
 [0.94734687]
 [0.66768205]
 [0.7976829 ]
 [0.66768205]
 [0.806934  ]
 [0.4935353 ]
 [0.7230367 ]
 [0.806934  ]
 [0.6033756 ]
 [0.94923335]
 [0.6926522 ]
 [0.9202582 ]
 [0.6926522 ]
 [0.94923335]
 [0.93045795]
 [0.7230367 ]
 [0.6276753 ]
 [0.94734687]
 [0.7230367 ]
 [0.93045795]
 [0.6276753 ]
 [0.66768205]
 [0.7427742 ]
 [0.7230367 ]
 [0.806934  ]
 [0.6033756 ]
 [0.6033756 ]
 [0.6276753 ]
 [0.66768205]
 [0.94734687]
 [0.94923335]
 [0.7230367 ]
 [0.806934  ]
 [0.94734687]
 [0.8319383 ]
 [0.7230367 ]
 [0.79

In [46]:
average_prediction = prediction.mean()
certainties = np.abs(prediction - 0.5) * 2  # Scale the distance from 0.5 to a [0, 1] range
average_certainty = certainties.mean()

# Print the average prediction
print(f"Average prediction probability: {average_prediction:.2f}")
# Print the average certainty
print(f"Average certainty of prediction: {average_certainty:.2f}")

Average prediction probability: 0.76
Average certainty of prediction: 0.52
