In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Load the labelled e-mail corpus and discard every row that has a missing value.
df = pd.read_csv("Phishing_Email.csv").dropna()
# Confirm the cleanup: the per-column null count printed here should be all zeros.
print(df.isna().sum())

Unnamed: 0    0
Email Text    0
Email Type    0
dtype: int64


In [8]:
# Tally how many messages carry each label to inspect the class balance.
email_type_counts = df.loc[:, "Email Type"].value_counts()
print(email_type_counts)

Email Type
Safe Email        11322
Phishing Email     7312
Name: count, dtype: int64


In [9]:
# Split the corpus by label.
Safe_Email = df[df["Email Type"] == "Safe Email"]
Phishing_Email = df[df["Email Type"] == "Phishing Email"]
# Undersample the safe class down to the phishing row count so both classes
# contribute the same number of rows. The fixed seed makes the drawn subset
# (and every downstream result) identical across kernel restarts.
Safe_Email = Safe_Email.sample(n=len(Phishing_Email), random_state=42)

In [10]:
# Display the (rows, columns) shape of each class subset to confirm they match in size.
tuple(subset.shape for subset in (Safe_Email, Phishing_Email))

((7312, 3), (7312, 3))

In [12]:
# Stack the two class subsets (safe rows first, then phishing rows) into a
# single frame and renumber the index from zero.
Data = pd.concat(objs=[Safe_Email, Phishing_Email], axis=0, ignore_index=True)
Data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,1466,marketing support presentation ( mike mcconnel...,Safe Email
1,7999,reduplicative constructions and polarity morav...,Safe Email
2,5281,"\n----- Original Message -----\nFrom: ""John Ha...",Safe Email
3,5729,"conf on maritime terminology dear colleague , ...",Safe Email
4,5976,"URL: http://www.newsisfree.com/click/-1,839012...",Safe Email


In [13]:
# Preview the last five rows of the combined frame (the phishing end of the concat).
Data.tail(n=5)

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
14619,18635,congratulations you have won ! ! ! pls contact...,Phishing Email
14620,18638,empty,Phishing Email
14621,18639,strong buy alert : monthly newsletter topstock...,Phishing Email
14622,18646,date a lonely housewife always wanted to date ...,Phishing Email
14623,18650,empty,Phishing Email


In [14]:
# Pull the raw message bodies (features) and their string labels (targets)
# out of the combined frame as arrays.
X, y = (Data[column].values for column in ("Email Text", "Email Type"))

In [23]:
def transform_email_labels(emails):
  """
  Encode e-mail labels as integers: 1 for "Phishing Email", 0 for "Safe Email".

  Args:
      emails: Array-like of label strings. Any shape is accepted, including
          an empty input.

  Returns:
      A numpy integer array with the same shape as ``emails``, holding 1 for
      phishing and 0 for safe.

  Raises:
      ValueError: If any label is neither "Phishing Email" nor "Safe Email".
  """
  # Mapping from label text to its integer class (1 for phishing, 0 for safe)
  label_map = {"Phishing Email": 1, "Safe Email": 0}

  # Work on an object array so elements are compared as plain Python values,
  # and flatten it so inputs of any shape are handled by one pass.
  labels = np.asarray(emails, dtype=object)
  flat_labels = labels.ravel()

  # Fail loudly on unexpected labels instead of silently emitting None.
  unknown = {label for label in flat_labels if label not in label_map}
  if unknown:
    raise ValueError(f"Unknown email label(s): {sorted(map(repr, unknown))}")

  # An explicit dtype and count keep this well-defined for empty input too.
  encoded = np.fromiter(
    (label_map[label] for label in flat_labels),
    dtype=int,
    count=flat_labels.size,
  )
  return encoded.reshape(labels.shape)

# Encode the string labels as integers, then print both arrays side by side
# to eyeball the mapping.
transformed_y = transform_email_labels(emails=y)
print(y, transformed_y)

['Safe Email' 'Safe Email' 'Safe Email' ... 'Phishing Email'
 'Phishing Email' 'Phishing Email'] [0 0 0 ... 1 1 1]


In [24]:
# Preprocess the data
# NOTE(review): the tokenizer is fitted on every text in X, i.e. before the
# train/test split performed in a later cell, so the vocabulary is built from
# test-set e-mails as well. Consider fitting on the training texts only.
tokenizer = Tokenizer(num_words=15000)
tokenizer.fit_on_texts(texts=X)
# Turn each e-mail into a list of integer word ids.
sequences = tokenizer.texts_to_sequences(texts=X)
# Force every sequence to exactly 200 ids; "pre" padding/truncation are the
# Keras defaults, spelled out here for clarity.
data = pad_sequences(sequences=sequences, maxlen=200, padding="pre", truncating="pre")

In [30]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the padded sequences for testing; the fixed seed makes the
# split itself repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    transformed_y,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Build the model: embedding -> single LSTM layer -> sigmoid output for the
# binary phishing/safe decision. The embedding's 15000 / 200 arguments repeat
# the num_words / maxlen values used in the preprocessing cell.
model = models.Sequential([
    layers.Embedding(input_dim=15000, output_dim=128, input_length=200),
    layers.LSTM(units=64),
    layers.Dense(units=1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train the model (the returned History object is the cell's displayed value)
model.fit(x=X_train, y=y_train, batch_size=32, epochs=10)


Epoch 1/10


2024-03-29 17:12:00.639118: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-03-29 17:12:00.639789: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-03-29 17:12:00.640589: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x3166f3e10>

In [33]:
# Evaluate the model on the held-out split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
test_loss, test_acc = model.evaluate(x=X_test, y=y_test)
print(f'Test accuracy: {test_acc}')

 3/92 [..............................] - ETA: 2s - loss: 0.0908 - accuracy: 0.9792 

2024-03-29 17:19:26.632801: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-03-29 17:19:26.633277: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-03-29 17:19:26.634148: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Test accuracy: 0.9582905769348145
