In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df= pd.read_csv("Phishing_Email.csv")
df = df.dropna()
print(df.isna().sum())

Unnamed: 0    0
Email Text    0
Email Type    0
dtype: int64


In [3]:
email_type_counts = df['Email Type'].value_counts()
print(email_type_counts)

Email Type
Safe Email        11326
Phishing Email     7312
Name: count, dtype: int64


In [4]:
Safe_Email = df[df["Email Type"]== "Safe Email"]
Phishing_Email = df[df["Email Type"]== "Phishing Email"]
# Safe_Email = Safe_Email.sample(Phishing_Email.shape[0])

In [5]:
Safe_Email.shape,Phishing_Email.shape

((11326, 3), (7312, 3))

In [6]:
Data= pd.concat([Safe_Email, Phishing_Email], ignore_index = True)
Data.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0.0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1.0,the other side of * galicismos * * galicismo *...,Safe Email
2,2.0,re : equistar deal tickets are you still avail...,Safe Email
3,5.0,global risk management operations sally congra...,Safe Email
4,6.0,"On Sun, Aug 11, 2002 at 11:17:47AM +0100, wint...",Safe Email


In [7]:
Data.tail()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
18633,18635.0,congratulations you have won ! ! ! pls contact...,Phishing Email
18634,18638.0,empty,Phishing Email
18635,18639.0,strong buy alert : monthly newsletter topstock...,Phishing Email
18636,18646.0,date a lonely housewife always wanted to date ...,Phishing Email
18637,18650.0,empty,Phishing Email


In [8]:
X = Data["Email Text"].values
y = Data["Email Type"].values

In [9]:
def transform_email_labels(emails):
  """
  Transforms a numpy array of email labels ("Phishing Email" or "Safe Email")
  into a numpy array with 1 for phishing and 0 for safe.

  Args:
      emails: A numpy array of strings representing email labels.

  Returns:
      A numpy array of integers with 1 for phishing and 0 for safe.
  """
  # Create a dictionary mapping labels to their corresponding values (1 for phishing, 0 for safe)
  label_map = {"Phishing Email": 1, "Safe Email": 0}
  
  # Use vectorized string comparison with np.vectorize
  return np.vectorize(lambda x: label_map.get(x))(emails)

transformed_y = transform_email_labels(y)
print(y, transformed_y)

['Safe Email' 'Safe Email' 'Safe Email' ... 'Phishing Email'
 'Phishing Email' 'Phishing Email'] [0 0 0 ... 1 1 1]


In [10]:
# Preprocess the data
tokenizer = Tokenizer(num_words=15000)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
data = pad_sequences(sequences, maxlen=200)

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(data, transformed_y, test_size = 0.2, random_state=42)

In [13]:
# Build the model
model = models.Sequential()
model.add(layers.Embedding(15000, 128))
model.add(layers.LSTM(64))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)


Epoch 1/10
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 71ms/step - accuracy: 0.8563 - loss: 0.3090
Epoch 2/10
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 70ms/step - accuracy: 0.9762 - loss: 0.0618
Epoch 3/10
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 72ms/step - accuracy: 0.9542 - loss: 0.1135
Epoch 4/10
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 70ms/step - accuracy: 0.9829 - loss: 0.0394
Epoch 5/10
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 70ms/step - accuracy: 0.9862 - loss: 0.0296
Epoch 6/10
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 71ms/step - accuracy: 0.9857 - loss: 0.0320
Epoch 7/10
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 68ms/step - accuracy: 0.9858 - loss: 0.0344
Epoch 8/10
[1m466/466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 67ms/step - accuracy: 0.9876 - loss: 0.0245
Epoch 9/10
[1m466/466[

<keras.src.callbacks.history.History at 0x308228b10>

In [14]:
import pickle

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')

[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.9598 - loss: 0.1289
Test accuracy: 0.9629828333854675


In [17]:
model.save('nn_phishing_model.keras')  # Saves the model to a file

In [19]:
from tensorflow.keras.models import load_model

# Load the model from the file
model = load_model('nn_phishing_model.keras')

  trackable.load_own_variables(weights_store.get(inner_path))


In [20]:
test = '''
Dear Customer,

We have detected unusual activity on your account and suspect an unauthorized transaction attempt. As a precaution, we have temporarily suspended your account access.

To verify your identity and reactivate your account, please click on the link below and follow the instructions:

Verify My Account

Failure to complete the verification within 24 hours will result in permanent account suspension.

Thank you for your prompt attention to this matter.

Best regards,
Customer Support Team
'''
# Assuming 'tokenizer' is the Tokenizer instance used during training
# If the tokenizer was not saved, you need to recreate and fit it on the same corpus as before
sequences = tokenizer.texts_to_sequences(test)
some_input_data = pad_sequences(sequences, maxlen=2000)
prediction = model.predict(some_input_data)
print(prediction)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 183ms/step
[[0.62613124]
 [0.7042114 ]
 [0.85430413]
 [0.56687206]
 [0.87875247]
 [0.62613124]
 [0.23160928]
 [0.74875367]
 [0.8094021 ]
 [0.92900205]
 [0.46629503]
 [0.35074914]
 [0.85430413]
 [0.87875247]
 [0.62613124]
 [0.62613124]
 [0.62613124]
 [0.6401479 ]
 [0.85430413]
 [0.62613124]
 [0.34294665]
 [0.56687206]
 [0.9163223 ]
 [0.85430413]
 [0.62613124]
 [0.7042114 ]
 [0.85430413]
 [0.92900205]
 [0.85430413]
 [0.23160928]
 [0.92900205]
 [0.85430413]
 [0.7042114 ]
 [0.62613124]
 [0.74875367]
 [0.7880819 ]
 [0.74875367]
 [0.8094021 ]
 [0.74875367]
 [0.56687206]
 [0.62601477]
 [0.62613124]
 [0.56687206]
 [0.23160928]
 [0.92900205]
 [0.5086969 ]
 [0.9163223 ]
 [0.5086969 ]
 [0.92900205]
 [0.9206246 ]
 [0.62613124]
 [0.46629503]
 [0.7880819 ]
 [0.62613124]
 [0.9206246 ]
 [0.46629503]
 [0.74875367]
 [0.87875247]
 [0.62613124]
 [0.56687206]
 [0.23160928]
 [0.23160928]
 [0.46629503]
 [0.74875367]
 [0.7880819 ]
 [0.92900205]
 

In [21]:
average_prediction = prediction.mean()
certainties = np.abs(prediction - 0.5) * 2  # Scale the distance from 0.5 to a [0, 1] range
average_certainty = certainties.mean()

# Print the average prediction
print(f"Average prediction probability: {average_prediction:.2f}")
# Print the average certainty
print(f"Average certainty of prediction: {average_certainty:.2f}")

Average prediction probability: 0.67
Average certainty of prediction: 0.45
