In [18]:
import csv
import os

import numpy as np
import pandas as pd
from scipy.sparse import load_npz, save_npz
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from unidecode import unidecode

"""
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf
CUDA_VISIBLE_DEVICES=""
tf.config.run_functions_eagerly(True)
"""
# Set the csv module's maximum field size; the call returns the previous limit.
csv.field_size_limit(100_000_000)

100000000

In [19]:
def convert_to_ascii(text):
    """Transliterate ``text`` to ASCII with ``unidecode`` and report what changed.

    Args:
        text: The string to transliterate.

    Returns:
        A ``(converted_text, deletions, conversions)`` tuple where

        * ``converted_text`` is ``unidecode(text)``;
        * ``deletions`` is the number of *distinct* characters of ``text``
          that do not occur anywhere in ``converted_text``;
        * ``conversions`` is the number of characters of ``text`` (repeats
          included) that ``unidecode`` maps to something other than
          themselves.
    """
    converted_text = unidecode(text)

    distinct_chars = set(text)
    deletions = len(distinct_chars - set(converted_text))

    # Decide once per distinct character whether unidecode alters it, then
    # count occurrences. Comparing ``text`` and ``converted_text`` position by
    # position is unreliable: a replacement can be longer or shorter than one
    # character, which shifts (and truncates) everything that follows it.
    altered_chars = {ch for ch in distinct_chars if unidecode(ch) != ch}
    conversions = sum(1 for ch in text if ch in altered_chars)

    return converted_text, deletions, conversions

def parse_csv(filename):
    """Load the email CSV at ``filename`` into a two-column DataFrame.

    The first row is treated as a header and skipped. In every following row,
    column 1 is read as the email body and column 2 as the label. Rows with
    fewer than three columns (including blank lines) are skipped.

    Args:
        filename: Path of the UTF-8 encoded, comma-separated file to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``'Email Text'`` (the body
        after ``convert_to_ascii``) and ``'Email Type'`` (``0`` when the label
        equals ``"Safe Email"``, ``1`` for anything else). The columns are
        present even when no usable row was found.
    """
    records = []
    with open(filename, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # Skip the header row; tolerate a completely empty file
        for row in reader:
            if len(row) < 3:
                # csv.reader yields [] for blank lines, and truncated rows
                # lack the text/label columns; neither can be used.
                continue
            # NOTE(review): csv.reader already un-doubles quotes inside quoted
            # fields, so this only collapses literal pairs of quotes that are
            # part of the body itself - confirm that this is intended.
            email_text = row[1].replace('""', '"')
            converted_text, _, _ = convert_to_ascii(email_text)
            email_type = 0 if row[2] == "Safe Email" else 1
            records.append({'Email Text': converted_text, 'Email Type': email_type})

    return pd.DataFrame(records, columns=['Email Text', 'Email Type'])

# Parse the dataset and report how many emails were loaded
filename = 'data/Phishing_Email.csv'
parsed_data = parse_csv(filename)

print(parsed_data.shape[0])

18650


In [3]:
# 0-based row position (used with .iloc) of the email to inspect.
ind = 6293

In [4]:
# Show the text and label of the row at position `ind`, then step to the next
# row so that re-running this cell walks through the dataset.
selected_row = parsed_data.iloc[ind]
print(selected_row['Email Text'])
print(selected_row['Email Type'])
print(ind)
ind = ind + 1

i? 1/2@ 
      
A A A 
A  i? 1/2i? 1/2i? 1/2~i? 1/2Wi? 1/2i? 1/2i? 1/2G i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2
A A  
      i? 1/2mi? 1/2@A A A  i? 1/2Wi? 1/2Gi? 1/2i? 1/2i? 1/2i? 1/2i? 1/2gi? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2Wi? 1/2Ai? 1/2i? 1/2i? 1/2i? 1/2i? 1/2gi? 1/2i? 1/2i? 1/2i? 1/2A A  i? 1/2i? 1/2A A A  i? 1/2@i? 1/2}i? 1/2G
A A  
      i? 1/2i? 1/2i? 1/2vi? 1/2qi? 1/2i? 1/2i? 1/2G
A A  i? 1/2i? 1/2i? 1/2qi? 1/2qi? 1/2i? 1/2i? 1/2G
A A  i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2qi? 1/2i? 1/2i? 1/2G
A A  
      i? 1/2qi? 1/2li? 1/2li? 1/2i? 1/2i? 1/2G
A A  
      i? 1/2i? 1/2i? 1/2@i? 1/2si? 1/2i? 1/2i? 1/2G
A A  
      i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2hi? 1/2i? 1/2i? 1/2Ki? 1/2Oi? 1/2i? 1/2i? 1/2~i? 1/2i? 1/2i? 1/2Ti? 1/2qi? 1/2li? 1/2i? 1/2A  i? 1/2i? 1/2i? 1/2@i? 1/2@i? 1/2i? 1/2i? 1/2Gi? 1/2i? 1/2i? 1/2i? 1/2i? 1/2Hi? 1/2ui? 1/2i? 1/2i? 1/2ti? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2i? 1/2~i? 1/2Bi? 1/2O150i? 1/2i? 1/2i? 1

In [20]:
# Upper bound on the TF-IDF vocabulary size
max_features = 2000

# Inputs are the email bodies (forced to str); targets are the 0/1 email types
X = parsed_data['Email Text'].astype(str)
y = parsed_data['Email Type'].to_numpy()

# Hold out 20% of the rows with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the vectorizer on the training split only, then apply it to both splits
tfidf_vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [6]:
# Save the TF-IDF transformed data to a file.
# The writers below do not create missing directories, so make sure the
# target folder exists first (a no-op when it is already there).
os.makedirs('trained_weights/tfidf', exist_ok=True)
save_npz(f'trained_weights/tfidf/X_train_tfidf_{max_features}.npz', X_train_tfidf)
save_npz(f'trained_weights/tfidf/X_test_tfidf_{max_features}.npz', X_test_tfidf)
np.save(f'trained_weights/tfidf/y_train_{max_features}.npy', y_train)
np.save(f'trained_weights/tfidf/y_test_{max_features}.npy', y_test)

In [4]:
# Reload the TF-IDF matrices from disk. The paths mirror the ones the save
# cell writes (trained_weights/tfidf/..._{max_features}.npz); the previous
# paths pointed at files this notebook never produces.
loaded_X_train_tfidf = load_npz(f'trained_weights/tfidf/X_train_tfidf_{max_features}.npz')
loaded_X_test_tfidf = load_npz(f'trained_weights/tfidf/X_test_tfidf_{max_features}.npz')

In [7]:
# Column-wise mean of the training TF-IDF matrix, flattened to one value per term
feature_names = tfidf_vectorizer.get_feature_names_out()
avg_tfidf_values = X_train_tfidf.mean(axis=0).A1
top_n = 20

# Print the top N features and their average TF-IDF values
sorted_features = list(zip(feature_names, avg_tfidf_values))
sorted_features.sort(key=lambda pair: pair[1], reverse=True)
print(f"\nTop {top_n} features based on average TF-IDF values:")
for feature, avg_tfidf in sorted_features[:top_n]:
    print(f"{feature}: {avg_tfidf}")

# Print the bottom N features (ascending order of average TF-IDF value)
sorted_features = list(zip(feature_names, avg_tfidf_values))
sorted_features.sort(key=lambda pair: pair[1])
print(f"\nBottom {top_n} features based on average TF-IDF values:")
for feature, avg_tfidf in sorted_features[:top_n]:
    print(f"{feature}: {avg_tfidf}")


Top 20 features based on average TF-IDF values:
com: 0.06505584998819713
http: 0.0601437468636956
www: 0.03993296759201532
enron: 0.038111761674932465
list: 0.03427580846656208
click: 0.032517164075461624
email: 0.031894832506998275
net: 0.03175136419248079
new: 0.030216339953742174
linux: 0.02818042411875522
like: 0.0277816678066614
time: 0.02763379952372797
free: 0.027149882408005806
just: 0.027050625077331653
2002: 0.02576290213794713
mail: 0.025564054920219664
information: 0.02551702901314046
00: 0.02517331154640784
know: 0.024393181139255017
10: 0.024383400461901912

Bottom 20 features based on average TF-IDF values:
3d: 0.001416883846635995
2i: 0.0023503930726647237
submissions: 0.0024443239585678084
semantics: 0.0026811830526685307
submission: 0.0028400696987123577
abstract: 0.0028739118332935503
discourse: 0.003016307205766285
york: 0.003427725352473109
abstracts: 0.0034375680165525902
session: 0.0034661635665790263
acquisition: 0.003481078637361065
speakers: 0.003547384173370

In [24]:
# Number of principal components to keep (adjust as needed)
n_components = 300

# Fit PCA on the densified training matrix and project both splits with it
pca = PCA(random_state=42, n_components=n_components)
pca_result = pca.fit_transform(X_train_tfidf.toarray())
pca_test = pca.transform(X_test_tfidf.toarray())

# Report the projected shapes and the container type of the result
for summary in (pca_result.shape, pca_test.shape, type(pca_result)):
    print(summary)

In [27]:
# Persist the PCA projections together with their labels.
# np.save does not create missing directories, so make sure the target folder
# exists first (a no-op when it is already there).
os.makedirs('trained_weights/tfidf', exist_ok=True)
np.save('trained_weights/tfidf/X_train_tfidf_pca.npy', pca_result)
np.save('trained_weights/tfidf/X_test_tfidf_pca.npy', pca_test)
np.save('trained_weights/tfidf/y_train_pca.npy', y_train)
np.save('trained_weights/tfidf/y_test_pca.npy', y_test)

In [11]:
# Sanity check: feature-matrix shape and label count for each split
for split_features, split_labels in ((X_train_tfidf, y_train), (X_test_tfidf, y_test)):
    print(split_features.shape)
    print(len(split_labels))

# Peek at two training labels
print(y_train[0])
print(y_train[2])

(14920, 300)
14920
(3730, 300)
3730
0
0


In [20]:
# NOTE(review): Sequential and Dense are only imported inside the triple-quoted
# (i.e. disabled) block of the first cell, so this cell raises NameError on a
# fresh kernel until those imports are re-enabled.
# NOTE(review): the recorded run of this cell died with a GPU "JIT compilation
# failed" error; if CPU-only training is intended, the GPU presumably has to be
# hidden before TensorFlow initialises - confirm.

# Dense copies of the sparse TF-IDF matrices for Keras
X_train_tfidf_train = X_train_tfidf.toarray()
X_test_tfidf_train = X_test_tfidf.toarray()

print(X_train_tfidf_train.shape)
print(y_train.shape)
print(X_train_tfidf.shape)  # same shape as the dense copy, without densifying again

print(type(X_train_tfidf_train))
print(type(y_train))
print(type(y_train[0]))

# Labels as int32 for Keras
y_train = y_train.astype('int32')
y_test = y_test.astype('int32')

# Build a fully connected feedforward neural network
model = Sequential()
model.add(Dense(128, input_shape=(X_train_tfidf_train.shape[1],), activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Single sigmoid unit for the binary label

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()  # summary() prints itself; wrapping it in print() adds a stray "None"

# Train the model
# NOTE(review): the held-out test split doubles as the validation data here.
model.fit(X_train_tfidf_train, y_train, epochs=10, batch_size=64, validation_data=(X_test_tfidf_train, y_test))

(14920, 300)
(14920,)
(14920, 300)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.int32'>
Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_40 (Dense)            (None, 128)               38528     
                                                                 
 dense_41 (Dense)            (None, 128)               16512     
                                                                 
 dense_42 (Dense)            (None, 128)               16512     
                                                                 
 dense_43 (Dense)            (None, 1)                 129       
                                                                 
Total params: 71681 (280.00 KB)
Trainable params: 71681 (280.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10


2023-11-23 02:46:10.655099: W tensorflow/core/framework/op_kernel.cc:1827] UNKNOWN: JIT compilation failed.


UnknownError: Exception encountered when calling layer 'dense_43' (type Dense).

{{function_node __wrapped__Sigmoid_device_/job:localhost/replica:0/task:0/device:GPU:0}} JIT compilation failed. [Op:Sigmoid] name: 

Call arguments received by layer 'dense_43' (type Dense):
  • inputs=tf.Tensor(shape=(64, 128), dtype=float32)

In [None]:
# NOTE(review): models, layers, Callback and tf are only imported inside the
# triple-quoted (i.e. disabled) block of the first cell; this cell raises
# NameError on a fresh kernel until those imports are re-enabled.

# Conv1D consumes (samples, steps, channels) input, so add a trailing channel
# axis to the dense TF-IDF arrays and take the step count from the data
# instead of hard-coding it.
X_train_cnn = X_train_tfidf_train[..., np.newaxis]
X_test_cnn = X_test_tfidf_train[..., np.newaxis]
n_features = X_train_cnn.shape[1]

# Build the CNN model. The layers inspected by the callback get explicit
# names, because auto-generated names (e.g. "dense_40") depend on how many
# layers were created earlier in the session.
model = models.Sequential()
model.add(layers.Conv1D(32, 3, activation='relu', input_shape=(n_features, 1), name='conv1d'))
model.add(layers.MaxPooling1D(2))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu', name='dense_hidden'))
model.add(layers.Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()  # summary() prints itself; wrapping it in print() adds a stray "None"


class IntermediateOutputCallback(Callback):
    """Print the outputs of selected layers at the end of every epoch.

    Args:
        layer_names: Names of the layers of the model being trained whose
            outputs should be printed.
        inputs: Array that is fed through the model to produce those outputs.
            When ``None`` (the default) nothing is printed.
    """

    def __init__(self, layer_names, inputs=None):
        super().__init__()
        self.layer_names = layer_names
        self.inputs = inputs
        self._probes = None  # layer name -> sub-model ending at that layer

    def _build_probes(self):
        """Create one sub-model per requested layer of the model being trained."""
        return {
            name: tf.keras.Model(inputs=self.model.input, outputs=self.model.get_layer(name).output)
            for name in self.layer_names
        }

    def on_epoch_end(self, epoch, logs=None):
        if self.inputs is None:
            return
        if self._probes is None:
            # Built lazily and only once, from the model this callback is
            # attached to rather than from a module-level variable.
            self._probes = self._build_probes()

        for name in self.layer_names:
            intermediate_output = self._probes[name].predict(self.inputs)
            print(f"Intermediate Output of Layer '{name}' during epoch {epoch + 1}: {intermediate_output}")


# Instantiate the callback with the names of layers for which you want to inspect outputs.
# NOTE(review): 'dense_hidden' replaces the former 'dense36', which does not
# match any layer of this model; presumably the hidden dense layer was meant.
callback = IntermediateOutputCallback(layer_names=['conv1d', 'dense_hidden'], inputs=X_train_cnn)

# Train the model with the custom callback
model.fit(X_train_cnn, y_train, epochs=5, batch_size=64, validation_split=0.2, callbacks=[callback])

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test_cnn, y_test)
print(f'Test accuracy: {test_acc}')