In [None]:
!pip install transformers datasets evaluate accelerate

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
import evaluate

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TFAutoModelForSequenceClassification
from transformers import pipeline

In [None]:
# Load the phishing-email corpus and flatten its single 'train' split into a DataFrame.
ds = load_dataset("zefang-liu/phishing-email-dataset")
df = (
    ds['train']
    .to_pandas()
    .rename(columns={'Email Text': 'text', 'Email Type': 'label'})
    .dropna()
    .drop('Unnamed: 0', axis=1)
)
# Encode the string classes as integers. The result is assigned back explicitly:
# a chained `df[col].replace(..., inplace=True)` operates on a temporary under
# pandas copy-on-write and would leave `df` unchanged. `astype` fails loudly if
# a label outside the two known classes shows up (it would map to NaN).
df['label'] = df['label'].map({'Safe Email': 0, 'Phishing Email': 1}).astype('int64')

# Hold out 600 e-mails for the final evaluation. The seed makes the held-out set
# identical on every run, so reported test metrics are reproducible.
test_df = df.sample(n=600, random_state=42)
train_and_val_df = df.drop(test_df.index)

# 80/20 train/validation split of the remaining rows, converted to `datasets.Dataset`
# objects without carrying the pandas index along as a column.
train, val = train_test_split(train_and_val_df, test_size=0.2, random_state=42)
train, val = Dataset.from_pandas(train, preserve_index=False), Dataset.from_pandas(val, preserve_index=False)

In [None]:
# Tokenizer for the same DistilBERT checkpoint that is fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [4]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a single example or of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry that is either a list of values
            (as passed by a batched ``Dataset.map``) or a single value. Every
            value is coerced with ``str`` before tokenization. The mapping itself
            is left unmodified.

    Returns:
        The tokenizer output for the text(s), padded and truncated to 512 tokens,
        holding plain Python lists.
    """
    raw = examples["text"]
    texts = [str(item) for item in raw] if isinstance(raw, list) else str(raw)
    # No `return_tensors` here: asking for 'pt' tensors would require PyTorch to be
    # installed although the rest of this notebook trains with TensorFlow.
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512)

In [None]:
# Run the tokenizer over both splits in batched mode (train first, then validation).
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True) for split in (train, val)
)

In [6]:
# Collator that pads each batch with the tokenizer and returns TensorFlow tensors;
# it is handed to `model.prepare_tf_dataset` as `collate_fn`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [None]:
# Accuracy metric object from `evaluate`; `compute_metrics` calls its `.compute(...)`.
accuracy = evaluate.load("accuracy")

In [8]:
def compute_metrics(eval_pred):
    """Score a batch of model outputs with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predictions are reduced to
            class indices by taking the argmax along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes versus ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [9]:
# Class index -> human-readable label, plus the inverse lookup derived from it.
id2label = {0: "Safe Email", 1: "Phishing Email"}
label2id = {name: index for index, name in id2label.items()}

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login for this notebook session.
# NOTE(review): presumably only needed for pushing the model to the Hub — confirm.
notebook_login()

In [11]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
# Number of training epochs. The learning-rate schedule below is sized for exactly
# this many epochs, so it has to agree with the epoch count used for `model.fit`
# (3); a larger value here would stop training before the schedule finishes decaying.
num_epochs = 3
# Full batches per epoch (floor division: a trailing partial batch is not counted).
batches_per_epoch = len(tokenized_train) // batch_size
total_train_steps = batches_per_epoch * num_epochs
# Optimizer plus its learning-rate schedule: initial LR 2e-5, no warm-up steps.
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [None]:
# Load the DistilBERT checkpoint as a TensorFlow sequence classifier with two
# output classes, passing the label-name mappings defined earlier in the notebook.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

In [13]:
# Wrap the tokenized splits as TensorFlow datasets. Both use `batch_size` — the
# same value the learning-rate schedule's step count was computed from — so the
# schedule and the actual number of batches cannot drift apart.
tf_train_set = model.prepare_tf_dataset(
    tokenized_train,
    shuffle=True,  # reshuffle the training examples
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_val,
    shuffle=False,  # keep validation order fixed
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [14]:
import tensorflow as tf

# Configure the model for training with the optimizer created above.
# NOTE(review): no `loss` is passed — this presumably relies on the Transformers
# Keras model falling back to its built-in task loss; confirm.
model.compile(optimizer=optimizer)

In [15]:
from transformers.keras_callbacks import KerasMetricCallback

# Keras callback that evaluates `compute_metrics` (accuracy) on the validation set.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Callback for uploading the model and tokenizer to the Hugging Face Hub, using
# "modelname" as its output directory.
# NOTE(review): this callback is created but is not part of the `callbacks` list
# passed to `model.fit` in this notebook, so nothing is pushed — confirm intent.
push_to_hub_callback = PushToHubCallback(
    output_dir="modelname",
    tokenizer=tokenizer,
)

In [16]:
# Callbacks used during training: only the validation-accuracy callback.
callbacks = [metric_callback]

In [None]:
# Train for `num_epochs` epochs — the value the learning-rate schedule's total step
# count was derived from — instead of a separate literal that can disagree with it.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=callbacks)

In [None]:
# Quick sanity check: classify one hand-written, benign-looking e-mail.
text = "Dear Mr. Smith, please find attached the protocol of our meeting today. Best regards, John Walker"

# Wrap the fine-tuned model and its tokenizer in a text-classification pipeline;
# the last expression displays the pipeline's prediction for `text`.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
classifier(text)

In [None]:
from tqdm.auto import tqdm
tqdm.pandas()  # enables `Series.progress_apply` with a progress bar

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

def classify_email(text: str) -> str:
    """Return the predicted label name for a single e-mail.

    Args:
        text: Raw e-mail text; coerced with ``str`` before classification.

    Returns:
        The ``"label"`` field of the pipeline's first result.
    """
    # Truncate to 512 tokens, the same limit used when tokenizing the training
    # data; without it, long e-mails are passed to the model at full length.
    response = classifier(str(text), truncation=True, max_length=512)

    return response[0]["label"]

# Classify every held-out e-mail and store the predicted label name per row.
test_df["Prediction"] = test_df["text"].progress_apply(classify_email)

In [None]:
# Convert predicted label names to their integer ids so they are comparable with `label`.
test_df = test_df.replace({"Prediction": label2id})

# Confusion-matrix counts, with class 1 (phishing) as the positive class.
TP = ((test_df["label"] == 1) & (test_df["Prediction"] == 1)).sum()
TN = ((test_df["label"] == 0) & (test_df["Prediction"] == 0)).sum()
FP = ((test_df["label"] == 0) & (test_df["Prediction"] == 1)).sum()
FN = ((test_df["label"] == 1) & (test_df["Prediction"] == 0)).sum()

# Guard each ratio against an empty denominator (e.g. no positive predictions),
# reporting 0.0 instead of dividing by zero.
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
# Stored as `test_accuracy` so the `accuracy` metric object that `compute_metrics`
# relies on is not overwritten; rebinding it to a float breaks any later training run.
test_accuracy = (test_df["label"] == test_df["Prediction"]).sum() / len(test_df)

print(f"Accuracy: {test_accuracy:.2%}")
print(f"precision: {precision:.2%}")
print(f"recall: {recall:.2%}")
print(f"f1_score: {f1_score}")