In [None]:
# !pip install transformers
# !pip install SentencePiece

# Imports

In [None]:
import pandas as pd
from pathlib import Path
import os
import optuna
import numpy as np 

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Huggingface Transformers
from transformers import RobertaTokenizer, RobertaForSequenceClassification, DebertaTokenizer, DebertaForSequenceClassification
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast, AlbertTokenizer, AlbertForSequenceClassification

# For eval
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Every path is resolved relative to the directory the notebook is run from.
export_dir = Path(os.getcwd())
data_path = Path(export_dir, "data")

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The fine-tuning loop writes one checkpoint per epoch into this folder.
# Create it up front so the first torch.save cannot fail on a missing
# directory after a whole epoch has already been trained.
models_dir = Path(export_dir, 'models')
models_dir.mkdir(parents=True, exist_ok=True)

## Load Ron's data set

In [None]:
# Read the pre-split train / test / eval CSV files from the data folder.
# EmailDataset later reads the 'text_combined' and 'label' columns of each frame.
train_df, test_df, eval_df = (
    pd.read_csv(data_path / csv_name)
    for csv_name in (
        'train_data_only_text_and_labels.csv',
        'test_data_only_text_and_labels.csv',
        'eval_data_only_text_and_labels.csv',
    )
)

In [None]:
class EmailDataset(Dataset):
    """Map-style dataset that tokenizes one e-mail text per item.

    Args:
        dataframe: frame providing a 'text_combined' column (the text to
            classify) and a 'label' column (integer class id).
        tokenizer: callable tokenizer; it is invoked with ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors='pt'`` and must
            return a mapping with 'input_ids' and 'attention_mask' tensors.
        max_length: length every sequence is padded / truncated to.
            Defaults to 512, the value that used to be hard-coded.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        # Missing cells come back from read_csv as float NaN, which the
        # tokenizer cannot process; treat them as empty texts instead.
        self.texts = dataframe['text_combined'].fillna('').astype(str).tolist()
        self.labels = dataframe['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return the tensors for one sample.

        Returns:
            dict with 'input_ids', 'attention_mask' (leading batch axis
            removed) and 'labels' (scalar ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer is called on a single text, so only the leading batch
        # axis should go. squeeze(0) drops exactly that axis, whereas a bare
        # squeeze() would also collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Write your model name here

In [None]:
# Number of passes over the training split made by the fine-tuning loop.
num_epochs = 10

# Which model to fine-tune. The loading cell compares this string verbatim;
# the options are "RoBERTa", "DeBERTa", "distilbert", "ALBERT" and
# "phishing-email-detection".
model_name = "RoBERTa"

# Load Tokenizers and Pre-trained Models

In [None]:
def _freeze_all_but_last_layers(backbone, layers, classifier, n_trainable=2):
    """Leave only the tail of the encoder and the classification head trainable.

    Args:
        backbone: module whose parameters are all frozen first.
        layers: sliceable module container inside ``backbone``; its last
            ``n_trainable`` entries are made trainable again.
        classifier: classification head; all of its parameters stay trainable.
        n_trainable: how many trailing entries of ``layers`` to unfreeze.
    """
    for param in backbone.parameters():
        param.requires_grad = False
    for param in layers[-n_trainable:].parameters():
        param.requires_grad = True
    for param in classifier.parameters():
        param.requires_grad = True


# One entry per supported value of `model_name`:
#   tokenizer_cls / model_cls : Hugging Face classes to instantiate
#   checkpoint                : id passed to from_pretrained
#   parts                     : model -> (backbone to freeze, container whose tail is unfrozen)
#   rename                    : whether `model_name` is replaced by the checkpoint id afterwards
_MODEL_SPECS = {
    "RoBERTa": {
        "tokenizer_cls": RobertaTokenizer,
        "model_cls": RobertaForSequenceClassification,
        "checkpoint": 'roberta-large',
        "parts": lambda m: (m.roberta, m.roberta.encoder.layer),
        "rename": False,
    },
    "DeBERTa": {
        "tokenizer_cls": DebertaTokenizer,
        "model_cls": DebertaForSequenceClassification,
        "checkpoint": 'microsoft/deberta-large',
        "parts": lambda m: (m.deberta, m.deberta.encoder.layer),
        "rename": False,
    },
    "phishing-email-detection": {
        "tokenizer_cls": DistilBertTokenizerFast,
        "model_cls": DistilBertForSequenceClassification,
        "checkpoint": "dima806/phishing-email-detection",
        "parts": lambda m: (m.distilbert, m.distilbert.transformer.layer),
        "rename": True,
    },
    "distilbert": {
        "tokenizer_cls": DistilBertTokenizerFast,
        "model_cls": DistilBertForSequenceClassification,
        "checkpoint": "distilbert-base-uncased",
        "parts": lambda m: (m.distilbert, m.distilbert.transformer.layer),
        "rename": True,
    },
    "ALBERT": {
        "tokenizer_cls": AlbertTokenizer,
        "model_cls": AlbertForSequenceClassification,
        "checkpoint": 'albert-base-v2',
        # NOTE(review): this slices layer *groups*, not individual layers.
        # ALBERT shares weights between layers, so if the checkpoint has fewer
        # than three groups this unfreezes every group - confirm it is intended.
        "parts": lambda m: (m.albert, m.albert.encoder.albert_layer_groups),
        "rename": False,
    },
}

# For the "rename" entries `model_name` ends up holding the checkpoint id, so
# accept that id as well; this keeps the cell re-runnable without first
# re-running the cell that sets `model_name`.
_CHECKPOINT_TO_ALIAS = {
    spec["checkpoint"]: alias for alias, spec in _MODEL_SPECS.items() if spec["rename"]
}
_alias = _CHECKPOINT_TO_ALIAS.get(model_name, model_name)

if _alias not in _MODEL_SPECS:
    # Fail loudly instead of leaving `model` / `tokenizer` undefined (or stale
    # from a previous run) for the cells below.
    raise ValueError(
        f"Unknown model_name {model_name!r}; expected one of {sorted(_MODEL_SPECS)}"
    )

_spec = _MODEL_SPECS[_alias]
tokenizer = _spec["tokenizer_cls"].from_pretrained(_spec["checkpoint"])
model = _spec["model_cls"].from_pretrained(_spec["checkpoint"], num_labels=2).to(device)

# Freeze the backbone, then unfreeze its last two entries and the classifier head.
_backbone, _layers = _spec["parts"](model)
_freeze_all_but_last_layers(_backbone, _layers, model.classifier)

# The two DistilBERT options report the checkpoint id as the model name;
# `model_name` is later interpolated into the checkpoint file names.
if _spec["rename"]:
    model_name = _spec["checkpoint"]

In [None]:
# Wrap each split in an EmailDataset that shares the tokenizer chosen above.
train_dataset, val_dataset, test_dataset = (
    EmailDataset(frame, tokenizer) for frame in (train_df, eval_df, test_df)
)

# Batches of 8 throughout; only the training split is shuffled.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
val_loader, test_loader = (
    DataLoader(dataset=split, shuffle=False, batch_size=8)
    for split in (val_dataset, test_dataset)
)

## Fine-tuning

In [None]:
# Fine-tune `model` for `num_epochs` epochs. After every epoch the validation
# loss and accuracy are printed and the model's state_dict is saved.
criterion = nn.CrossEntropyLoss()
# Hand only the parameters that still require gradients to the optimizer.
optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=2e-5)

# Make sure the checkpoint folder exists before training starts; otherwise
# torch.save would fail only after the first full epoch.
models_dir.mkdir(parents=True, exist_ok=True)
# `model_name` may be a hub-style id containing "/", which would be read as a
# sub-directory inside the file name; flatten it for use in the path.
checkpoint_tag = model_name.replace('/', '_')

for epoch in range(num_epochs):
    # ---- Training phase ----
    model.train()
    total_train_loss = 0

    for step, batch in enumerate(train_loader, start=1):
        optimizer.zero_grad()  # Clear gradients left over from the previous step

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        # Lightweight progress indicator: print the batch count every 50 batches.
        if step % 50 == 0:
            print(step)

    avg_train_loss = total_train_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}')

    # ---- Validation phase ----
    model.eval()
    total_val_loss = 0
    val_labels = []
    val_preds = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            total_val_loss += loss.item()

            # Predicted class = index of the largest logit in each row.
            preds = outputs.logits.argmax(dim=1)
            val_labels.extend(labels.cpu().tolist())
            val_preds.extend(preds.cpu().tolist())

    avg_val_loss = total_val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}')

    # Validation accuracy = fraction of predictions that match the labels.
    val_accuracy = (np.array(val_labels) == np.array(val_preds)).mean()
    print(f'Epoch {epoch + 1}/{num_epochs}, Validation Accuracy: {val_accuracy:.4f}')

    # Save the weights after each epoch. The epoch index in the file name is
    # zero-based, while the printed messages above are one-based.
    model_save_path = Path(models_dir, f'finetune_{checkpoint_tag}_epoch_{epoch}_acc_{val_accuracy:.4f}.pt')
    torch.save(model.state_dict(), model_save_path)