# Installs
torch==1.9.0
transformers==4.9.2


In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd


class URLDataset(Dataset):
    def __init__(self, urls, labels, tokenizer, max_length):
        self.urls = urls
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.urls)

    def __getitem__(self, idx):
        url = str(self.urls[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            url,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }



In [2]:
torch.cuda.is_available()
torch.cuda.device(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
# training parameters
MAX_LENGTH = 128
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 3

# pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4).to(device)


urls = pd.read_csv("datasets/malicious_phish.csv")['url']
labels = pd.read_csv("datasets/feature_updated_dataset_y.csv")['type_val']

df_combined = pd.concat([urls, labels], axis=1)
df_sampled = df_combined.sample(n=100000)

train_urls, val_urls, train_labels, val_labels = train_test_split(df_sampled['url'].values, df_sampled['type_val'].values, test_size=0.3)

train_dataset = URLDataset(train_urls, train_labels, tokenizer, MAX_LENGTH)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = URLDataset(val_urls, val_labels, tokenizer, MAX_LENGTH)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
batchNo = 0
# Training loop
for epoch in range(EPOCHS):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Evaluation on validation set
    model.eval()
    val_predictions = []
    val_targets = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)

            val_predictions.extend(predicted.cpu().numpy())
            val_targets.extend(labels.cpu().numpy())
            batchNo += 1

    val_accuracy = accuracy_score(val_targets, val_predictions)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Validation Accuracy: {val_accuracy:.4f}')

# Save the trained model
model.save_pretrained("bert_model_url_classification")

Epoch 1/3, Validation Accuracy: 0.9767
Epoch 2/3, Validation Accuracy: 0.9746
Epoch 3/3, Validation Accuracy: 0.9798


In [None]:
val2_dataset = URLDataset(val_urls, val_labels, tokenizer, MAX_LENGTH)
val2_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

model.eval()
val2_predictions = []
val2_targets = []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)

        val2_predictions.extend(predicted.cpu().numpy())
        val2_targets.extend(labels.cpu().numpy())

val2_accuracy = accuracy_score(val2_targets, val2_predictions)
print(f'Epoch {epoch + 1}/{EPOCHS}, Validation Accuracy: {val2_accuracy:.4f}')