In [None]:
#pip install transformers datasets

In [None]:
import re
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
import torch

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset and all
# saved artifacts in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from collections import Counter
import string

In [None]:
# Load the dataset CSV from the mounted Drive folder into `df`
# (the Drive mount cell has to run first for this path to exist).
df = pd.read_csv('/content/drive/MyDrive/dataset_B_05_2020.csv')

In [None]:
# Keep a handle on the raw URL column for the character-frequency analysis below.
urls = df['url']

# Encode the string class labels as integers (0 = legitimate, 1 = phishing).
label_map = {'legitimate': 0, 'phishing': 1}
# Series.map silently turns every value missing from the mapping into NaN
# (this also happens when the cell is re-run on already-encoded labels),
# so refuse to continue instead of producing NaN targets.
unexpected_labels = set(df['status'].unique()) - set(label_map)
if unexpected_labels:
    raise ValueError(
        f"'status' contains values outside {sorted(label_map)}: {unexpected_labels!r}"
    )
df['status'] = df['status'].map(label_map)

In [None]:
# Rename the encoded target column from "status" to "labels"
# (presumably the column name the model/Trainer consumes as targets — confirm).
df = df.rename(columns={"status": "labels"})

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11430 entries, 0 to 11429
Data columns (total 89 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   url                         11430 non-null  object 
 1   length_url                  11430 non-null  int64  
 2   length_hostname             11430 non-null  int64  
 3   ip                          11430 non-null  int64  
 4   nb_dots                     11430 non-null  int64  
 5   nb_hyphens                  11430 non-null  int64  
 6   nb_at                       11430 non-null  int64  
 7   nb_qm                       11430 non-null  int64  
 8   nb_and                      11430 non-null  int64  
 9   nb_or                       11430 non-null  int64  
 10  nb_eq                       11430 non-null  int64  
 11  nb_underscore               11430 non-null  int64  
 12  nb_tilde                    11430 non-null  int64  
 13  nb_percent                  114

In [None]:
# Candidate special characters: every ASCII punctuation symbol known to the standard library
special_characters = {symbol for symbol in string.punctuation}

In [None]:
# Counter mapping each special character to the number of times it occurs
# across all URLs (starts out empty).
special_char_count = Counter()

In [None]:
# Tally every special character across all URLs.
# The Counter is rebuilt from scratch rather than incremented in place, so
# re-running this cell yields the same totals instead of doubling them.
special_char_count = Counter(
    char
    for url in urls
    for char in url
    if char in special_characters
)

In [None]:
# Tabulate the tallies as (Character, Count) rows, most frequent character first
special_char_df = (
    pd.DataFrame(special_char_count.items(), columns=['Character', 'Count'])
    .sort_values(by='Count', ascending=False)
)

In [None]:
# Print the first five rows, i.e. the five most frequent special characters
# (the frame is sorted by Count in descending order).
print(special_char_df.head())

  Character  Count
1         /  49030
2         .  28354
0         :  11749
3         -  11402
6         _   3688


In [None]:
# Display the complete character-frequency table.
special_char_df

Unnamed: 0,Character,Count
1,/,49030
2,.,28354
0,:,11749
3,-,11402
6,_,3688
5,=,3351
7,&,1855
4,?,1614
11,%,1407
8,;,712


In [None]:
# Characters that custom_tokenize splits on.
# NOTE: this rebinds `special_characters` (until here the full punctuation set)
# to a list of only the punctuation characters actually observed in the URLs,
# ordered by descending frequency.
special_characters = special_char_df['Character'].tolist()

def custom_tokenize(url, delimiters=None):
    """Split a URL into tokens, keeping every delimiter character as its own token.

    Args:
        url: The URL string to split.
        delimiters: Iterable of single characters to split on. When omitted,
            the module-level ``special_characters`` list is used, which keeps
            the original one-argument call working unchanged.

    Returns:
        list[str]: The non-empty pieces of ``url`` in their original order;
        each delimiter occurrence appears as a separate one-character token.
    """
    if delimiters is None:
        delimiters = special_characters

    # Escape each character so it is taken literally inside the character class.
    char_class = ''.join(re.escape(char) for char in delimiters)
    if not char_class:
        # "[]" is not a valid regular expression; with nothing to split on,
        # the whole URL is a single token.
        return [url] if url else []

    # The capturing group makes re.split keep the delimiters in its result.
    tokens = re.split(f"([{char_class}])", url)

    # Adjacent delimiters produce empty strings between them; drop those.
    return [token for token in tokens if token]

In [None]:
# Hold out 20% of the (url, labels) rows for validation; the fixed seed keeps the split repeatable
train_df, val_df = train_test_split(
    df.loc[:, ['url', 'labels']],
    random_state=42,
    test_size=0.2,
)


In [None]:
# Load the tokenizer that ships with the `distilbert-base-uncased` checkpoint
# (the same checkpoint the classification model is initialised from).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')



In [None]:
def tokenize_function(examples):
    """Tokenize a batch of URLs for the model.

    Each URL in ``examples['url']`` is first split with ``custom_tokenize``;
    the resulting word lists are then passed to the global ``tokenizer`` as
    pre-split input, padded/truncated to a fixed length of 128.

    Args:
        examples: Mapping with a ``'url'`` entry holding an iterable of URL strings.

    Returns:
        Whatever the global ``tokenizer`` returns for the batch.
    """
    url_words = list(map(custom_tokenize, examples['url']))
    encoder_options = {
        'padding': "max_length",
        'truncation': True,
        'max_length': 128,
        'is_split_into_words': True,
    }
    return tokenizer(url_words, **encoder_options)


In [None]:
# Wrap both pandas splits as Hugging Face `Dataset` objects
train_dataset, val_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df)
)

In [None]:
# Tokenizing the datasets: apply tokenize_function batch-wise to both splits,
# which adds the `input_ids` and `attention_mask` columns shown in the next cell.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/9144 [00:00<?, ? examples/s]

Map:   0%|          | 0/2286 [00:00<?, ? examples/s]

In [None]:

# Inspect the columns and row count of the tokenized training split.
train_dataset

Dataset({
    features: ['url', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 9144
})

In [None]:
def _drop_columns_if_present(dataset, names):
    """Return `dataset` without those of `names` that it currently contains."""
    present = [name for name in names if name in dataset.column_names]
    return dataset.remove_columns(present) if present else dataset


# Drop the columns that are not fed to the model: the raw URL text and
# `__index_level_0__` (appears to be the pandas index carried over by
# Dataset.from_pandas — confirm).
# Only columns that are still present are removed, so re-running this cell
# after it has already succeeded is a no-op instead of an error.
_UNUSED_COLUMNS = ('url', '__index_level_0__')
train_dataset = _drop_columns_if_present(train_dataset, _UNUSED_COLUMNS)
val_dataset = _drop_columns_if_present(val_dataset, _UNUSED_COLUMNS)

In [None]:
# Confirm that only `labels`, `input_ids` and `attention_mask` remain.
train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 9144
})

In [None]:
# Load pretrained DistilBERT with a sequence-classification head for two classes
# (0 = legitimate, 1 = phishing); the head weights are newly initialised, as the
# warning printed below states.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Defining training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # run evaluation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # TrainingArguments fail on a CPU-only runtime, so enable it only when a
    # GPU is actually present.
    fp16=torch.cuda.is_available(),
)




In [None]:
# Defining a compute metrics function
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` must support
            ``.argmax(axis=-1)`` (the predicted class is the arg-max over the
            last axis); ``labels`` holds the reference class ids.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``. Precision, recall and F1 come from
        ``precision_recall_fscore_support(..., average='binary')``.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')

    # Accuracy is simply the share of predictions that equal the reference.
    # Computing it directly avoids re-loading a metric script (remote code and
    # a network round-trip) on every evaluation call.
    accuracy = float((predictions == labels).mean())

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
# Initializing Trainer: bundles the classification model, the training
# arguments, the train/validation splits and the metric callback that is
# invoked on each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Training the model; the table printed below reports loss and the
# compute_metrics values after each of the two epochs.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1032,0.25404,0.942695,0.976145,0.906112,0.939825
2,0.0631,0.239113,0.955381,0.962196,0.946856,0.954464


TrainOutput(global_step=2286, training_loss=0.09102480430302658, metrics={'train_runtime': 172.614, 'train_samples_per_second': 105.947, 'train_steps_per_second': 13.243, 'total_flos': 605640946655232.0, 'train_loss': 0.09102480430302658, 'epoch': 2.0})

In [None]:
# Save the fine-tuned model to Drive through the Trainer.
trainer.save_model('/content/drive/MyDrive/fine_tuned_model_dis')

In [None]:
# Save the model and the tokenizer to two separate Drive folders; these are
# the folders reloaded further down for embedding extraction.
model.save_pretrained('/content/drive/MyDrive/fine_tuned_dis')  # Directory where model is saved
# NOTE(review): the doubled slash after "MyDrive" looks unintended; the reload
# cell uses a single slash — presumably both resolve to the same folder, confirm.
tokenizer.save_pretrained('/content/drive/MyDrive//fine_tuned_dis_tok')

('/content/drive/MyDrive//fine_tuned_dis_tok/tokenizer_config.json',
 '/content/drive/MyDrive//fine_tuned_dis_tok/special_tokens_map.json',
 '/content/drive/MyDrive//fine_tuned_dis_tok/vocab.txt',
 '/content/drive/MyDrive//fine_tuned_dis_tok/added_tokens.json')

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

In [None]:
# Reload the fine-tuned checkpoint as a plain DistilBertModel so its hidden
# states can be read out, move it to `device`, and reload the saved tokenizer.
# NOTE: this rebinds `model`, which until here referred to the
# DistilBertForSequenceClassification instance used for training.
model = DistilBertModel.from_pretrained('/content/drive/MyDrive/fine_tuned_dis').to(device)
tokenizer = DistilBertTokenizer.from_pretrained('/content/drive/MyDrive/fine_tuned_dis_tok')

In [None]:
# Switch to evaluation mode so the Dropout layers listed in the repr below are inactive.
model.eval()


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [None]:
# Use the GPU when CUDA is available and the CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
# Make sure the model sits on the selected device (it was already moved with
# .to(device) in the cell that loaded it).
model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [None]:
# Earlier mean-pooling variant of get_url_embedding, left commented out
# (the active version in the next cell uses the [CLS] vector instead):
# def get_url_embedding(url):
#     inputs = tokenizer(url, padding=True, truncation=True, return_tensors='pt', max_length=128).to(device)
#     with torch.no_grad():
#         outputs = model(**inputs)
#         # Extract the last hidden state (sequence of embeddings for each token)
#         hidden_states = outputs.last_hidden_state  # shape: (batch_size, seq_length, hidden_size)
#         # Pool the token embeddings (e.g., mean pooling) to get a single vector per URL
#         pooled_output = torch.mean(hidden_states, dim=1)  # shape: (batch_size, hidden_size)
#         return pooled_output.squeeze().cpu().numpy()  # Return the pooled output as a NumPy array

In [None]:
def get_url_embedding(url):
    """Return the first-position ([CLS]) hidden state of the model for a URL.

    The URL is encoded with the global ``tokenizer`` (truncated to at most
    128 tokens), moved to the global ``device`` and passed through the global
    ``model`` with gradient tracking disabled.

    Args:
        url: The URL text handed to the tokenizer.

    Returns:
        NumPy array holding the last-layer hidden state at sequence position 0,
        with all size-1 dimensions squeezed away.
    """
    # NOTE(review): the URL is tokenized directly here, without the
    # custom_tokenize pre-splitting applied during fine-tuning — confirm that
    # both paths yield the same token ids.
    encoded = tokenizer(
        url,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=128,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        # last_hidden_state is indexed as (batch, position, hidden);
        # position 0 is the first token of every sequence.
        last_hidden = model(**encoded).last_hidden_state
        cls_vector = last_hidden[:, 0, :]

    return cls_vector.squeeze().cpu().numpy()

In [None]:
# Embed every URL one at a time and stack the vectors into a DataFrame
# (one row per URL, one column per embedding dimension)
X_url_embeddings = pd.DataFrame(
    [get_url_embedding(single_url) for single_url in df['url']]
)

In [None]:
# Persist the embedding table to Drive as CSV, without the index column.
X_url_embeddings.to_csv('/content/drive/MyDrive/url_embeds_cls.csv', index=False)


In [None]:
# NOTE(review): writes the same X_url_embeddings frame a second time under a
# different file name — confirm that both copies are needed.
X_url_embeddings.to_csv('/content/drive/MyDrive/url_embeds_new.csv', index=False)

In [None]:
# Preview the embedding table (one row per URL).
X_url_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.469255,-0.405063,0.594913,0.136169,1.582099,-0.483576,0.906208,0.477526,-0.098015,-0.107252,...,0.363200,-0.109043,0.342846,-0.361950,-0.598929,-0.735178,-1.054563,-0.674291,-0.461370,0.044902
1,-0.000928,0.443874,-0.260562,-0.043517,-0.359301,-1.166746,-0.269019,0.111335,0.194590,0.226820,...,0.642944,1.135552,0.104530,0.353447,0.943881,0.600342,1.264430,0.774515,0.062177,0.365771
2,0.167460,0.417528,-0.309965,0.165192,-0.416369,-0.876358,-0.390015,0.086102,0.215878,-0.117253,...,0.549449,0.894037,0.087685,0.188817,0.973427,0.550444,1.179196,0.731847,0.071046,0.090249
3,0.240991,-0.106745,-0.302739,-0.082323,0.747177,-1.286824,0.320509,0.101059,-0.149785,0.566305,...,0.241587,0.056352,0.055448,-0.677768,-0.464772,-0.006799,0.942901,0.476825,-0.227417,0.045641
4,-0.431422,-0.561046,0.177919,-0.464770,1.618210,-0.069161,0.762422,0.302574,-0.566405,0.510891,...,-0.091749,-0.246647,0.013819,-0.423208,-0.558926,-1.015461,-0.778710,-0.900499,-0.059679,-0.576099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11425,-0.182000,-0.053915,-0.067035,-0.448839,1.083498,0.433607,0.765216,0.368842,-0.918494,0.646444,...,0.211888,-0.308620,0.009435,-0.363927,-0.550364,-0.704651,-0.387404,-0.948580,-0.544716,-0.106422
11426,0.062821,0.583424,-0.066244,-0.226711,-0.395024,-0.883466,-0.755481,-0.071679,0.059975,-0.055863,...,0.764650,0.932202,-0.002972,-0.406380,0.625217,0.360301,0.753569,0.288807,0.197133,0.285982
11427,-0.156803,-0.550709,0.352618,-0.124216,1.139558,0.418037,0.768503,0.175868,-0.789776,0.913601,...,0.318785,-0.211856,0.258788,-0.539019,-0.595462,-0.485886,-0.682009,-0.925361,-0.418217,-0.189013
11428,0.011372,-0.347634,-0.045246,-0.456263,1.409382,0.078471,1.003762,0.528138,-0.822833,0.607301,...,0.168471,-0.117611,0.043892,-0.797820,-0.565127,-0.993448,-0.424178,-0.775399,-0.401752,-0.045945
