<a href="https://colab.research.google.com/github/Yiting916/GCP/blob/main/BERT_URL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **於Colab上使用Hugging Face的BERT Model 偵測釣魚網址**

# 0. 環境設定 (Google Colab)

In [1]:
!pip install transformers datasets scikit-learn torch pandas

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

# 1. 下載並載入 Phishing URL 數據集

In [2]:
from google.colab import files
import pandas as pd

# Upload the CSV file through the Colab file picker.
uploaded = files.upload()

# Read the uploaded CSV (the file name below must match the uploaded file).
file_path = "/content/PhiUSIIL_Phishing_URL_Dataset.csv"
df = pd.read_csv(file_path)

# Inspect the data structure.
# Only the last expression of a cell is rendered automatically, so the preview
# must be displayed explicitly; a bare `df.head()` here would be discarded.
display(df.head())
df.info()
df['label'].value_counts()  # label distribution (rendered as the cell's output)


Saving PhiUSIIL_Phishing_URL_Dataset.csv to PhiUSIIL_Phishing_URL_Dataset.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235795 entries, 0 to 235794
Data columns (total 55 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   URL                         235795 non-null  object 
 1   URLLength                   235795 non-null  int64  
 2   Domain                      235795 non-null  object 
 3   DomainLength                235795 non-null  int64  
 4   IsDomainIP                  235795 non-null  int64  
 5   TLD                         235795 non-null  object 
 6   URLSimilarityIndex          235795 non-null  float64
 7   CharContinuationRate        235795 non-null  float64
 8   TLDLegitimateProb           235795 non-null  float64
 9   URLCharProb                 235795 non-null  float64
 10  TLDLength                   235795 non-null  int64  
 11  NoOfSubDomain               235795 non-null  int64  

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,134850
0,100945


# 2. 資料集分割

In [3]:
from sklearn.model_selection import train_test_split

# Keep only the 'URL' and 'label' columns and drop rows missing either value.
df = df[['URL', 'label']].dropna()

# First cut: 70% training data, 30% held out (stratified by label).
train_df, holdout_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['label'],
)

# Second cut: divide the held-out part 50/50 into validation and test sets.
val_df, test_df = train_test_split(
    holdout_df,
    test_size=0.5,
    random_state=42,
    stratify=holdout_df['label'],
)

# Demo only: shrink the training, validation and test sets to 1/1000 of their size.
# Strongly discouraged for real training.
train_df, val_df, test_df = (
    split.sample(frac=0.001, random_state=42)
    for split in (train_df, val_df, test_df)
)

# Show the size of each split.
print(f"訓練集: {len(train_df)}, 驗證集: {len(val_df)}, 測試集: {len(test_df)}")



訓練集: 165, 驗證集: 35, 測試集: 35


# 3. BERT Tokenizer 處理

In [4]:
from transformers import BertTokenizer

# 載入 BERT 預訓練 Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization helper
def tokenize_function(examples):
    """Tokenize the 'URL' entry of `examples` with the module-level `tokenizer`.

    Every sequence is truncated and padded to exactly 512 tokens.

    NOTE(review): no visible cell calls this function; the splits are tokenized
    directly with max_length=128 instead — confirm whether it is still needed.
    """
    urls = examples['URL']
    return tokenizer(urls, padding="max_length", truncation=True, max_length=512)

# Tokenize the URLs of every split with identical settings:
# truncate to at most 128 tokens and pad to the longest sequence within the split.
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_df['URL'].tolist(), truncation=True, padding=True, max_length=128)
    for split_df in (train_df, val_df, test_df)
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# 4. 轉換為 PyTorch Dataset

In [5]:
import torch

class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to an indexable collection of
    per-sample values; `labels` is an indexable collection with one label
    per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert each encoding field of sample `idx` into a tensor ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # ... and attach the label under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Extract the labels of each split as plain Python lists.
train_labels = train_df['label'].tolist()
val_labels = val_df['label'].tolist()
# Named `test_labels` (not `test_dataset`) so the name is never bound to a
# plain list of labels before it is bound to the actual Dataset object.
test_labels = test_df['label'].tolist()

# Build one Dataset per split from its encodings and labels.
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)
test_dataset = TextDataset(test_encodings, test_labels)


# 5. 設定 BERT 模型

In [6]:
from transformers import BertForSequenceClassification

# Load the pretrained BERT checkpoint with a sequence-classification head;
# two output labels make it suitable for binary classification.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 6. 訓練模型

In [7]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support

# Enable gradient checkpointing to reduce memory usage.
model.gradient_checkpointing_enable()

# Training configuration, collected in one mapping and unpacked below.
training_config = {
    "output_dir": './results',
    "eval_strategy": "epoch",
    "per_device_train_batch_size": 1,  # originally 32; lowered to 1 for this training demo
    "per_device_eval_batch_size": 1,   # originally 32; lowered to 1 for this training demo
    "num_train_epochs": 1,             # originally 2; lowered to 1 for this training demo
    "weight_decay": 0.01,
    "report_to": "none",
    "fp16": False,
    "logging_strategy": "epoch",
    "logging_steps": 50,
    "save_total_limit": 1,
    "save_steps": 500,
}
training_args = TrainingArguments(**training_config)

# Evaluation metrics
def compute_metrics(pred):
    """Return precision, recall and F1 for one evaluation pass.

    Uses `pred.label_ids` as the reference labels and the argmax over the last
    axis of `pred.predictions` as the predicted class, scored with
    `average='binary'`.

    Returns a dict with the keys 'precision', 'recall' and 'f1'.
    """
    predicted = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(pred.label_ids, predicted, average='binary')
    # `scores` has four entries; zip keeps the first three and drops the last one.
    return dict(zip(('precision', 'recall', 'f1'), scores))

# Build the Trainer: it trains `model` on `train_dataset`, evaluates on
# `val_dataset`, and reports the metrics produced by `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Disable Weights & Biases logging via its environment variable.
# NOTE(review): `report_to="none"` is already set in the training arguments,
# so this is presumably a redundant safeguard — confirm.
import os
os.environ["WANDB_MODE"] = "disabled"

# Start training; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.5642,0.347791,0.952381,0.909091,0.930233


TrainOutput(global_step=165, training_loss=0.5641774957830256, metrics={'train_runtime': 567.9782, 'train_samples_per_second': 0.291, 'train_steps_per_second': 0.291, 'total_flos': 10853331033600.0, 'train_loss': 0.5641774957830256, 'epoch': 1.0})

# 7. 評估模型

In [8]:
from torch.utils.data import Subset

# Evaluate on at most 35 test samples.
# Clamp to the dataset length: a fixed range(35) would index past the end of
# the Subset (IndexError during evaluation) whenever fewer samples are available.
num_samples = min(35, len(test_dataset))
test_dataset = Subset(test_dataset, range(num_samples))

# Confirm the size of the new test set.
print(f"新的測試集大小: {len(test_dataset)}")  # expected output: 35

# Evaluate on the test set.
test_results = trainer.evaluate(test_dataset)
print(test_results)


新的測試集大小: 35


{'eval_loss': 0.24600811302661896, 'eval_precision': 0.9545454545454546, 'eval_recall': 0.9545454545454546, 'eval_f1': 0.9545454545454546, 'eval_runtime': 7.7954, 'eval_samples_per_second': 4.49, 'eval_steps_per_second': 4.49, 'epoch': 1.0}


# 8. 儲存模型

In [9]:
# Save the fine-tuned model and the tokenizer side by side.
save_directory = "./trained_model"

trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

# Optionally also save the raw model weights on their own.
import os
import torch

# torch.save does not create missing parent directories, so make sure the
# target directory exists before writing the weights file.
os.makedirs('./results', exist_ok=True)
torch.save(model.state_dict(), './results/pytorch_model.bin')
