In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Dataset
df = pd.read_csv("./data/merged/processed_urls.csv") 
urls = df["url"].astype(str).tolist()

In [3]:
# Load Pretrained Transformer Model
MODEL_NAME = "bert-base-uncased" 
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


cuda


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [4]:
# Function to Get Transformer Embeddings
def get_embedding(url):
    inputs = tokenizer(url, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move to GPU
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Extract last hidden state (CLS token embedding)
    embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    return embedding

In [5]:
# Extract Embeddings
embeddings = []
for url in tqdm(urls, desc="Extracting Embeddings"):
    embeddings.append(get_embedding(url))

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Extracting Embeddings: 100%|██████████| 59605/59605 [18:14<00:00, 54.46it/s]


In [6]:
# Convert to DataFrame and Save
embeddings_df = pd.DataFrame(embeddings)
embeddings_df["type"] = df["type"]  # Add label column back
embeddings_df.to_csv("./data/merged/url_embeddings.csv", index=False)

print("URL embeddings saved")


URL embeddings saved
