In [26]:
import pandas as pd
import numpy as np
import re
import tldextract
from scipy.stats import entropy
from collections import Counter


In [27]:
# Length-scaled character entropy of a string
def calculate_entropy(text):
    """Return the base-2 entropy of ``text``'s character counts divided by its length.

    The per-character occurrence counts are passed to ``scipy.stats.entropy``
    with ``base=2`` and the result is divided by ``len(text)``.

    Args:
        text: The string whose characters are counted.

    Returns:
        The entropy divided by the length, or ``0`` when ``text`` is empty.
    """
    char_counts = Counter(text)
    length = np.float64(len(text))
    # Guard clause: an empty input has no distribution to measure.
    if not length:
        return 0
    occurrences = list(char_counts.values())
    return entropy(occurrences, base=2) / length

In [28]:
# Character continuity rate of a domain string
def character_continuity_rate(domain):
    """Return the summed longest letter, digit and symbol runs over the length.

    Three character classes are scanned: ASCII letters, ASCII digits, and
    everything else. The length of the longest contiguous run of each class
    is added up and divided by ``len(domain)`` (the divisor is at least 1,
    so an empty string gives ``0.0``).

    Args:
        domain: The string to analyse.

    Returns:
        The ratio as a float.
    """
    class_patterns = (r'[a-zA-Z]+', r'[0-9]+', r'[^a-zA-Z0-9]+')

    longest_total = 0
    for pattern in class_patterns:
        runs = re.findall(pattern, domain)
        # A class with no match contributes 0.
        longest_total += max((len(run) for run in runs), default=0)

    divisor = max(len(domain), 1)
    return longest_total / divisor

In [29]:
# Function to extract URL features
def extract_features(url):
    """Compute lexical features for a single URL string.

    Args:
        url: The URL to analyse, with or without a leading ``scheme://``.

    Returns:
        A dict with the following keys:

        * ``url_len``, ``domain_len``, ``path_len``: character lengths of the
          URL, of the ``domain`` field reported by ``tldextract``, and of the
          part of the URL after the first ``/`` following the host.
        * ``domain_entropy``, ``suffix_entropy``: ``calculate_entropy`` of the
          ``domain`` and ``suffix`` fields reported by ``tldextract``.
        * ``continuity_rate``: ``character_continuity_rate`` of the domain.
        * ``symbol_count``: number of ``: / . ? = &`` characters in the URL.
        * ``token_count``: number of pieces the domain splits into on
          ``-``, ``.`` or ``_``.
        * ``digit_count``: number of digit characters in the URL.
        * ``path_url_ratio``, ``domain_url_ratio``: path and domain length
          divided by the URL length (divisor is at least 1).
    """
    extracted = tldextract.extract(url)
    domain = extracted.domain
    suffix = extracted.suffix

    # Everything after the host. An optional "scheme://" (or bare "//") prefix
    # is stripped first so that scheme-less URLs such as "example.com/a/b"
    # yield "a/b"; blindly skipping the first three '/'-separated pieces would
    # drop real path segments for them. URLs that do start with "scheme://"
    # produce exactly the same path as before.
    without_scheme = re.sub(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', '', url, count=1)
    path_str = without_scheme.partition('/')[2]

    # Length Features
    url_len = len(url)
    domain_len = len(domain)
    path_len = len(path_str)

    # Entropy
    domain_entropy = calculate_entropy(domain)
    suffix_entropy = calculate_entropy(suffix)

    # Character Continuity Rate
    continuity_rate = character_continuity_rate(domain)

    # Count of special characters, digits, and tokens
    symbol_count = len(re.findall(r'[:/.?=&]', url))
    token_count = len(re.split(r'[-._]', domain))  # Tokens in domain
    digit_count = sum(c.isdigit() for c in url)

    # Length Ratios (max(..., 1) avoids division by zero on an empty URL)
    path_url_ratio = path_len / max(url_len, 1)
    domain_url_ratio = domain_len / max(url_len, 1)

    return {
        "url_len": url_len,
        "domain_len": domain_len,
        "path_len": path_len,
        "domain_entropy": domain_entropy,
        "suffix_entropy": suffix_entropy,
        "continuity_rate": continuity_rate,
        "symbol_count": symbol_count,
        "token_count": token_count,
        "digit_count": digit_count,
        "path_url_ratio": path_url_ratio,
        "domain_url_ratio": domain_url_ratio,
    }

In [30]:
# Load dataset
file_path = "./data/merged/cleaned_balanced_data.csv"
df = pd.read_csv(file_path)

# Extract features for each URL.
# The frame is built once from the list of per-URL dicts instead of chaining
# `.apply(pd.Series)`: that variant constructs a Series per row, which is slow
# on large datasets, and because every row mixes ints with floats it upcasts
# the integer features (lengths, counts) to float. Passing `index=df.index`
# keeps the rows aligned with `df` for the concat below.
df_features = pd.DataFrame(df['url'].map(extract_features).tolist(), index=df.index)

# Merge with original dataset (column-wise, aligned on the shared index)
df_final = pd.concat([df, df_features], axis=1)

In [31]:

# Persist the merged frame (original columns + extracted features) as CSV,
# leaving the row index out of the file.
df_final.to_csv(path_or_buf="./data/merged/processed_urls.csv", index=False)

# Confirmation message for the notebook output.
print("Feature extraction complete. Saved as 'processed_urls.csv'.")

Feature extraction complete. Saved as 'processed_urls.csv'.
