In [None]:
# https://dou.ua/forums/topic/56830/
# https://www.kaggle.com/datasets/siddharthkumar25/malicious-and-benign-urls?select=urldata.csv

In [2]:
# Завантаження датасету у Dataframe (url, result)

import pandas as pd

df = pd.read_csv("urldata.csv")
if "url" not in df.columns or "result" not in df.columns:
    raise SystemExit("CSV must contain 'url' and 'result' columns.")

df["result"] = df["result"].astype(int)

In [4]:
# Нормалізація URL і безпечний парсинг

import re
from urllib.parse import urlparse
import pandas as pd

_SCHEME_RE = re.compile(r'^[a-zA-Z][a-zA-Z0-9+.\-]*://')

def safe_str(u):
    """Безпечно конвертувати значення в рядок, обробляючи None та NaN."""
    return "" if u is None or (isinstance(u, float) and pd.isna(u)) else str(u)

def normalize_url(u: str) -> str:
    "Нормалізує та перевіряє рядок URL-адреси."
    if u is None or (isinstance(u, float) and pd.isna(u)):
        return ""

    s = str(u).strip().strip('"\'')
    if not s:
        return ""

    if s.startswith("//"):
        s = "http:" + s

    if not _SCHEME_RE.match(s):
        s = "http://" + s

    return s

def safe_urlparse(u: str):
    """Повертає (scheme, netloc, path, query) і не кидає винятків."""
    s = normalize_url(u)
    try:
        p = urlparse(s)
        scheme, netloc, path, query = p.scheme or "", p.netloc or "", p.path or "", p.query or ""

        if not netloc and path:
            first, _, rest = path.lstrip("/").partition("/")
            if "." in first and " " not in first:
                netloc = first
                path = "/" + rest if rest else ""

        return scheme, netloc, path, query
    except Exception:
        return "", "", "", ""

In [6]:
# Побудова ознак

import re
from tld import get_tld

sstr = lambda i: safe_str(i)

FEATURE_COLS = [
    "hostname_length", "path_length", "fd_length", "tld_length",
    "count-", "count@", "count?", "count%", "count.",
    "count=", "count-www", "count-digits", "count-letters",
    "count_dir", "use_of_ip", "short_url"
]

def having_ip_address(url):
    s = sstr(url)
    match = re.search(
        r"(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\."
        r"([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\/)|"
        r"((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\/)|"
        r"(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}",
        s,
    )
    return 1 if match else 0

def fd_length(url):
    path = safe_urlparse(url)[2]
    parts = [p for p in path.split("/") if p]
    return len(parts[0]) if parts else 0

def shortening_service(url):
    s = sstr(url).lower()
    match = re.search(
        r"(bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
        r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
        r"short\.to|budurl\.com|ping\.fm|post\.ly|just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
        r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|lnkd\.in|"
        r"db\.tt|qr\.ae|adf\.ly|bitly\.com|cur\.lv|q\.gs|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|"
        r"buzurl\.com|cutt\.us|u\.bb|yourls\.org|prettylinkpro\.com|scrnch\.me|filoops\.info|"
        r"vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|link\.zip\.net)",
        s,
    )
    return 1 if match else 0

def build_features(df: pd.DataFrame) -> pd.DataFrame:
    df["url_length"] = df["url"].apply(lambda i: len(safe_str(i)))
    df["hostname_length"] = df["url"].apply(lambda i: len(safe_urlparse(i)[1]))
    df["path_length"]     = df["url"].apply(lambda i: len(safe_urlparse(i)[2]))
    df["fd_length"]       = df["url"].apply(fd_length)

    df["tld"] = df["url"].apply(lambda i: get_tld(normalize_url(i), fail_silently=True))
    df["tld_length"] = df["tld"].apply(lambda t: len(t) if isinstance(t, str) and t else -1)
    df.drop(columns=["tld"], inplace=True)

    # sstr = lambda i: safe_str(i)
    df["count-"]      = df["url"].apply(lambda i: sstr(i).count("-"))
    df["count@"]      = df["url"].apply(lambda i: sstr(i).count("@"))
    df["count?"]      = df["url"].apply(lambda i: sstr(i).count("?"))
    df["count%"]      = df["url"].apply(lambda i: sstr(i).count("%"))
    df["count."]      = df["url"].apply(lambda i: sstr(i).count("."))
    df["count="]      = df["url"].apply(lambda i: sstr(i).count("="))
    df["count-www"]   = df["url"].apply(lambda i: sstr(i).lower().count("www"))
    df["count-digits"]  = df["url"].apply(lambda i: sum(ch.isdigit() for ch in sstr(i)))
    df["count-letters"] = df["url"].apply(lambda i: sum(ch.isalpha() for ch in sstr(i)))
    df["count_dir"]     = df["url"].apply(lambda i: safe_urlparse(i)[2].count("/"))
    df["use_of_ip"]     = df["url"].apply(having_ip_address)
    df["short_url"]     = df["url"].apply(shortening_service)

    return df

df = build_features(df)
missing = [c for c in FEATURE_COLS if c not in df.columns]

if missing:
    raise SystemExit(f"Missing feature columns after feature engineering: {missing}")

X = df[FEATURE_COLS]
y = df["result"]

In [7]:
# Розподіл даних на train/test

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

In [8]:
# Навчання моделей: DecisionTree, LogisticRegression, RandomForest

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

dt_model  = DecisionTreeClassifier(
    random_state=42
)

base_rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced_subsample",
)

rf_model = CalibratedClassifierCV(
    base_rf,
    cv=3,
    method="sigmoid"
)

log_model = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    class_weight="balanced"
)

In [None]:
# Оцінка якості (Accuracy, Precision/Recall/F1, ROC-AUC)

import os, json, joblib
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score

outdir = "model_saved"
os.makedirs(outdir, exist_ok=True)

def train_eval(name, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    try:
        auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) if hasattr(model, "predict_proba") else None
    except:
        auc = None

    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    print("Report:\n", classification_report(y_test, y_pred, digits=3))

    if auc is not None:
        print("ROC-AUC:", round(auc, 4))

    out_path = os.path.join(outdir, f"{name}.joblib")
    joblib.dump(model, out_path)
    print(f"Saved: {out_path}")

train_eval("decision_tree", dt_model)
train_eval("random_forest", rf_model)
train_eval("log_regression", log_model)

with open(os.path.join(outdir, "features.json"), "w", encoding="utf-8") as f:
    json.dump({"feature_cols": FEATURE_COLS}, f, ensure_ascii=False, indent=2)


=== decision_tree ===
Accuracy: 0.973351202861099
Confusion matrix:
 [[101941   1781]
 [  1818  29513]]
Report:
               precision    recall  f1-score   support

           0      0.982     0.983     0.983    103722
           1      0.943     0.942     0.943     31331

    accuracy                          0.973    135053
   macro avg      0.963     0.962     0.963    135053
weighted avg      0.973     0.973     0.973    135053

ROC-AUC: 0.9622
Saved: model_saved\decision_tree.joblib


In [None]:
# Підсумкова оцінка

from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))