In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import re
from urllib.parse import urlparse

df = pd.read_csv("dataset.csv")

df['url_length'] = df['URL'].apply(len)
df['num_special_chars'] = df['URL'].apply(lambda x: len(re.findall(r'[@%&\*\$#\?\+\!]', x)))
df['has_https'] = df['URL'].apply(lambda x: 1 if 'https://' in x else 0)
df['num_digits'] = df['URL'].apply(lambda x: sum(c.isdigit() for c in x))
df['num_subdomains'] = df['URL'].apply(lambda x: urlparse(x).netloc.count('.'))

X = df[['url_length', 'num_special_chars', 'has_https', 'num_digits', 'num_subdomains']]
y = df['label']  # Label column with 'phishing' or 'non-phishing'

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

logreg = LogisticRegression()

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)


print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.89      0.93     20124
           1       0.92      0.98      0.95     27035

    accuracy                           0.94     47159
   macro avg       0.95      0.94      0.94     47159
weighted avg       0.95      0.94      0.94     47159



In [2]:
import joblib

joblib.dump(logreg, 'phishing_model_url.pkl')
print("Model saved successfully!")
joblib.dump(scaler, "scaler_url.pkl")

Model saved successfully!


['scaler_url.pkl']