In [None]:
import math
import re
from collections import Counter
from urllib.parse import urlsplit

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Load the labelled URL dataset from the working directory. Later cells read
# the 'url' column (feature extraction) and the 'type' column (class label).
df = pd.read_csv("malicious_phish.csv")
print("Dataset shape:", df.shape)
# Class balance check: the counts per label are printed so imbalance is visible up front.
print("Class Distribution:\n", df['type'].value_counts())





Dataset shape: (651191, 2)
Class Distribution:
 type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64


In [None]:
# Substrings looked up in the lower-cased URL by extract_features(); each entry
# produces one binary 'keyword_<word>' feature, in the order listed here.
suspicious_keywords = [
    'login', 'secure', 'account', 'update', 'free', 'verify',
    'banking', 'paypal', 'signin', 'confirm', 'password'
]

def calculate_entropy(string):
    """Return the Shannon entropy of ``string`` in bits per character.

    Args:
        string: The text (or any sized iterable of hashable items) to measure.

    Returns:
        float: ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct character. Always a non-negative float; ``0.0`` for an empty
        input or an input made of a single repeated character.
    """
    length = len(string)
    # One pass over the input instead of one ``str.count`` scan per distinct character.
    counts = Counter(string)
    if len(counts) <= 1:
        # Empty or single-symbol input carries no information. Returning early
        # also avoids yielding the int 0 / the float -0.0 for these cases.
        return 0.0
    probabilities = (count / length for count in counts.values())
    # math.log2 is more accurate than math.log(p, 2) for base-2 logarithms.
    return -sum(p * math.log2(p) for p in probabilities)

def _hostname_of(url):
    """Best-effort extraction of the lower-cased hostname of ``url`` ('' if none)."""
    has_netloc = (
        url.startswith('//')
        or re.match(r'[A-Za-z][A-Za-z0-9+.\-]*://', url) is not None
    )
    # Without a scheme, urlsplit() treats "example.com/x" as a bare path, so
    # prefix "//" to make the leading component parse as the network location.
    candidate = url if has_netloc else '//' + url
    try:
        return urlsplit(candidate).hostname or ''
    except ValueError:
        # urlsplit() rejects some malformed hosts (e.g. broken bracketed literals).
        return ''


def extract_features(url):
    """Turn a URL string into a flat dict of numeric lexical features.

    Args:
        url: The URL to describe, with or without a scheme.

    Returns:
        dict: Keys, in insertion order, are ``url_length``, ``num_digits``,
        ``num_special_chars``, ``num_subdirs``, ``num_dots``, ``has_https``,
        ``has_ip``, ``entropy``, ``tld_length`` followed by one
        ``keyword_<word>`` flag per entry of ``suspicious_keywords``.

    Note:
        ``has_https`` and ``tld_length`` are derived from the scheme and the
        hostname respectively, so models must be (re)trained with this same
        function before its features are used for prediction.
    """
    # Lower-case once; reused for the scheme test and every keyword lookup.
    lowered = url.lower()

    # The TLD is the last label of the *host*; taking the text after the last
    # dot of the whole URL would measure things like "html" or "com/path/page".
    host = _hostname_of(url).rstrip('.')
    last_label = host.rsplit('.', 1)[-1] if '.' in host else ''

    features = {
        'url_length': len(url),
        'num_digits': sum(c.isdigit() for c in url),
        'num_special_chars': len(re.findall(r'[^A-Za-z0-9]', url)),
        'num_subdirs': url.count('/'),
        'num_dots': url.count('.'),
        # Only the scheme counts: "https" inside a path or query says nothing
        # about transport security.
        'has_https': 1 if lowered.startswith('https://') else 0,
        'has_ip': 1 if re.search(r'(\d{1,3}\.){3}\d{1,3}', url) else 0,
        'entropy': calculate_entropy(url),
        # A purely numeric last label means a dotted IP-style host, which has no TLD.
        'tld_length': 0 if last_label.isdigit() else len(last_label),
    }

    for word in suspicious_keywords:
        features[f'keyword_{word}'] = 1 if word in lowered else 0
    return features

# Build the feature matrix: one feature dict per URL, expanded into one column per key.
feature_list = df['url'].apply(extract_features)
X = pd.DataFrame(list(feature_list))
y = df['type']

# Encode the string class labels as integers; `le` is kept (and persisted later)
# so predictions can be mapped back to label names.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Hold out 20% for evaluation; stratifying on the label keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)


In [None]:
# Standardise features for the linear model. The scaler is fitted on the
# training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression is trained on the scaled features.
lr_model = LogisticRegression(
    max_iter=5000, solver="lbfgs", class_weight="balanced"
).fit(X_train_scaled, y_train)

# The decision tree is trained on the raw, unscaled features.
dt_model = DecisionTreeClassifier(
    class_weight="balanced", max_depth=20, random_state=42
).fit(X_train, y_train)

print("\nModels trained successfully!")

# Persist everything predict_url() reloads: both models, the label encoder,
# the scaler, and the exact training column order.
for artefact, filename in (
    (lr_model, "logistic_model.pkl"),
    (dt_model, "decision_tree_model.pkl"),
    (le, "label_encoder.pkl"),
    (scaler, "scaler.pkl"),
    (X.columns.tolist(), "feature_list.pkl"),
):
    joblib.dump(artefact, filename)


def predict_url(url):
    """Classify ``url`` with the persisted models and print the verdict.

    Every call reloads the logistic-regression model, the decision tree, the
    label encoder, the scaler and the training column order from the ``.pkl``
    files in the working directory. The two models' class probabilities are
    averaged; the class with the highest mean probability is the prediction
    and that probability (as a percentage) is reported as the confidence.

    Args:
        url: The URL string to classify.

    Returns:
        None. The result is printed: a single "SAFE LINK" line when the
        predicted class is "benign", otherwise a "NOT SAFE" line followed by
        the ensemble class and each model's individual vote.
    """
    lr, dt, le, scaler, feature_names = (
        joblib.load(path)
        for path in (
            "logistic_model.pkl",
            "decision_tree_model.pkl",
            "label_encoder.pkl",
            "scaler.pkl",
            "feature_list.pkl",
        )
    )

    # Single-row frame with columns in the same order the models were trained on.
    row = pd.DataFrame([extract_features(url)])[feature_names]
    row_scaled = scaler.transform(row)

    # The linear model sees scaled features, the tree sees the raw ones.
    lr_probs = lr.predict_proba(row_scaled)[0]
    dt_probs = dt.predict_proba(row)[0]

    def label_of(probabilities):
        # Map the most probable class index back to its original label.
        return le.inverse_transform([np.argmax(probabilities)])[0]

    avg_probs = (lr_probs + dt_probs) / 2
    final_class = label_of(avg_probs)
    confidence = np.max(avg_probs) * 100

    if final_class == "benign":
        print(f"\n{url} → SAFE LINK (Confidence: {confidence:.2f}%)")
        return

    print(f"\n{url} → NOT SAFE (Confidence: {confidence:.2f}%)")
    print(f"   Predicted Class → {final_class}")
    print(f"   Logistic Regression → {label_of(lr_probs)}")
    print(f"   Decision Tree       → {label_of(dt_probs)}")





Models trained successfully!


In [None]:
# Strip surrounding whitespace: a pasted URL often carries stray spaces or
# newlines, which would otherwise count towards the length, special-character
# and entropy features and skew the prediction.
user_url = input("\nEnter a URL to check: ").strip()
if user_url:
    predict_url(user_url)
else:
    # Classifying an empty string would produce a meaningless verdict.
    print("No URL entered; nothing to classify.")


Enter a URL to check: http://mys.co.om

http://mys.co.om → NOT SAFE (Confidence: 85.48%)
   Predicted Class → phishing
   Logistic Regression → phishing
   Decision Tree       → phishing


In [None]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Logistic Regression evaluation on the held-out split.
# This model was trained on scaled features, so it is scored on X_test_scaled.
y_pred_lr = lr_model.predict(X_test_scaled)
print("\n=== Logistic Regression Evaluation ===")
# average='weighted' weights each class's F1 by its support in y_test.
print("F1 Score:", f1_score(y_test, y_pred_lr, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
# target_names=le.classes_ labels the report rows with the original class names.
print("Classification Report:\n", classification_report(y_test, y_pred_lr, target_names=le.classes_))

# Decision Tree evaluation on the held-out split.
# This model was trained on unscaled features, so it is scored on X_test.
y_pred_dt = dt_model.predict(X_test)
print("\n=== Decision Tree Evaluation ===")
print("F1 Score:", f1_score(y_test, y_pred_dt, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_dt, target_names=le.classes_))



=== Logistic Regression Evaluation ===
F1 Score: 0.6510487753367901
Confusion Matrix:
 [[50364 10561  2531 22165]
 [ 2135 14257   658  2242]
 [  338   922  4939   305]
 [ 3595  1648  1325 12254]]
Classification Report:
               precision    recall  f1-score   support

      benign       0.89      0.59      0.71     85621
  defacement       0.52      0.74      0.61     19292
     malware       0.52      0.76      0.62      6504
    phishing       0.33      0.65      0.44     18822

    accuracy                           0.63    130239
   macro avg       0.57      0.68      0.59    130239
weighted avg       0.74      0.63      0.65    130239


=== Decision Tree Evaluation ===
F1 Score: 0.892066067638237
Confusion Matrix:
 [[76059  2749   524  6289]
 [  428 18328   152   384]
 [   94   102  6013   295]
 [ 2090  1045   292 15395]]
Classification Report:
               precision    recall  f1-score   support

      benign       0.97      0.89      0.93     85621
  defacement       0.