In [3]:
!pip install tld


Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os
from urllib.parse import urlparse

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler

def extract_features(url: str) -> list:
    """Compute ten hand-crafted lexical features for a single URL string.

    The returned list is positional; the order must stay stable because the
    values are stacked column-wise into the model's feature matrix:

        0. total length of the URL
        1. number of '.' characters
        2. number of '-' characters
        3. number of '@' characters
        4. number of '?' characters
        5. number of '=' characters
        6. number of '&' characters
        7. fraction of characters that are digits (0.0 for an empty URL)
        8. number of dot-separated labels in the network location minus 2
           (0 when there is no network location)
        9. 1 if the parsed scheme is exactly 'https', otherwise 0

    Args:
        url: The URL text to analyse. It is not validated; malformed or empty
            strings still yield a full-length feature list.

    Returns:
        A list of ten numbers (ints, plus one float at index 7).
    """
    try:
        parsed_url = urlparse(url)
        netloc, scheme = parsed_url.netloc, parsed_url.scheme
    except ValueError:
        # urlparse rejects some malformed authorities (for example an
        # unbalanced '[' as in "http://[bad"). Treat those as having no
        # parseable host or scheme instead of aborting the whole batch.
        netloc, scheme = '', ''

    url_length = len(url)
    # Guard the ratio: an empty string would otherwise divide by zero.
    digit_ratio = sum(c.isdigit() for c in url) / url_length if url_length else 0.0

    # "labels - 2" assumes a registrable domain of the form name.tld, so a
    # single-label host such as "localhost" yields -1. The netloc is split
    # as-is, so any port or userinfo it carries is part of the split text.
    subdomain_count = len(netloc.split('.')) - 2 if netloc else 0

    return [
        url_length,  # URL Length
        url.count('.'),  # Number of dots
        url.count('-'),  # Hyphens in URL
        url.count('@'),  # '@' symbol presence
        url.count('?'),  # Query parameters
        url.count('='),  # Equal signs
        url.count('&'),  # Ampersands
        digit_ratio,  # Digit-to-length ratio
        subdomain_count,  # Subdomains count
        1 if scheme == 'https' else 0  # HTTPS usage
    ]

# Load dataset. The rest of this cell reads two columns from it:
# 'URL' (the raw URL string) and 'label' (the class to predict).
data = pd.read_csv('dataset.csv')

# Fail early with a clear message instead of a bare KeyError further down.
missing_columns = {'URL', 'label'} - set(data.columns)
if missing_columns:
    raise KeyError(f"dataset.csv is missing required column(s): {sorted(missing_columns)}")

# Check for class imbalance
data_distribution = data['label'].value_counts(normalize=True)
print("Class Distribution:\n", data_distribution)

# Split dataset before feature extraction so that nothing fitted below
# (TF-IDF vocabulary, scaler statistics) ever sees the held-out rows.
train_data, test_data = train_test_split(
    data, test_size=0.2, random_state=42, stratify=data['label']
)
# Take explicit copies: adding a column to the frames returned by the split
# is the classic trigger for pandas' SettingWithCopyWarning.
train_data = train_data.copy()
test_data = test_data.copy()

# Extract the hand-crafted lexical features (one list per URL)
train_data['features'] = train_data['URL'].apply(extract_features)
test_data['features'] = test_data['URL'].apply(extract_features)

# Expand the per-row lists into one numeric column per feature
features_train = pd.DataFrame(train_data['features'].to_list())
features_test = pd.DataFrame(test_data['features'].to_list())
labels_train = train_data['label']
labels_test = test_data['label']

# Character n-gram TF-IDF (3 to 5 characters, 'char_wb' analyzer), capped at
# the 5000 most frequent n-grams. Fitted on the training URLs only.
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), max_features=5000)
url_vectorized_train = vectorizer.fit_transform(train_data['URL'])
url_vectorized_test = vectorizer.transform(test_data['URL'])

# Combine extracted features with TF-IDF vectors. Column order is
# [hand-crafted features | TF-IDF]; anything that scores new URLs with the
# saved model must build its matrix in the same order. CSR is requested up
# front because the default result is COO, which is converted downstream.
X_train = hstack((features_train.to_numpy(), url_vectorized_train), format='csr')
X_test = hstack((features_test.to_numpy(), url_vectorized_test), format='csr')

# Scale features. with_mean=False skips centering, which is required for
# sparse input (centering would turn the matrix dense).
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression model; class_weight='balanced' compensates for the
# class imbalance printed above. random_state makes repeated runs comparable.
model = LogisticRegression(
    C=1, penalty='l2', solver='liblinear', class_weight='balanced', random_state=42
)

# Use cross-validation to check performance. scoring='f1' is the binary F1
# of the class labelled 1.
# NOTE: the vectorizer and scaler above were fitted on the whole training
# split, so each validation fold has already influenced them. These CV
# scores are therefore slightly optimistic; the held-out report at the end
# of the cell is the leakage-free estimate.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_scaled, labels_train, cv=cv, scoring='f1')
print("Cross-Validation F1 Scores:", scores)
print("Mean F1 Score:", np.mean(scores))

# Fit the model on the full training split
model.fit(X_train_scaled, labels_train)

# Evaluate model on the held-out split
y_pred = model.predict(X_test_scaled)
print(classification_report(labels_test, y_pred))



Class Distribution:
 label
1    0.571895
0    0.428105
Name: proportion, dtype: float64
Cross-Validation F1 Scores: [0.99571273 0.99589927 0.99573699 0.99548182 0.99608443]
Mean F1 Score: 0.9957830474521963
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     20189
           1       1.00      1.00      1.00     26970

    accuracy                           1.00     47159
   macro avg       1.00      1.00      1.00     47159
weighted avg       1.00      1.00      1.00     47159



In [3]:
# Persist the three fitted objects needed to score new URLs: the classifier,
# the scaler and the TF-IDF vectorizer. They must be loaded and applied
# together, in the same order as in the training cell.
# The target directory is defined once here; edit this single constant when
# running on another machine.
MODEL_DIR = "D:/phishing_detection/PhishTrap_UI/MODELS"

# joblib.dump does not create missing directories, so make sure it exists.
os.makedirs(MODEL_DIR, exist_ok=True)

# joblib files are pickle-based: only load them back from trusted locations.
joblib.dump(model, f"{MODEL_DIR}/phishing_model_url.pkl")
joblib.dump(scaler, f"{MODEL_DIR}/scaler.pkl")
joblib.dump(vectorizer, f"{MODEL_DIR}/vectorizer.pkl")

['D:/phishing_detection/PhishTrap_UI/MODELS/vectorizer.pkl']