A dataset for classifying phishing URLs is given. It contains multiple features of categorical/string, integer, and float types. The target is given as the label in the last column. Write blocks of code to develop ML classifiers using the support vector machine, KNN, decision tree, and logistic regression algorithms. Compare the accuracy, precision, and recall values of the different models. Optimize the hyperparameters to get the highest accuracy for each model. Use 30% of the data for performance evaluation. Use the same dataset split to train and test all the models. Find the most important features for this problem. You may exclude the url, domain, and title features (as those are long strings).


In [None]:
# Attach Google Drive at /content/drive so files stored there can be read
# by path from later cells.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from xgboost import XGBClassifier
import warnings

# Suppress library warnings for cleaner notebook output.
# NOTE(review): this also hides convergence/deprecation warnings.
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_excel('/content/drive/MyDrive/Copy of phishing_url.xlsx')

# Print the columns of the DataFrame to check their exact names
print(df.columns)

# Exclude the long free-text features (URL, domain, title).
# Names are matched case-insensitively: the sheet's headers are capitalised
# ('URL', 'Domain', 'Title'), so an exact comparison against the lowercase
# names would match nothing and the columns would silently stay in the
# feature set.
text_features = {'url', 'domain', 'title'}
columns_to_drop = [
    col for col in df.columns
    if str(col).strip().lower() in text_features
]

# Report (rather than silently ignore) any expected column that is absent.
not_found = text_features - {str(col).strip().lower() for col in columns_to_drop}
if not_found:
    print(f"Note: expected text columns not found: {sorted(not_found)}")

df = df.drop(columns=columns_to_drop)

# Encode the remaining categorical (object-dtype) features as integer codes
for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Separate features and target (the target is the last column)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the same
# split for every model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features for models sensitive to feature scale. The scaler is fitted
# on the training rows only and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Base estimators handed to the grid search; hyperparameters not listed in
# param_grids keep the values set here.
# - random_state is fixed on the randomised learners so repeated runs give
#   the same results, matching the seeded train/test split.
# - max_iter is raised for logistic regression: warnings are suppressed in
#   this script, so a solver that stops before converging would go unnoticed.
# - eval_metric is 'logloss' (binary) rather than 'mlogloss' (multiclass);
#   assumes the label column is binary phishing/legitimate — TODO confirm.
models = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Hyperparameter search space per model; keys must match those of `models`
param_grids = {
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'KNN': {'n_neighbors': [3, 5, 7, 9]},
    'Decision Tree': {'max_depth': [5, 10, 15, 20], 'criterion': ['gini', 'entropy']},
    'Logistic Regression': {'C': [0.1, 1, 10]},
    'XGBoost': {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.2], 'n_estimators': [100, 200]}
}

# Tune each model with a 5-fold grid search scored on accuracy, then score
# the best estimator found on the held-out test set.
results = {}

for model_name in models:
    print(f"Training and optimizing {model_name}...")
    grid = GridSearchCV(
        estimator=models[model_name],
        param_grid=param_grids[model_name],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    # Precision and recall are averaged over classes, weighted by support.
    results[model_name] = {
        'Best Parameters': grid.best_params_,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'Classification Report': classification_report(y_test, y_pred),
    }

# Print one summary per model, in the order the models were trained
for model_name, metrics in results.items():
    summary_lines = [
        f"\nModel: {model_name}",
        f"Best Parameters: {metrics['Best Parameters']}",
        f"Accuracy: {metrics['Accuracy']:.2f}",
        f"Precision: {metrics['Precision']:.2f}",
        f"Recall: {metrics['Recall']:.2f}",
        "Classification Report:",
        metrics['Classification Report'],
    ]
    print("\n".join(summary_lines))

# Feature importance analysis (for models that support it).
# Apply the hyperparameters selected by the grid search before refitting, so
# the importances describe the tuned XGBoost model whose scores are reported
# above rather than an untuned default one.
xgb_model = models['XGBoost'].set_params(**results['XGBoost']['Best Parameters'])
xgb_model.fit(X_train, y_train)

# X_train is a scaled array without column labels, so feature names are
# taken from the feature DataFrame X (every column of df except the last).
feature_importances = pd.Series(
    xgb_model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

print("\nFeature Importances (XGBoost):")
print(feature_importances)


Index(['URL', 'URLLength', 'Domain', 'DomainLength', 'IsDomainIP', 'TLD',
       'URLSimilarityIndex', 'CharContinuationRate', 'TLDLegitimateProb',
       'URLCharProb', 'TLDLength', 'NoOfSubDomain', 'HasObfuscation',
       'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL',
       'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL',
       'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL',
       'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'IsHTTPS',
       'LineOfCode', 'LargestLineLength', 'HasTitle', 'Title',
       'DomainTitleMatchScore', 'URLTitleMatchScore', 'HasFavicon', 'Robots',
       'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'HasDescription',
       'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit', 'HasSocialNet',
       'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay',
       'Crypto', 'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS',
       'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef', 'labe