<a href="https://colab.research.google.com/github/ashutosh0964/ml_project/blob/main/cybersecurity_ml_malicious_website_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
from ucimlrepo import fetch_ucirepo
import pandas as pd

# Download the UCI ML Repository dataset registered under id 327
phishing_websites = fetch_ucirepo(id=327)

# Quick structural check: feature-table shape, target-table shape, then the
# per-variable metadata table -- one item per output line.
print(
    phishing_websites.data.features.shape,
    phishing_websites.data.targets.shape,
    phishing_websites.variables,
    sep="\n",
)


(11055, 30)
(11055, 1)
                          name     role     type demographic description  \
0            having_ip_address  Feature  Integer        None        None   
1                   url_length  Feature  Integer        None        None   
2           shortining_service  Feature  Integer        None        None   
3             having_at_symbol  Feature  Integer        None        None   
4     double_slash_redirecting  Feature  Integer        None        None   
5                prefix_suffix  Feature  Integer        None        None   
6            having_sub_domain  Feature  Integer        None        None   
7               sslfinal_state  Feature  Integer        None        None   
8   domain_registration_length  Feature  Integer        None        None   
9                      favicon  Feature  Integer        None        None   
10                        port  Feature  Integer        None        None   
11                 https_token  Feature  Integer        None     

In [17]:
# Read the column names out of the variable metadata: every row whose role is
# 'Feature' names a predictor, and the first row whose role is 'Target' names the label.
feature_names = phishing_websites.variables.loc[
    lambda meta: meta['role'] == 'Feature', 'name'
].tolist()
target_name = phishing_websites.variables.loc[
    lambda meta: meta['role'] == 'Target', 'name'
].iloc[0]

# Predictors as a DataFrame restricted to the feature columns; label as a Series
X = pd.DataFrame(
    phishing_websites.data.features,
    columns=feature_names,
)
y = phishing_websites.data.targets[target_name]  # single column -> Series

# Put predictors and label side by side and preview the first rows
df = pd.concat((X, y), axis="columns")
print(df.head())


   having_ip_address  url_length  shortining_service  having_at_symbol  \
0                 -1           1                   1                 1   
1                  1           1                   1                 1   
2                  1           0                   1                 1   
3                  1           0                   1                 1   
4                  1           0                  -1                 1   

   double_slash_redirecting  prefix_suffix  having_sub_domain  sslfinal_state  \
0                        -1             -1                 -1              -1   
1                         1             -1                  0               1   
2                         1             -1                 -1              -1   
3                         1             -1                 -1              -1   
4                         1             -1                  1               1   

   domain_registration_length  favicon  ...  popupwindow  iframe  \


In [18]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
import lightgbm as lgb


In [21]:
# Re-encode the target labels from {-1, 1} to {0, 1}
y_mapped = y.map({-1: 0, 1: 1})

# Series.map leaves NaN for any value that is not a key of the dict, so an
# unexpected label would otherwise slip through silently. Fail early with a
# message that names the offending values.
unmapped_mask = y_mapped.isna()
if unmapped_mask.any():
    raise ValueError(
        "Target contains labels other than -1 and 1: "
        f"{y[unmapped_mask].unique().tolist()}"
    )

# Split the data using the mapped target values (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.2, random_state=42)


In [22]:
def fit_and_report(title, model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model``, predict on the evaluation split and print its metrics.

    Parameters
    ----------
    title : str
        Heading printed above the metrics.
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X_fit, y_fit
        Training features and labels passed to ``model.fit``.
    X_eval, y_eval
        Held-out features and labels used for the report.

    Returns
    -------
    The predictions returned by ``model.predict(X_eval)``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    # The heading is printed after fitting so that any log lines emitted
    # during training appear before it.
    print(title)
    print(classification_report(y_eval, predictions))
    print("Accuracy:", accuracy_score(y_eval, predictions))
    return predictions


# Every estimator gets a fixed random_state (the same seed as the train/test
# split) so that re-running the notebook reproduces the reported metrics.

# Gradient Boosting Model
gbm = GradientBoostingClassifier(random_state=42)
y_pred = fit_and_report("GBM Model", gbm, X_train, y_train, X_test, y_test)

# XGBoost Model
xgb_model = xgb.XGBClassifier(random_state=42)
y_pred = fit_and_report("XGBoost Model", xgb_model, X_train, y_train, X_test, y_test)

# LightGBM Model
lgb_model = lgb.LGBMClassifier(random_state=42)
y_pred = fit_and_report("LightGBM Model", lgb_model, X_train, y_train, X_test, y_test)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
y_pred = fit_and_report("Random Forest Model", rf_model, X_train, y_train, X_test, y_test)


GBM Model
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       956
           1       0.95      0.96      0.96      1255

    accuracy                           0.95      2211
   macro avg       0.95      0.95      0.95      2211
weighted avg       0.95      0.95      0.95      2211

Accuracy: 0.9507010402532791
XGBoost Model
              precision    recall  f1-score   support

           0       0.98      0.95      0.97       956
           1       0.96      0.98      0.97      1255

    accuracy                           0.97      2211
   macro avg       0.97      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211

Accuracy: 0.9706015377657169
[LightGBM] [Info] Number of positive: 4902, number of negative: 3942
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you 