In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('uci-ml-phishing-dataset.csv')

In [5]:
df.head()

Unnamed: 0,id,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,1,-1,1,1,1,-1,-1,-1,-1,-1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,2,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,5,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [7]:
df.isna().sum()

id                             0
having_IP_Address              0
URL_Length                     0
Shortining_Service             0
having_At_Symbol               0
double_slash_redirecting       0
Prefix_Suffix                  0
having_Sub_Domain              0
SSLfinal_State                 0
Domain_registeration_length    0
Favicon                        0
port                           0
HTTPS_token                    0
Request_URL                    0
URL_of_Anchor                  0
Links_in_tags                  0
SFH                            0
Submitting_to_email            0
Abnormal_URL                   0
Redirect                       0
on_mouseover                   0
RightClick                     0
popUpWidnow                    0
Iframe                         0
age_of_domain                  0
DNSRecord                      0
web_traffic                    0
Page_Rank                      0
Google_Index                   0
Links_pointing_to_page         0
Statistica

In [9]:
df['Result'].value_counts()

Result
 1    6157
-1    4898
Name: count, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [23]:
from sklearn.preprocessing import LabelEncoder

In [24]:
le = LabelEncoder()

In [12]:
x = df.drop(columns=['Result'])
y = df['Result']

In [13]:
x.shape

(11055, 31)

In [14]:
y.shape

(11055,)

In [15]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=.1,random_state=23)

In [17]:
print(f'{xtrain.shape} {ytrain.shape} {xtest.shape} {ytest.shape}')

(9949, 31) (9949,) (1106, 31) (1106,)


In [25]:
ytrain =  le.fit_transform(ytrain)
ytest  =le.transform(ytest)

In [26]:
from xgboost import XGBClassifier

In [27]:

model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)

In [28]:
model.fit(xtrain,ytrain)

In [29]:
from sklearn.metrics import classification_report,confusion_matrix,r2_score

In [31]:
ypre = model.predict(xtest)

In [32]:
print(classification_report(ytest,ypre))

              precision    recall  f1-score   support

           0       0.98      0.95      0.96       504
           1       0.96      0.98      0.97       602

    accuracy                           0.97      1106
   macro avg       0.97      0.97      0.97      1106
weighted avg       0.97      0.97      0.97      1106



In [33]:
print(confusion_matrix(ytest,ypre))

[[481  23]
 [ 12 590]]


## Parameter Tunning

In [42]:
from sklearn.model_selection import GridSearchCV

In [43]:
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)


In [44]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}


In [45]:
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=2
)


In [46]:
grid.fit(xtrain,ytrain)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [47]:
print(grid.best_estimator_)


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, ...)


In [48]:
print("Best Parameters:", grid.best_params_)
print("Best Recall Score:", grid.best_score_)


Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}
Best Recall Score: 0.9733573357335734


In [51]:
finalModel = grid.best_estimator_

In [53]:
print(classification_report(ytest,finalModel.predict(xtest)))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97       504
           1       0.96      0.99      0.98       602

    accuracy                           0.97      1106
   macro avg       0.98      0.97      0.97      1106
weighted avg       0.97      0.97      0.97      1106



In [54]:
import joblib

joblib.dump(finalModel, "xgboost_phishing_model.pkl")


['xgboost_phishing_model.pkl']