In [1]:
import pandas as pd
Phishing_Web = pd.read_csv("C:\\Users\\alfre_g2qn6y7\\OneDrive\\Documents\\dataset_phishing.csv")
Phishing_Web.head()

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [2]:
X = Phishing_Web.drop(["url","status"],axis=1)
y = Phishing_Web["status"]
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=42)
model = GradientBoostingClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy


0.9593175853018373

In [3]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

  legitimate       0.96      0.96      0.96      1157
    phishing       0.96      0.96      0.96      1129

    accuracy                           0.96      2286
   macro avg       0.96      0.96      0.96      2286
weighted avg       0.96      0.96      0.96      2286



In [4]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
scores.mean()

0.9563651742591102

Test and cross validation accuracy scores are almost 96% while the f1 score for both categories of urls, phishing and legitimate, is 96%. Let's do feature importance now to see which features are the most important. 

In [12]:
pd.set_option('display.max_rows',None)
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
85,google_index,0.581102
86,page_rank,0.128265
56,nb_hyperlinks,0.087621
20,nb_www,0.04541
83,web_traffic,0.026158
82,domain_age,0.018047
6,nb_qm,0.015355
50,phish_hints,0.01365
46,longest_word_path,0.009929
4,nb_hyphens,0.007792


Based on this, the top six features explain approximately 89% of the model. Let's rerun the model with just these features.

In [5]:
X = Phishing_Web[["google_index","page_rank","nb_hyperlinks","nb_www","web_traffic","domain_age"]]
y = Phishing_Web["status"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=42)
model = GradientBoostingClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.9348206474190727

In [6]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

  legitimate       0.93      0.94      0.94      1157
    phishing       0.94      0.93      0.93      1129

    accuracy                           0.93      2286
   macro avg       0.93      0.93      0.93      2286
weighted avg       0.93      0.93      0.93      2286



In [7]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
scores.mean()

0.9387577720005791

The test and cross validation accuracy scores are now approximately 93% and 94% respectively. The f1 score for legitimate urls is slightly higher (94%) than for phishing urls (93%). While these results are still pretty good, this indicates that to get optimal results, it is better to use the full feature set. Now let's do GridSearch with the full feature set. 

In [21]:
X = Phishing_Web.drop(["url","status"],axis=1)
y = Phishing_Web["status"]
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier(random_state=42)
param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
         'subsample': [0.8, 1.0]
    }
from sklearn.model_selection import GridSearchCV
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1) 
grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
best_gb_model = grid_search.best_estimator_
y_pred = best_gb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy with best model: {test_accuracy:.4f}")
report = classification_report(y_test,y_pred)
print(report)



Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Best cross-validation accuracy: 0.9518
Test set accuracy with best model: 0.9471
              precision    recall  f1-score   support

  legitimate       0.95      0.95      0.95      1157
    phishing       0.95      0.95      0.95      1129

    accuracy                           0.95      2286
   macro avg       0.95      0.95      0.95      2286
weighted avg       0.95      0.95      0.95      2286



No real improvement with Grid Search. 

In [8]:
import pandas as pd
Email = pd.read_csv("C:\\Users\\alfre_g2qn6y7\\OneDrive\\Documents\\Phishing_Email.csv")
Email.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type,num_chars,num_words,avg_word_len,num_sentences,num_capitals,num_exclamations,num_question_marks,num_special_chars,num_digits,num_urls
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,1030.0,230.0,3.482609,9.0,0.0,2.0,0.0,59.0,9.0,0.0
1,1,the other side of * galicismos * * galicismo *...,Safe Email,479.0,91.0,4.274725,6.0,0.0,0.0,2.0,16.0,0.0,0.0
2,2,re : equistar deal tickets are you still avail...,Safe Email,1245.0,305.0,3.085246,7.0,0.0,0.0,1.0,95.0,63.0,0.0
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,688.0,96.0,5.5,38.0,39.0,1.0,1.0,110.0,29.0,1.0
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email,441.0,91.0,3.857143,13.0,0.0,0.0,0.0,27.0,2.0,0.0


In [9]:
X = Email.drop(["Unnamed: 0","Email Text","Email Type"],axis=1)
y = Email["Email Type"]
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=42)
model = GradientBoostingClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.802765237020316

In [10]:
report = classification_report(y_test,y_pred)
print(report)

                precision    recall  f1-score   support

Phishing Email       0.81      0.67      0.73      1429
    Safe Email       0.80      0.89      0.84      2115

      accuracy                           0.80      3544
     macro avg       0.80      0.78      0.79      3544
  weighted avg       0.80      0.80      0.80      3544



In [11]:

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
scores.mean()

0.8152689376542588

The test and cross validation accuracy scores are approximately 80% and 82% respectively. There is a significant disparity between f1 scores for phishing emails (73%) and safe emails (84%), something that isn't surprising due to class imbalance favoring the latter. Let's do feature importance now. 

In [12]:
pd.set_option('display.max_rows',None)
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
5,num_exclamations,0.407513
7,num_special_chars,0.129311
8,num_digits,0.09022
9,num_urls,0.084875
3,num_sentences,0.080571
2,avg_word_len,0.072299
4,num_capitals,0.067309
1,num_words,0.032035
0,num_chars,0.019804
6,num_question_marks,0.016062


The top 6 features explain approximately 87% of the model. Let's rerun the model with these features. 

In [13]:
X = Email[["num_exclamations","num_special_chars","num_digits","num_urls","num_sentences","avg_word_len"]]
y = Email["Email Type"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=42)
model = GradientBoostingClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.7920428893905191

In [14]:
report = classification_report(y_test,y_pred)
print(report)

                precision    recall  f1-score   support

Phishing Email       0.79      0.66      0.72      1429
    Safe Email       0.79      0.88      0.83      2115

      accuracy                           0.79      3544
     macro avg       0.79      0.77      0.78      3544
  weighted avg       0.79      0.79      0.79      3544



In [15]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
scores.mean()

0.8032032052215538

The results for test and cross validation accuracy as well as f1 scores are lower but not drastically so. However, it is best to use the full feature set for best performance. Let's do Grid Search now. 

In [14]:
X = Email.drop(["Unnamed: 0","Email Text","Email Type"],axis=1)
y = Email["Email Type"]
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier(random_state=42)
param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
         'subsample': [0.8, 1.0]
    }
from sklearn.model_selection import GridSearchCV
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1) 
grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
best_gb_model = grid_search.best_estimator_
y_pred = best_gb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy with best model: {test_accuracy:.4f}")
report = classification_report(y_test,y_pred)
print(report)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Best cross-validation accuracy: 0.8314
Test set accuracy with best model: 0.8307
                precision    recall  f1-score   support

Phishing Email       0.80      0.77      0.79      1429
    Safe Email       0.85      0.87      0.86      2115

      accuracy                           0.83      3544
     macro avg       0.83      0.82      0.82      3544
  weighted avg       0.83      0.83      0.83      3544



The test and cross validation accuracy scores are better. Both are approximately 83% now. The f1 scores for phishing emails (79%) and for safe emails (86%) are better as well. 

In [16]:
import pandas as pd
Phishing_URL = pd.read_csv("C:\\Users\\alfre_g2qn6y7\\OneDrive\\Documents\\web-page-phishing.csv")
Phishing_URL.head()

Unnamed: 0,url_length,n_dots,n_hypens,n_underline,n_slash,n_questionmark,n_equal,n_at,n_and,n_exclamation,n_space,n_tilde,n_comma,n_plus,n_asterisk,n_hastag,n_dollar,n_percent,n_redirection,phishing
0,37,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,77,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,126,4,1,2,0,1,3,0,2,0,0,0,0,0,0,0,0,0,1,1
3,18,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,55,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [17]:
X = Phishing_URL.drop("phishing",axis=1)
y = Phishing_URL["phishing"]
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=42)
model = GradientBoostingClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8829936051159073

In [18]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.91      0.91     12698
           1       0.84      0.84      0.84      7318

    accuracy                           0.88     20016
   macro avg       0.87      0.87      0.87     20016
weighted avg       0.88      0.88      0.88     20016



In [19]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
scores.mean()

0.8830017295309289

Test and cross validation accuracy scores are approximately 88%. The f1 score for non-phishing urls (91%) is higher than that of phishing urls (84%).Let's do feature importance now. 

In [20]:
pd.set_option('display.max_rows',None)
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
0,url_length,0.599381
4,n_slash,0.299354
1,n_dots,0.036618
2,n_hypens,0.02628
5,n_questionmark,0.009977
18,n_redirection,0.007659
17,n_percent,0.006373
3,n_underline,0.005941
6,n_equal,0.005246
7,n_at,0.00159


The top two features explain approximately 90% of the model. Let's rerun the model with just these features. 

In [21]:
X = Phishing_URL[["url_length","n_slash"]]
y = Phishing_URL["phishing"]
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=42)
model = GradientBoostingClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8579636290967226

In [22]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.88      0.89      0.89     12698
           1       0.81      0.79      0.80      7318

    accuracy                           0.86     20016
   macro avg       0.85      0.84      0.85     20016
weighted avg       0.86      0.86      0.86     20016



In [23]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
scores.mean()

0.8554976612416725

Since the scores are all lower even if they are still in the high range, it is best to use the entire feature set. Let's do GridSearch. 

In [10]:
X = Phishing_URL.drop("phishing",axis=1)
y = Phishing_URL["phishing"]
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier(random_state=42)
param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
         'subsample': [0.8, 1.0]
    }
from sklearn.model_selection import GridSearchCV
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1) 
grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
best_gb_model = grid_search.best_estimator_
y_pred = best_gb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy with best model: {test_accuracy:.4f}")
report = classification_report(y_test,y_pred)
print(report)

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Best cross-validation accuracy: 0.8580
Test set accuracy with best model: 0.8606
              precision    recall  f1-score   support

           0       0.89      0.89      0.89     12698
           1       0.80      0.82      0.81      7318

    accuracy                           0.86     20016
   macro avg       0.85      0.85      0.85     20016
weighted avg       0.86      0.86      0.86     20016



No improvement with GridSearch. 