In [1]:
import pandas as pd
Phishing_Data = pd.read_csv("C:\\Users\\alfre_g2qn6y7\\OneDrive\\Documents\\dataset_phishing.csv")
Phishing_Data

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11425,http://www.fontspace.com/category/blackletter,45,17,0,2,0,0,0,0,0,...,0,0,0,448,5396,3980,0,0,6,legitimate
11426,http://www.budgetbots.com/server.php/Server%20...,84,18,0,5,0,1,1,0,0,...,1,0,0,211,6728,0,0,1,0,phishing
11427,https://www.facebook.com/Interactive-Televisio...,105,16,1,2,6,0,1,0,0,...,0,0,0,2809,8515,8,0,1,10,legitimate
11428,http://www.mypublicdomainpictures.com/,38,30,0,2,0,0,0,0,0,...,1,0,0,85,2836,2455493,0,0,4,legitimate


In [4]:
X = Phishing_Data.drop(["url","status"],axis=1)
y = Phishing_Data["status"]
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = DecisionTreeClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.9348206474190727

In [5]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

  legitimate       0.94      0.93      0.94      1157
    phishing       0.93      0.94      0.93      1129

    accuracy                           0.93      2286
   macro avg       0.93      0.93      0.93      2286
weighted avg       0.93      0.93      0.93      2286



In [6]:
from sklearn.model_selection import KFold, cross_val_score
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.932458442694663

Basically, the test accuracy, cross validation accuracy, precision, and recall scores are all in the low 90 percent range. In other words, very good. Let's do feature importance now to see which features contribute the most to the model.  

In [12]:
pd.set_option('display.max_rows', None)
df = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['Importance'])
df = df.sort_values(by='Importance', ascending=False)
df

Unnamed: 0,Importance
google_index,0.526742
page_rank,0.10134
nb_hyperlinks,0.096161
nb_www,0.025644
web_traffic,0.021616
nb_qm,0.019473
length_hostname,0.017428
domain_age,0.017276
phish_hints,0.010918
longest_words_raw,0.010785


Based on this, it is evident that the first nine features contribute to approximately 85% of the model. Let's see what the metrics are like with just these features. 

In [13]:
X = Phishing_Data[["google_index","page_rank","nb_hyperlinks","nb_www","web_traffic","nb_qm","length_hostname","domain_age","phish_hints"]]
y = Phishing_Data["status"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = DecisionTreeClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.9326334208223972

In [14]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

  legitimate       0.93      0.93      0.93      1157
    phishing       0.93      0.93      0.93      1129

    accuracy                           0.93      2286
   macro avg       0.93      0.93      0.93      2286
weighted avg       0.93      0.93      0.93      2286



In [15]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.9320209973753281

As was expected, the model performs almost the same with just these eight features; the test and cross-validation accuracy as well as the precision and recall scores are still in the low 90 percent range. Now let's do GridSearchCV to see if we can improve these numbers. 

In [19]:
from sklearn.model_selection import GridSearchCV
import numpy as np
model = DecisionTreeClassifier()
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(3, 15),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
    
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print(f"Best parameters: {grid_search.best_params_}")

# Print the best score achieved
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Get the best estimator (the Decision Tree model with the optimal hyperparameters)
best_dtree = grid_search.best_estimator_

# Evaluate the best model on the test set
test_accuracy = best_dtree.score(X_test, y_test)
print(f"Test set accuracy with best estimator: {test_accuracy:.4f}")

y_pred = best_dtree.predict(X_test)
report = classification_report(y_test,y_pred)
print(report)

Best parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best cross-validation score: 0.9380
Test set accuracy with best estimator: 0.9339
              precision    recall  f1-score   support

  legitimate       0.94      0.93      0.93      1157
    phishing       0.93      0.94      0.93      1129

    accuracy                           0.93      2286
   macro avg       0.93      0.93      0.93      2286
weighted avg       0.93      0.93      0.93      2286



There is a slight improvement as a result of gridsearch for test and cross validation accuracy scores. Now let's do Random Forest. 

In [22]:
X = Phishing_Data.drop(["url","status"],axis=1)
y = Phishing_Data["status"]
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = RandomForestClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.968066491688539

In [23]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

  legitimate       0.96      0.97      0.97      1157
    phishing       0.97      0.96      0.97      1129

    accuracy                           0.97      2286
   macro avg       0.97      0.97      0.97      2286
weighted avg       0.97      0.97      0.97      2286



In [24]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.9643044619422572

Random Forest performs better than Decision Tree (not a complete surprise). The test accuracy, cross validation accuracy, precision and recall scores are in the high 90 percent range now. Let's now figure out which features contribute the most to the model. 

In [26]:
pd.set_option('display.max_rows', None)
df = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['Importance'])
df = df.sort_values(by='Importance', ascending=False)
df

Unnamed: 0,Importance
google_index,0.176132
page_rank,0.109951
web_traffic,0.085907
nb_hyperlinks,0.066709
nb_www,0.049964
ratio_extHyperlinks,0.031514
domain_age,0.030754
longest_word_path,0.026783
ratio_intHyperlinks,0.022808
ratio_digits_url,0.022631


Based on this, the top 19 features contribute to approximately 85% of the model. Let's run the model with just these features. 

In [28]:
X = Phishing_Data[["google_index","page_rank","web_traffic","nb_hyperlinks","nb_www","ratio_extHyperlinks","domain_age","longest_word_path","ratio_intHyperlinks","ratio_digits_url","links_in_tags","safe_anchor","phish_hints","ratio_extRedirection","length_url","length_hostname","avg_word_path","longest_words_raw","length_words_raw"]]
y = Phishing_Data["status"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = RandomForestClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy


0.9593175853018373

In [29]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

  legitimate       0.96      0.96      0.96      1157
    phishing       0.96      0.96      0.96      1129

    accuracy                           0.96      2286
   macro avg       0.96      0.96      0.96      2286
weighted avg       0.96      0.96      0.96      2286



In [30]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.9589676290463693

There is a slight decrease in all metrics compared to the full feature set. This isn't a problem because these scores are still in the high 90 percent range. Let's now do gridsearch to figure out if we can get better scores. 

In [31]:
from sklearn.model_selection import GridSearchCV
model = RandomForestClassifier()
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],      # Maximum depth of the tree
    'min_samples_split': [2, 5],      # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2],       # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]        # Whether bootstrap samples are used when building trees
}
grid_search = GridSearchCV(estimator= model, 
                           param_grid=param_grid, 
                           cv=5,                 # Number of cross-validation folds
                           n_jobs=-1,            # Use all available CPU cores
                           verbose=2,            # Display progress messages
                           scoring='accuracy')   # Metric to optimize

grid_search.fit(X_train, y_train)

print("Best parameters found by Grid Search:")
print(grid_search.best_params_)

print("\nBest cross-validation score:")
print(grid_search.best_score_)

best_rf_model = grid_search.best_estimator_
test_accuracy = best_rf_model.score(X_test, y_test)
print(f"\nAccuracy of the best model on the test set: {test_accuracy:.4f}")
y_pred = best_rf_model.predict(X_test)
report = classification_report(y_test,y_pred)
print(report)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters found by Grid Search:
{'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}

Best cross-validation score:
0.9586613315977811

Accuracy of the best model on the test set: 0.9593
              precision    recall  f1-score   support

  legitimate       0.96      0.96      0.96      1157
    phishing       0.96      0.96      0.96      1129

    accuracy                           0.96      2286
   macro avg       0.96      0.96      0.96      2286
weighted avg       0.96      0.96      0.96      2286



Results are about the same with GridSearchCV. 

In [1]:
import pandas as pd
Email = pd.read_csv("C:\\Users\\alfre_g2qn6y7\\OneDrive\\Documents\\Phishing_Email.csv")


In [2]:
X = Email.drop(["Email Type","Email Text","Unnamed: 0"],axis=1)
y = Email["Email Type"]
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = DecisionTreeClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

               

0.8058690744920993

In [3]:
report = classification_report(y_test,y_pred)
print(report)

                precision    recall  f1-score   support

Phishing Email       0.75      0.79      0.77      1429
    Safe Email       0.85      0.82      0.84      2115

      accuracy                           0.81      3544
     macro avg       0.80      0.80      0.80      3544
  weighted avg       0.81      0.81      0.81      3544



In [4]:
from sklearn.model_selection import KFold, cross_val_score
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.8053171325011197

Basically, the accuracy score for both testing and cross validation is in the low 80 percent range, leaving something to be desired. It is notable that the phishing email precision and recall scores are lower than those of safe emails, indicating that the model is not as good at classifying them. Let's see what happens when we set class weight equal to balance (there is an imbalance which favors safe emails). 

In [3]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = DecisionTreeClassifier(class_weight = 'balanced')
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.7957110609480813

In [8]:
report = classification_report(y_test,y_pred)
print(report)

                precision    recall  f1-score   support

Phishing Email       0.73      0.77      0.75      1429
    Safe Email       0.84      0.81      0.82      2115

      accuracy                           0.79      3544
     macro avg       0.79      0.79      0.79      3544
  weighted avg       0.80      0.79      0.80      3544



In [9]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.8014225583272647

No real improvement. If anything, the testing and cross validation accuracy have gone down. There is slight improvement for the phishing email recall score at the cost of a slight decrease in the precision score. Both safe email precision and recall scores are down. 

In [10]:
pd.set_option('display.max_rows', None)
df = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['Importance'])
df = df.sort_values(by='Importance', ascending=False)
df

Unnamed: 0,Importance
num_exclamations,0.196174
num_special_chars,0.144776
avg_word_len,0.132955
num_digits,0.123959
num_words,0.096668
num_sentences,0.096481
num_chars,0.079215
num_capitals,0.05472
num_urls,0.040047
num_question_marks,0.035005


Based on this, the top seven features explain approximately 87% percent of the model. Let's run the model with just these features. 

In [4]:
X = Email[["num_exclamations","num_special_chars","avg_word_len","num_digits","num_words","num_sentences","num_chars"]]
y = Email["Email Type"]
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = DecisionTreeClassifier(class_weight='balanced')
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.7889390519187359

In [15]:
report = classification_report(y_test,y_pred)
print(report)

                precision    recall  f1-score   support

Phishing Email       0.73      0.76      0.74      1429
    Safe Email       0.83      0.80      0.82      2115

      accuracy                           0.79      3544
     macro avg       0.78      0.78      0.78      3544
  weighted avg       0.79      0.79      0.79      3544



In [16]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.7933508925175322

There is a slight decrease in test and cross validation accuracy scores, indicating that all of the features are essential for the best possible classification score. As before, phishing emails are less likely to be classified correctly than safe emails. Let's do gridsearch now to see if we can maximize the scores. 

In [17]:
from sklearn.model_selection import GridSearchCV
import numpy as np
model = DecisionTreeClassifier(class_weight = 'balanced')
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(3, 15),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
    
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print(f"Best parameters: {grid_search.best_params_}")

# Print the best score achieved
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Get the best estimator (the Decision Tree model with the optimal hyperparameters)
best_dtree = grid_search.best_estimator_

# Evaluate the best model on the test set
test_accuracy = best_dtree.score(X_test, y_test)
print(f"Test set accuracy with best estimator: {test_accuracy:.4f}")

y_pred = best_dtree.predict(X_test)
report = classification_report(y_test,y_pred)
print(report)

Best parameters: {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best cross-validation score: 0.8002
Test set accuracy with best estimator: 0.7892
                precision    recall  f1-score   support

Phishing Email       0.74      0.73      0.74      1429
    Safe Email       0.82      0.83      0.82      2115

      accuracy                           0.79      3544
     macro avg       0.78      0.78      0.78      3544
  weighted avg       0.79      0.79      0.79      3544



There is an improvement in both the cross-validation and testing accuracy scores. However, phishing emails are still less likely to be correctly classified than safe emails. Let's do Random Forest

In [5]:
X = Email.drop(["Email Type","Email Text","Unnamed: 0"],axis=1)
y = Email["Email Type"]
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = RandomForestClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8611738148984198

In [21]:
report = classification_report(y_test,y_pred)
print(report)

                precision    recall  f1-score   support

Phishing Email       0.83      0.81      0.82      1429
    Safe Email       0.88      0.89      0.88      2115

      accuracy                           0.86      3544
     macro avg       0.85      0.85      0.85      3544
  weighted avg       0.86      0.86      0.86      3544



In [22]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.8633439446618105

As expected, Random Forest performs better than Decision Tree. The cross validation and test accuracy scores are in the mid 80 percent range. The gap between the precision and recall scores for phishing and safe emails still exists but phishing emails have better scores when compared to the Decision Tree models. Let's try setting class weight to balanced. 

In [23]:
X = Email.drop(["Email Type","Email Text","Unnamed: 0"],axis=1)
y = Email["Email Type"]
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = RandomForestClassifier(class_weight='balanced')
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8591986455981941

In [24]:
report = classification_report(y_test,y_pred)
print(report)

                precision    recall  f1-score   support

Phishing Email       0.84      0.81      0.82      1429
    Safe Email       0.87      0.89      0.88      2115

      accuracy                           0.86      3544
     macro avg       0.86      0.85      0.85      3544
  weighted avg       0.86      0.86      0.86      3544



In [25]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.8624973479642879

The cross validation and test accuracy scores are almost the same as last time. The same goes for precision and recall scores for both phishing and safe emails. Let's find feature importance now. 

In [26]:
pd.set_option('display.max_rows', None)
df = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['Importance'])
df = df.sort_values(by='Importance', ascending=False)
df

Unnamed: 0,Importance
num_exclamations,0.190244
num_special_chars,0.137627
avg_word_len,0.125646
num_chars,0.112151
num_digits,0.110344
num_words,0.102262
num_sentences,0.097996
num_capitals,0.049733
num_urls,0.041725
num_question_marks,0.032272


The top 7 features make almost 88% of the model. Let's run the model using these features alone.  

In [6]:
X = Email[["num_exclamations","num_special_chars","avg_word_len","num_chars","num_digits","num_words","num_sentences"]]
y = Email["Email Type"]
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = RandomForestClassifier(class_weight='balanced')
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8521444695259593

In [29]:
report = classification_report(y_test,y_pred)
print(report)

                precision    recall  f1-score   support

Phishing Email       0.83      0.80      0.82      1429
    Safe Email       0.87      0.89      0.88      2115

      accuracy                           0.85      3544
     macro avg       0.85      0.85      0.85      3544
  weighted avg       0.85      0.85      0.85      3544



In [30]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.8527884602519578

Just like with Decision Tree Classifier, all the metrics are lower but not drastically so. Let's try Grid Search now. 

In [7]:
from sklearn.model_selection import GridSearchCV
model = RandomForestClassifier(class_weight='balanced')
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],      # Maximum depth of the tree
    'min_samples_split': [2, 5],      # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2],       # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]        # Whether bootstrap samples are used when building trees
}
grid_search = GridSearchCV(estimator= model, 
                           param_grid=param_grid, 
                           cv=5,                 # Number of cross-validation folds
                           n_jobs=-1,            # Use all available CPU cores
                           verbose=2,            # Display progress messages
                           scoring='accuracy')   # Metric to optimize

grid_search.fit(X_train, y_train)

print("Best parameters found by Grid Search:")
print(grid_search.best_params_)

print("\nBest cross-validation score:")
print(grid_search.best_score_)

best_rf_model = grid_search.best_estimator_
test_accuracy = best_rf_model.score(X_test, y_test)
print(f"\nAccuracy of the best model on the test set: {test_accuracy:.4f}")
y_pred = best_rf_model.predict(X_test)
report = classification_report(y_test,y_pred)
print(report)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters found by Grid Search:
{'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Best cross-validation score:
0.8504083321820325

Accuracy of the best model on the test set: 0.8550
                precision    recall  f1-score   support

Phishing Email       0.83      0.80      0.82      1429
    Safe Email       0.87      0.89      0.88      2115

      accuracy                           0.85      3544
     macro avg       0.85      0.85      0.85      3544
  weighted avg       0.85      0.85      0.85      3544



The results are very similar to those shown above; all numbers are in the 80 percent range. 

In [8]:
import pandas as pd
Phishing_Web = pd.read_csv("C:\\Users\\alfre_g2qn6y7\\OneDrive\\Documents\\web-page-phishing.csv")

In [9]:
X = Phishing_Web.drop(["phishing"],axis=1)
y = Phishing_Web["phishing"]
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = DecisionTreeClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8828936850519584

In [35]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.89      0.93      0.91     12698
           1       0.86      0.81      0.83      7318

    accuracy                           0.88     20016
   macro avg       0.88      0.87      0.87     20016
weighted avg       0.88      0.88      0.88     20016



In [37]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.8847088075230547

The test and cross validation accuracy scores are in the high 80 percent range. However, there is a significant gap in recall scores for both phishing and legitimate webpages. Let's try setting class weight equal to balance now. 

In [38]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = DecisionTreeClassifier(class_weight = 'balanced')
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8826938449240608

In [39]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.89      0.91     12698
           1       0.82      0.87      0.84      7318

    accuracy                           0.88     20016
   macro avg       0.87      0.88      0.87     20016
weighted avg       0.88      0.88      0.88     20016



In [40]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.883110017107473

No real improvement. There is now a significant gap between precision scores for both phishing and legitimate webpages instead of between recall scores for both. Let's find feature importance next. 

In [41]:
pd.set_option('display.max_rows', None)
df = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['Importance'])
df = df.sort_values(by='Importance', ascending=False)
df

Unnamed: 0,Importance
n_slash,0.556546
url_length,0.300023
n_dots,0.047188
n_hypens,0.026566
n_redirection,0.026532
n_underline,0.014396
n_questionmark,0.009365
n_equal,0.007055
n_percent,0.005167
n_at,0.001301


The top three features contribute to approximately 90% of the model. Let's run the model with just these features. 

In [10]:
X = Phishing_Web[["n_slash","url_length","n_dots"]]
y = Phishing_Web["phishing"]
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = DecisionTreeClassifier(class_weight='balanced')
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.860861310951239

In [47]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.84      0.89     12698
           1       0.77      0.89      0.82      7318

    accuracy                           0.86     20016
   macro avg       0.85      0.87      0.85     20016
weighted avg       0.87      0.86      0.86     20016



In [48]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.8610569980188718

With only the three features, test and cross validation accuracy scores are lower. They are still pretty high though (mid 80 percent range). There is a substantial gap between recall scores for both phishing and legitimate websites. Now, let's do gridsearch. 

In [49]:
from sklearn.model_selection import GridSearchCV
import numpy as np
model = DecisionTreeClassifier(class_weight = 'balanced')
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(3, 15),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
    
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print(f"Best parameters: {grid_search.best_params_}")

# Print the best score achieved
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Get the best estimator (the Decision Tree model with the optimal hyperparameters)
best_dtree = grid_search.best_estimator_

# Evaluate the best model on the test set
test_accuracy = best_dtree.score(X_test, y_test)
print(f"Test set accuracy with best estimator: {test_accuracy:.4f}")

y_pred = best_dtree.predict(X_test)
report = classification_report(y_test,y_pred)
print(report)

Best parameters: {'criterion': 'gini', 'max_depth': 14, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best cross-validation score: 0.8628
Test set accuracy with best estimator: 0.8626
              precision    recall  f1-score   support

           0       0.94      0.84      0.89     12698
           1       0.76      0.90      0.83      7318

    accuracy                           0.86     20016
   macro avg       0.85      0.87      0.86     20016
weighted avg       0.87      0.86      0.86     20016



There is some improvement in test and cross validation accuracy scores. The gap between precision scores for phishing and legitimate webpages is still substantial. 

In [11]:
X = Phishing_Web.drop(["phishing"],axis=1)
y = Phishing_Web["phishing"]
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = RandomForestClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8907374100719424

In [3]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.92      0.91     12698
           1       0.86      0.84      0.85      7318

    accuracy                           0.89     20016
   macro avg       0.88      0.88      0.88     20016
weighted avg       0.89      0.89      0.89     20016



In [4]:
from sklearn.model_selection import KFold, cross_val_score
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.8927825798317128

The testing and cross-validation accuracy scores are in the high 80 percent range to the point of being almost 90 percent. There is a significantly large disparity in recall scores between phishing and legitimate webpages. Let's set class weight equal to balanced now. 

In [5]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = RandomForestClassifier(class_weight = 'balanced')
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8896882494004796

In [6]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.90      0.91     12698
           1       0.83      0.88      0.85      7318

    accuracy                           0.89     20016
   macro avg       0.88      0.89      0.88     20016
weighted avg       0.89      0.89      0.89     20016



In [7]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.8897249170436321

No real improvement in cross validation and testing accuracy scores. There is now a significant disparity between precision scores for phishing and legitimate webpages. Let's find feature importance. 

In [8]:
pd.set_option('display.max_rows', None)
df = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['Importance'])
df = df.sort_values(by='Importance', ascending=False)
df

Unnamed: 0,Importance
url_length,0.390448
n_slash,0.383326
n_hypens,0.05954
n_dots,0.057695
n_equal,0.031752
n_redirection,0.026627
n_underline,0.019072
n_questionmark,0.008879
n_and,0.007656
n_percent,0.005391


The top 4 features represent approximately 89% of the model. Let's run the model with just these features. 

In [12]:
X = Phishing_Web[["url_length","n_slash","n_hypens","n_dots"]]
y = Phishing_Web["phishing"]
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = RandomForestClassifier(class_weight='balanced')
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8715027977617905

In [11]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.86      0.89     12698
           1       0.79      0.89      0.83      7318

    accuracy                           0.87     20016
   macro avg       0.86      0.87      0.86     20016
weighted avg       0.88      0.87      0.87     20016



In [12]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=k_folds, scoring='accuracy')
scores.mean()

0.8723982602576446

The testing and cross validation accuracy scores are lower but still pretty high; they are in the high 80 percent range. There is still a significant disparity between precision scores for phishing and legitimate webpages. Now let's do gridsearch. 

In [14]:
from sklearn.model_selection import GridSearchCV
model = RandomForestClassifier(class_weight='balanced')
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],      # Maximum depth of the tree
    'min_samples_split': [2, 5],      # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2],       # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]        # Whether bootstrap samples are used when building trees
}
grid_search = GridSearchCV(estimator= model, 
                           param_grid=param_grid, 
                           cv=5,                 # Number of cross-validation folds
                           n_jobs=-1,            # Use all available CPU cores
                           verbose=2,            # Display progress messages
                           scoring='accuracy')   # Metric to optimize

grid_search.fit(X_train, y_train)

print("Best parameters found by Grid Search:")
print(grid_search.best_params_)

print("\nBest cross-validation score:")
print(grid_search.best_score_)

best_rf_model = grid_search.best_estimator_
test_accuracy = best_rf_model.score(X_test, y_test)
print(f"\nAccuracy of the best model on the test set: {test_accuracy:.4f}")
y_pred = best_rf_model.predict(X_test)
report = classification_report(y_test,y_pred)
print(report)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters found by Grid Search:
{'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}

Best cross-validation score:
0.8745456434121669

Accuracy of the best model on the test set: 0.8739
              precision    recall  f1-score   support

           0       0.94      0.86      0.90     12698
           1       0.79      0.90      0.84      7318

    accuracy                           0.87     20016
   macro avg       0.86      0.88      0.87     20016
weighted avg       0.88      0.87      0.88     20016



There is an improvement in cross validation and test accuracy scores but still a significant gap between precision scores for phishing and legitimate webpages. 