In [1]:
import pandas as pd
Phishing_Data = pd.read_csv("C:\\Users\\alfre_g2qn6y7\\OneDrive\\Documents\\dataset_phishing.csv")
Phishing_Data

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11425,http://www.fontspace.com/category/blackletter,45,17,0,2,0,0,0,0,0,...,0,0,0,448,5396,3980,0,0,6,legitimate
11426,http://www.budgetbots.com/server.php/Server%20...,84,18,0,5,0,1,1,0,0,...,1,0,0,211,6728,0,0,1,0,phishing
11427,https://www.facebook.com/Interactive-Televisio...,105,16,1,2,6,0,1,0,0,...,0,0,0,2809,8515,8,0,1,10,legitimate
11428,http://www.mypublicdomainpictures.com/,38,30,0,2,0,0,0,0,0,...,1,0,0,85,2836,2455493,0,0,4,legitimate


In [2]:
X = Phishing_Data.drop(["url","status"],axis=1)
y = Phishing_Data["status"]


Run the SVC model with a linear kernel. 

In [3]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model = SVC(kernel="linear")
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.9588801399825022

In [4]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

  legitimate       0.96      0.96      0.96      1157
    phishing       0.96      0.95      0.96      1129

    accuracy                           0.96      2286
   macro avg       0.96      0.96      0.96      2286
weighted avg       0.96      0.96      0.96      2286



In [5]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_scaled, y_train, cv=k_folds, scoring='accuracy')
scores.mean()

0.9425855981853267

Basically, when SVC is trained with the linear kernel on standardized data, the accuracy, precision, and recall scores are all in the high 90s. The cross validation score is somewhat lower but still pretty high. Now, let's do SVC with its default kernel setting (radial basis function or rbf).

In [6]:
model = SVC()
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.962817147856518

In [7]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

  legitimate       0.96      0.97      0.96      1157
    phishing       0.96      0.96      0.96      1129

    accuracy                           0.96      2286
   macro avg       0.96      0.96      0.96      2286
weighted avg       0.96      0.96      0.96      2286



In [8]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_scaled, y_train, cv=k_folds, scoring='accuracy')
scores.mean()

0.9534124421399458

Results are somewhat higher with the default kernel settings. This applies even to the cross-validation score. Let's see what happens when we do hyperparameter tuning via GridSearchCV. 

In [8]:
from sklearn.model_selection import GridSearchCV
svm = SVC()
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel type
    'gamma': ['scale', 'auto', 0.1, 1]  # Kernel coefficient for 'rbf'
}
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
# Fit GridSearchCV to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters found
print(f"Best parameters: {grid_search.best_params_}")

# Print the best score achieved during cross-validation
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Get the best estimator (the SVM model with the optimal hyperparameters)
best_svm_model = grid_search.best_estimator_

# Evaluate the best model on the test set
test_accuracy = best_svm_model.score(X_test_scaled, y_test)
print(f"Test set accuracy of the best model: {test_accuracy:.4f}")
y_pred = best_svm_model.predict(X_test_scaled)
print("\nClassification Report for the best SVM model:")
print(classification_report(y_test, y_pred))



Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best cross-validation score: 0.9573
Test set accuracy of the best model: 0.9694

Classification Report for the best SVM model:
              precision    recall  f1-score   support

  legitimate       0.97      0.97      0.97      1157
    phishing       0.97      0.97      0.97      1129

    accuracy                           0.97      2286
   macro avg       0.97      0.97      0.97      2286
weighted avg       0.97      0.97      0.97      2286



Grid Search gave an improvement of about 1% in the test accuracy metric and about 2% for the cross-validation metric. The precision and recall scores are equal to 0.97. Of course, it is important to bear in mind that the metrics were already pretty good pre-GridSearchCV so there wasn't as much room for improvement. 

In [9]:
import pandas as pd
Email = pd.read_csv("C:\\Users\\alfre_g2qn6y7\\OneDrive\\Documents\\Phishing_Email.csv")
Email

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type,num_chars,num_words,avg_word_len,num_sentences,num_capitals,num_exclamations,num_question_marks,num_special_chars,num_digits,num_urls
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,1030.0,230.0,3.482609,9.0,0.0,2.0,0.0,59.0,9.0,0.0
1,1,the other side of * galicismos * * galicismo *...,Safe Email,479.0,91.0,4.274725,6.0,0.0,0.0,2.0,16.0,0.0,0.0
2,2,re : equistar deal tickets are you still avail...,Safe Email,1245.0,305.0,3.085246,7.0,0.0,0.0,1.0,95.0,63.0,0.0
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,688.0,96.0,5.500000,38.0,39.0,1.0,1.0,110.0,29.0,1.0
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email,441.0,91.0,3.857143,13.0,0.0,0.0,0.0,27.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17711,18646,date a lonely housewife always wanted to date ...,Phishing Email,237.0,45.0,4.288889,6.0,0.0,0.0,1.0,8.0,0.0,0.0
17712,18647,request submitted : access request for anita ....,Safe Email,477.0,99.0,3.828283,8.0,0.0,0.0,0.0,31.0,24.0,0.0
17713,18648,"re : important - prc mtg hi dorn & john , as y...",Safe Email,1214.0,253.0,3.802372,13.0,0.0,1.0,1.0,38.0,0.0,0.0
17714,18649,press clippings - letter on californian utilit...,Safe Email,213.0,34.0,5.294118,0.0,0.0,0.0,0.0,8.0,0.0,0.0


In [10]:
X = Email.drop(["Unnamed: 0","Email Text","Email Type"],axis = 1)
y = Email["Email Type"]

Follow the same procedure as with the original dataset. 

In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model = SVC(kernel="linear")
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.7079571106094809

In [14]:
report = classification_report(y_test,y_pred)
print(report)

                precision    recall  f1-score   support

Phishing Email       0.80      0.37      0.50      1429
    Safe Email       0.69      0.94      0.79      2115

      accuracy                           0.71      3544
     macro avg       0.74      0.65      0.65      3544
  weighted avg       0.73      0.71      0.68      3544



In [15]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_scaled, y_train, cv=k_folds, scoring='accuracy')
scores.mean()

0.7233273465689367

The results left quite a bit to be desired. Accuracy is less than 71%, the cross validation score is around 72%, and while the precision sscore for phishing emails is higher (80%) than for safe emails (69%), the recall score of the former (37%) is much lower than that of the latter (94%). In other words, there were a significant number of misclassifications. 

In [18]:
model = SVC()
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.7895033860045146

In [19]:
report = classification_report(y_test,y_pred)
print(report)

                precision    recall  f1-score   support

Phishing Email       0.84      0.59      0.70      1429
    Safe Email       0.77      0.92      0.84      2115

      accuracy                           0.79      3544
     macro avg       0.80      0.76      0.77      3544
  weighted avg       0.80      0.79      0.78      3544



In [20]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_scaled, y_train, cv=k_folds, scoring='accuracy')
scores.mean()

0.7972056621597906

The results are better when the default kernel is used as opposed to the linear kernel but they are still much lower than for the original dataset. Let's try adding class_weight = "balanced" paramater to account for the class imbalance (safe emails outnumber phishing)

In [22]:
model = SVC(class_weight='balanced')
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.7965575620767494

In [23]:
report = classification_report(y_test,y_pred)
print(report)

                precision    recall  f1-score   support

Phishing Email       0.73      0.79      0.76      1429
    Safe Email       0.85      0.80      0.82      2115

      accuracy                           0.80      3544
     macro avg       0.79      0.79      0.79      3544
  weighted avg       0.80      0.80      0.80      3544



In [24]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_scaled, y_train, cv=k_folds, scoring='accuracy')
scores.mean()

0.7934647434341624

Accuracy is somewhat improved. The recall score for phishing emails is significantly better but at the cost of precision. The opposite trend is evident for safe emails. Overall, many emails are still incorrectly classified. Let's see what happens when we apply GridSearchCV. 

In [12]:
from sklearn.model_selection import GridSearchCV
svm = SVC(class_weight='balanced')
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel type
    'gamma': ['scale', 'auto', 0.1, 1]  # Kernel coefficient for 'rbf'
}
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
# Fit GridSearchCV to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters found
print(f"Best parameters: {grid_search.best_params_}")

# Print the best score achieved during cross-validation
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Get the best estimator (the SVM model with the optimal hyperparameters)
best_svm_model = grid_search.best_estimator_

# Evaluate the best model on the test set
test_accuracy = best_svm_model.score(X_test_scaled, y_test)
print(f"Test set accuracy of the best model: {test_accuracy:.4f}")
y_pred = best_svm_model.predict(X_test_scaled)
print("\nClassification Report for the best SVM model:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best parameters: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Best cross-validation score: 0.8243
Test set accuracy of the best model: 0.8220

Classification Report for the best SVM model:
                precision    recall  f1-score   support

Phishing Email       0.79      0.77      0.78      1429
    Safe Email       0.84      0.86      0.85      2115

      accuracy                           0.82      3544
     macro avg       0.82      0.81      0.81      3544
  weighted avg       0.82      0.82      0.82      3544



The test accuracy and cross-validation scores are now in the 80 percent range. The precision score for phishing emails is better while the recall is somewhat worse. The opposite is true for safe emails. While there is still something to be desired since around 20% of emails are misclassified, the fact that the accuracy and cross validation metrics are now in the 80 percent range is encouraging news. 

In [13]:
import pandas as pd
Web = pd.read_csv("C:\\Users\\alfre_g2qn6y7\\OneDrive\\Documents\\web-page-phishing.csv")
Web

Unnamed: 0,url_length,n_dots,n_hypens,n_underline,n_slash,n_questionmark,n_equal,n_at,n_and,n_exclamation,n_space,n_tilde,n_comma,n_plus,n_asterisk,n_hastag,n_dollar,n_percent,n_redirection,phishing
0,37,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,77,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,126,4,1,2,0,1,3,0,2,0,0,0,0,0,0,0,0,0,1,1
3,18,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,55,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100072,23,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100073,34,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0
100074,70,2,1,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
100075,28,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [14]:
X = Web.drop(["phishing"],axis=1)
y = Web["phishing"]

Do same procedure as for the previous dataset. 

In [15]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model = SVC(kernel="linear")
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8574640287769785

In [5]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.85      0.94      0.89     12698
           1       0.87      0.72      0.79      7318

    accuracy                           0.86     20016
   macro avg       0.86      0.83      0.84     20016
weighted avg       0.86      0.86      0.85     20016



In [29]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_scaled, y_train, cv=k_folds, scoring='accuracy')
scores.mean()

0.857708475029165

While the accuracy, precision, recall, and cross-validation scores are all significantly better than for the previous dataset, there is still room for improvement. 

In [30]:
model = SVC()
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8835931254996003

In [32]:
report = classification_report(y_pred,y_test)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.90      0.91     13086
           1       0.81      0.86      0.84      6930

    accuracy                           0.88     20016
   macro avg       0.87      0.88      0.87     20016
weighted avg       0.89      0.88      0.88     20016



In [33]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_scaled, y_train, cv=k_folds, scoring='accuracy')
scores.mean()

0.8803412210092416

There is some improvement in accuracy and the cross validation score when default setting for the kernel are used. However, both phishing and legitimate webpages have increased recall at the price of reduced precision. Let's try class_weight equals balance to account for class imbalance (legitimate class outnumbers phishing class). This should hopefully significantly increase performance. 

In [34]:
model = SVC(class_weight='balanced')
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8805955235811351

In [35]:
report = classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.87      0.90     12698
           1       0.80      0.90      0.85      7318

    accuracy                           0.88     20016
   macro avg       0.87      0.88      0.87     20016
weighted avg       0.89      0.88      0.88     20016



In [36]:
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_scaled, y_train, cv=k_folds, scoring='accuracy')
scores.mean()

0.8764692132246596

Interestingly enough, performance actually somewhat declined for both test and cross validation accuracy when class weight was set to balanced. Precision and recall scores were higher for phishing webpages while for legitimate webpages, there was an increase in precision score and a decrease in recall score. In other words, class balance doesn't seem to matter so much. Let's try GridSearchCV now. 

In [20]:
from sklearn.model_selection import GridSearchCV
svm = SVC(class_weight='balanced')
param_grid = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'gamma': ['scale', 'auto', 0.1, 1]  # Kernel coefficient for 'rbf'
}
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
# Fit GridSearchCV to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters found
print(f"Best parameters: {grid_search.best_params_}")

# Print the best score achieved during cross-validation
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Get the best estimator (the SVM model with the optimal hyperparameters)
best_svm_model = grid_search.best_estimator_

# Evaluate the best model on the test set
test_accuracy = best_svm_model.score(X_test_scaled, y_test)
print(f"Test set accuracy of the best model: {test_accuracy:.4f}")
y_pred = best_svm_model.predict(X_test_scaled)
print("\nClassification Report for the best SVM model:")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'C': 10, 'gamma': 1}
Best cross-validation score: 0.8830
Test set accuracy of the best model: 0.8861

Classification Report for the best SVM model:
              precision    recall  f1-score   support

           0       0.94      0.88      0.91     12698
           1       0.81      0.89      0.85      7318

    accuracy                           0.89     20016
   macro avg       0.87      0.89      0.88     20016
weighted avg       0.89      0.89      0.89     20016



Based on the results above, there is some improvement in the test accuracy and cross validation accuracy score compared to the previous result (pre-Grid Search, class balancing). There is a slight improvment in recall for phishing websites and a corresponding decrease in precision. For legitimate websites, there is a slight increase in recall and no change for precision. The percentages are all in the high 80 percent range though, indicating that there aren't proportionally as many misclassifications compared to the previous dataset (Phishing Email). 