## Title
- Boosting Algorithms for Phishing Website Detection

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Read the phishing-website dataset from disk into a DataFrame
pwd = pd.read_csv('../Final Project/Data/phishing.csv')

# Report the dataset dimensions
row, col = len(pwd.index), len(pwd.columns)
print(f"There are {row} rows and {col} columns")

# Show the first ten records
print(pwd.head(10))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the feature columns from the 'class' target column
x = pwd.drop('class', axis=1)
y = pwd['class']

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance estimates (fitting on the full data leaks test statistics).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

# Fit the scaler on the training features only, then apply it to both splits
sc_X = StandardScaler()
x_train = sc_X.fit_transform(x_train)
x_test = sc_X.transform(x_test)

# Keep `x` as the scaled full feature matrix, as before, but scaled with the
# training-set statistics
x = sc_X.transform(x)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Train AdaBoost with its default hyper-parameters.
# The estimator is seeded with the same value as the train/test split so
# that repeated runs produce the same fitted model and the same scores.
classifier = AdaBoostClassifier(random_state=42)
classifier.fit(x_train, y_train)

In [None]:
# Predict class labels for the held-out test features with the fitted AdaBoost model
y_pred = classifier.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

# Score the fitted AdaBoost model on the data it was trained on.
# Predict once and reuse the result for both metrics instead of running
# the model over the training set twice.
y_pred_train = classifier.predict(x_train)
acc_train = accuracy_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train, average='weighted')

print("Training results")
print("Accuracy ->", acc_train)
print("F1 Score ->", f1_train)

In [None]:
# Score the AdaBoost predictions against the held-out test labels
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred)
f1_test = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

print("Test set results")
print("Accuracy ->", acc_test)
print("F1 Score ->", f1_test)

In [None]:
# Used for pretty printing of confusion matrices

import itertools

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array-like of counts. Rows are drawn along the y axis
            ("True label") and columns along the x axis ("Predicted label").
        classes: Tick labels, one per row/column of ``cm``.
        normalize: When True, each row is divided by its row sum before
            plotting, so cells show fractions instead of raw counts.
        title: Text placed above the plot.
        cmap: Matplotlib colormap used to colour the cells.

    Returns:
        None. The plot is drawn with ``pyplot`` on the active figure; the
        caller is responsible for creating the figure and showing it.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A row whose total is zero is left at 0.0 rather than being divided
        # by zero, which would fill it with NaN and emit a runtime warning.
        cm = np.divide(
            cm,
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )
        print("Normalization = True")
    else:
        print('Normalization = False')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions get two decimals; raw counts are printed as integers
    fmt = '.2f' if normalize else 'd'
    # Cells darker than the midpoint get white text so the number stays legible
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Confusion matrix of the AdaBoost predictions on the test set
# NOTE(review): confusion_matrix orders rows/columns by sorted label value —
# confirm that the class names below are listed in that same order.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

plt.figure()
plot_confusion_matrix(
    cm,
    classes=['Legitimate', 'Phishing'],
    title='Confusion matrix, without normalization',
)
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Train gradient boosting with its default hyper-parameters.
# The estimator is seeded with the same value as the train/test split so
# that repeated runs produce the same fitted model and the same scores.
classifier_2 = GradientBoostingClassifier(random_state=42)
classifier_2.fit(x_train, y_train)

In [None]:
# Predict class labels for the held-out test features with the fitted gradient boosting model
y_pred_2 = classifier_2.predict(x_test)

In [None]:
# Score the fitted gradient boosting model on the data it was trained on.
# Predict once and reuse the result for both metrics instead of running
# the model over the training set twice.
y_pred_train_2 = classifier_2.predict(x_train)
acc_train_2 = accuracy_score(y_train, y_pred_train_2)
f1_train_2 = f1_score(y_train, y_pred_train_2, average='weighted')

print("Training results")
print("Accuracy ->", acc_train_2)
print("F1 Score ->", f1_train_2)

In [None]:
# Score the gradient boosting predictions against the held-out test labels
acc_test_2 = accuracy_score(y_true=y_test, y_pred=y_pred_2)
f1_test_2 = f1_score(y_true=y_test, y_pred=y_pred_2, average='weighted')

print("Test set results")
print("Accuracy ->", acc_test_2)
print("F1 Score ->", f1_test_2)

In [None]:
# Confusion matrix of the gradient boosting predictions on the test set
# NOTE(review): confusion_matrix orders rows/columns by sorted label value —
# confirm that the class names below are listed in that same order.
cm_2 = confusion_matrix(y_true=y_test, y_pred=y_pred_2)

plt.figure()
plot_confusion_matrix(
    cm_2,
    classes=['Legitimate', 'Phishing'],
    title='Confusion matrix, without normalization',
)
plt.show()

In [None]:
from xgboost import XGBClassifier

# Train an XGBoost classifier with its default hyper-parameters
# NOTE(review): recent XGBoost releases expect class labels encoded as
# 0..n_classes-1 — confirm the values in y_train before relying on this fit.
classifier_1 = XGBClassifier()
classifier_1.fit(x_train, y_train)

In [None]:
# Predict class labels for the held-out test features with the fitted XGBoost model
y_pred_1 = classifier_1.predict(x_test)

In [None]:
# Score the fitted XGBoost model on the data it was trained on.
# Predict once and reuse the result for both metrics instead of running
# the model over the training set twice.
y_pred_train_1 = classifier_1.predict(x_train)
acc_train_1 = accuracy_score(y_train, y_pred_train_1)
f1_train_1 = f1_score(y_train, y_pred_train_1, average='weighted')

print("Training results")
print("Accuracy ->", acc_train_1)
print("F1 Score ->", f1_train_1)

In [None]:
# Score the XGBoost predictions against the held-out test labels
acc_test_1 = accuracy_score(y_true=y_test, y_pred=y_pred_1)
f1_test_1 = f1_score(y_true=y_test, y_pred=y_pred_1, average='weighted')

print("Test set results")
print("Accuracy ->", acc_test_1)
print("F1 Score ->", f1_test_1)

In [None]:
# Confusion matrix of the XGBoost predictions on the test set
# NOTE(review): confusion_matrix orders rows/columns by sorted label value —
# confirm that the class names below are listed in that same order.
cm_1 = confusion_matrix(y_true=y_test, y_pred=y_pred_1)

plt.figure()
plot_confusion_matrix(
    cm_1,
    classes=['Legitimate', 'Phishing'],
    title='Confusion matrix, without normalization',
)
plt.show()

In [None]:
# Collect the train/test accuracy of the three boosting models in one table,
# indexed by model name
d = {
    'Index Title': ['AdaBoost', 'Gradient Boosting', 'XG Boost'],
    'Train Acc': [acc_train, acc_train_2, acc_train_1],
    'Test Acc': [acc_test, acc_test_2, acc_test_1],
}

df = pd.DataFrame(d).set_index('Index Title')

print(df)

# Horizontal bar chart: one group per model, one bar per accuracy column
fig, ax = plt.subplots(figsize=(7, 4))
df.plot(kind='barh', legend=True, ax=ax)
ax.set_xlabel('Accuracy')
ax.set_ylabel('Methods')

# Anchor the legend just outside the right edge of the axes
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Render explicitly, as the other plotting cells do, so the chart also
# appears when the code is run outside an inline-plotting notebook
plt.show()