### Predictions

In [8]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif 
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support


In [9]:
# Names of the feature columns used by this notebook's minimal model.
column_names = [
    "url_entropy",
    "domain_extension",
    "bs_count",
]

# Training data: feature matrix and the matching label file.
y_train_path = "../datasets/feature_updated_dataset_y.csv"
X_train_path = "feature_updated_dataset_X_3.csv"

# External datasets evaluated further down with a single assumed label each.
phishing_dataset_path = "phishing_dataset_X_3.csv"
benign_dataset_path = "benign_dataset_X_3.csv"

def calc_FNR_accuracy(y_true, y_pred, labels=(0, 1, 2, 3)):
    """Print one-vs-rest accuracy and false-negative rate for every class.

    For each class ``c`` the confusion matrix is collapsed to a binary
    "c vs. everything else" problem and two numbers are printed:

    * accuracy = (TP + TN) / total number of samples
    * FNR      = FN / (FN + TP), or ``-1`` when the class has no true
      samples (the rate is undefined in that case).

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.
    labels : sequence, default (0, 1, 2, 3)
        Class labels to report, in the order they are printed. The label set
        is passed to ``confusion_matrix`` explicitly so that row/column ``i``
        always corresponds to ``labels[i]`` even when a class is absent from
        both ``y_true`` and ``y_pred``. Samples whose label is not listed
        here are left out of the matrix.

    Returns
    -------
    None
        Results are printed only.
    """
    labels = list(labels)
    conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)

    total = np.sum(conf_matrix)
    true_positives = np.diag(conf_matrix)
    # Row sums count the true samples of a class; whatever is off the
    # diagonal in that row was predicted as some other class.
    false_negatives = conf_matrix.sum(axis=1) - true_positives
    # Column sums count the predictions of a class.
    false_positives = conf_matrix.sum(axis=0) - true_positives

    for idx, label_class in enumerate(labels):
        TP = true_positives[idx]
        FN = false_negatives[idx]
        # Everything that lies neither in this class's row nor in its column.
        TN = total - TP - FN - false_positives[idx]

        accuracy = (TP + TN) / total
        print("Accuracy for class", label_class, ":", accuracy)

        FNR = FN / (FN + TP) if (FN + TP) > 0 else -1
        print("FNR for class", label_class, ":", FNR)

### Selected feature evaluation - Malicious URLs

In [10]:
# Load the feature matrix and flatten the label frame into a 1-D array.
features_all = pd.read_csv(X_train_path)
labels_all = pd.read_csv(y_train_path).values.ravel()

# Hold out 20% of the rows for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    features_all,
    labels_all,
    random_state=69,
    test_size=0.2,
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MaxAbsScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestClassifier(max_depth=20, n_estimators=50, random_state=69)
model.fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy} | F1 score: {f1} | {model.__class__.__name__}")

Accuracy: 0.8238853185297799 | F1 score: 0.8100919430301454 | RandomForestClassifier


In [11]:
# Support-weighted aggregate metrics on the held-out split.
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
val_accuracy = accuracy_score(y_test, y_pred)
print(f'OVERALL: Accuracy: {val_accuracy:.8f}, Precision: {precision:.8f}, Recall: {recall:.8f}, F1 Score: {f1:.8f}')

# Pin the label order explicitly: without `labels=` the returned arrays only
# cover the classes that actually occur in y_test / y_pred, so position i is
# not guaranteed to be class i (and indexing can run past the end).
class_labels = [0, 1, 2, 3]
# `class_` receives the per-class support (number of true samples per class).
class_test_precision, class_test_recall, class_test_f1, class_ = precision_recall_fscore_support(
    y_test, y_pred, labels=class_labels
)
for i, class_label in enumerate(class_labels):
    print(f'Class {class_label}:\tTest Precision: {class_test_precision[i]:.8f},\tTest Recall: {class_test_recall[i]:.8f},\tTest f1: {class_test_f1[i]:.8f}')
calc_FNR_accuracy(y_test, y_pred)

OVERALL: Accuracy: 0.82388532, Precision: 0.82011702, Recall: 0.82388532, F1 Score: 0.81009194
Class 0:	Test Precision: 0.82703842,	Test Recall: 0.95308815,	Test f1: 0.88560053
Class 1:	Test Precision: 0.79590809,	Test Recall: 0.65589334,	Test f1: 0.71914906
Class 2:	Test Precision: 0.76837505,	Test Recall: 0.41326994,	Test f1: 0.53746483
Class 3:	Test Precision: 0.94786815,	Test Recall: 0.80239612,	Test f1: 0.86908673
Accuracy for class 0 : 0.8380669384746504
FNR for class 0 : 0.04691185055458261
Accuracy for class 1 : 0.9241778576309707
FNR for class 1 : 0.34410666113301513
Accuracy for class 2 : 0.8977648784158355
FNR for class 2 : 0.5867300603664726
Accuracy for class 3 : 0.987760962538103
FNR for class 3 : 0.19760388231725812


### Selected feature evaluation - External Datasets

In [12]:
def evaluate(X_test_path, label):
    """Score the notebook's fitted ``model`` on a single-class external dataset.

    Every row of the CSV at ``X_test_path`` is assumed to belong to class
    ``label``; predictions are compared against that constant ground truth and
    the results are printed.

    Relies on two notebook-level globals created by the training cell:
    ``model`` (the fitted classifier) and ``scaler`` (the ``MaxAbsScaler``
    fitted on the training split). The same scaler is reused here on purpose:
    fitting a fresh scaler on different rows could rescale the external
    features differently from the data the model was trained on.

    Parameters
    ----------
    X_test_path : str
        Path to a CSV holding the feature matrix to evaluate.
    label : int
        Ground-truth class assigned to every row of that file.

    Returns
    -------
    None
        Results are printed only.

    Notes
    -----
    Because the ground truth contains a single class, the support-weighted
    precision is computed over that class alone, and recall / F1 of all other
    classes are undefined. ``zero_division=0`` reports those as 0.0 (the value
    scikit-learn falls back to anyway) without emitting a warning per call.
    """
    class_labels = [0, 1, 2, 3]

    X_test = scaler.transform(pd.read_csv(X_test_path))

    y_pred = model.predict(X_test)
    y_test = np.full((X_test.shape[0],), label)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"Accuracy: {accuracy} | F1 score: {f1} | {model.__class__.__name__}")
    # Fraction of rows predicted as each class.
    for class_label in class_labels:
        print(np.count_nonzero(y_pred == class_label)/len(y_pred))

    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    print(f'OVERALL: Accuracy: {accuracy:.8f}, Precision: {precision:.8f}, Recall: {recall:.8f}, F1 Score: {f1:.8f}')

    # Pin the label order so index i is always class_labels[i], even when a
    # class is never predicted for this dataset.
    class_test_precision, class_test_recall, class_test_f1, _support = precision_recall_fscore_support(
        y_test, y_pred, labels=class_labels, zero_division=0
    )
    for i, class_label in enumerate(class_labels):
        print(f'Class {class_label}:\tTest Precision: {class_test_precision[i]:.8f},\tTest Recall: {class_test_recall[i]:.8f},\tTest f1: {class_test_f1[i]:.8f}')
    calc_FNR_accuracy(y_test, y_pred)

# Each external dataset is paired with the label assumed for all of its rows.
external_datasets = (
    (benign_dataset_path, 0),    # Benign
    (phishing_dataset_path, 2),  # Phishing
)
for dataset_path, true_label in external_datasets:
    evaluate(dataset_path, true_label)

Accuracy: 0.8125227773632057 | F1 score: 0.8965655908007238 | RandomForestClassifier
0.8125227773632057
0.14411490782037265
0.03749660147279153
0.005865713343630148
OVERALL: Accuracy: 0.81252278, Precision: 1.00000000, Recall: 0.81252278, F1 Score: 0.89656559
Class 0:	Test Precision: 1.00000000,	Test Recall: 0.81252278,	Test f1: 0.89656559
Class 1:	Test Precision: 0.00000000,	Test Recall: 0.00000000,	Test f1: 0.00000000
Class 2:	Test Precision: 0.00000000,	Test Recall: 0.00000000,	Test f1: 0.00000000
Class 3:	Test Precision: 0.00000000,	Test Recall: 0.00000000,	Test f1: 0.00000000
Accuracy for class 0 : 0.8125227773632057
FNR for class 0 : 0.18747722263679434
Accuracy for class 1 : 0.8558850921796274
FNR for class 1 : -1
Accuracy for class 2 : 0.9625033985272085
FNR for class 2 : -1
Accuracy for class 3 : 0.9941342866563698
FNR for class 3 : -1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.14646941736108768 | F1 score: 0.25551386743176635 | RandomForestClassifier
0.29457508181235453
0.5508923450625822
0.14646941736108768
0.008063155763975574
OVERALL: Accuracy: 0.14646942, Precision: 1.00000000, Recall: 0.14646942, F1 Score: 0.25551387
Class 0:	Test Precision: 0.00000000,	Test Recall: 0.00000000,	Test f1: 0.00000000
Class 1:	Test Precision: 0.00000000,	Test Recall: 0.00000000,	Test f1: 0.00000000
Class 2:	Test Precision: 1.00000000,	Test Recall: 0.14646942,	Test f1: 0.25551387
Class 3:	Test Precision: 0.00000000,	Test Recall: 0.00000000,	Test f1: 0.00000000
Accuracy for class 0 : 0.7054249181876455
FNR for class 0 : -1
Accuracy for class 1 : 0.4491076549374178
FNR for class 1 : -1
Accuracy for class 2 : 0.14646941736108768
FNR for class 2 : 0.8535305826389123
Accuracy for class 3 : 0.9919368442360245
FNR for class 3 : -1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
# Persist the fitted classifier as a pickle under ../ensemble/.
model_name = "rf-minimal"
model_path = f"../ensemble/{model_name}.pkl"
with open(model_path, "wb") as model_file:
    model_file.write(pickle.dumps(model))

### Feature selection

In [14]:
# column_names = ['www', 'url_length', 'digit_count', 'percentage_count', 'dot_count',
#                 'bs_count', 'dash_count', 'url_entropy', 'params_count', 'subdomain_count',
#                 'domain_extension', 'semicolon_count', 'underscores_count', 'questionmarks_count', 'equals_count', 'ampersands_count', 'digit_letter_ratio', 
                
#                 'pd_num_count', 'pd_non_alphanumeric_count', 'pd_at_count', 'pd_hyphen_count', 'pd_in_alex_top_1m',
                
#                 'path_double_slash_count', 'percent20_presence', 'uppercase_dirs', 'single_char_dirs', 'path_count_special_chars',
#                 'path_zeroes_count', 'path_uppercase_to_lowercase_ratio', 'params_length', 'queries_count']

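# NOTE(review): disabled experiment, kept for reference. If it is re-enabled, initialise
# `best_f1` and `best_features` once, above the loop: set here they are reset on every
# iteration, so the final "Best features" line only reflects the last value of i.
# for i in range(1, X_train.shape[1] + 1):
#     # Track the best weighted F1 seen so far and the features that produced it
#     best_f1 = 0
#     best_features = []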

#     model = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=69)
#     select_feature_model = SelectKBest(mutual_info_classif, k=i)
#     X_train_partition = select_feature_model.fit_transform(X_train, y_train)
#     model.fit(X_train_partition, y_train)
#     X_test_partition = select_feature_model.transform(X_test)
#     benign_pred = model.predict(X_test_partition)

#     indexes = select_feature_model.get_support(indices=True)
#     features_used = np.array(column_names)[indexes]
    
#     # Verifying model fit
#     y_test = np.zeros((345738,))
#     accuracy = accuracy_score(y_test, benign_pred)
#     f1 = f1_score(y_test, benign_pred, average='weighted')
#     if f1 > best_f1:
#         best_f1 = f1
#         best_features = features_used

#     print(f"Accuracy: {accuracy} | F1 score: {f1} | {model.__class__.__name__} | Number of features: {i} | Features: {features_used}")
#     print(np.count_nonzero(benign_pred == 0)/len(benign_pred))
#     print(np.count_nonzero(benign_pred == 1)/len(benign_pred))
#     print(np.count_nonzero(benign_pred == 2)/len(benign_pred))
#     print(np.count_nonzero(benign_pred == 3)/len(benign_pred))

# print(f"Best features: {best_features} | F1: {best_f1}")