### Predictions

In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif 
from sklearn.preprocessing import MaxAbsScaler
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support


In [8]:
# Names of the selected features, kept in the three groups the list was
# originally laid out in.
column_names = [
    # URL-level flags, counts and ratios
    "www",
    "url_length",
    "digit_count",
    "dot_count",
    "bs_count",
    "dash_count",
    "url_entropy",
    "params_count",
    "subdomain_count",
    "domain_extension",
    "underscores_count",
    "questionmarks_count",
    "equals_count",
    "ampersands_count",
    "digit_letter_ratio",
    # "pd_"-prefixed features (presumably primary-domain statistics -- TODO confirm)
    "pd_num_count",
    "pd_non_alphanumeric_count",
    # path / parameter / query features
    "uppercase_dirs",
    "path_count_special_chars",
    "path_uppercase_to_lowercase_ratio",
    "params_length",
    "queries_count",
]

# Training data: feature matrix and the matching label column.
# NOTE(review): only the label file is read from ../datasets/; the feature
# files are resolved against the current working directory -- confirm this
# asymmetry is intended.
X_train_path = "feature_updated_dataset_X.csv"
y_train_path = "../datasets/feature_updated_dataset_y.csv"

# External single-class feature sets used for out-of-distribution evaluation.
benign_dataset_path = "benign_dataset_X.csv"
phishing_dataset_path = "phishing_dataset_X.csv"

def calc_FNR_accuracy(y_true, y_pred, labels=(0, 1, 2, 3)):
    """Print the one-vs-rest accuracy and false-negative rate of every class.

    For each class the multi-class confusion matrix is collapsed into a binary
    "this class vs. everything else" view, from which
    accuracy = (TP + TN) / total and FNR = FN / (FN + TP) are derived.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.
    labels : sequence, default (0, 1, 2, 3)
        Classes to report, in output order. They are passed to
        ``confusion_matrix`` explicitly so that row/column ``i`` always refers
        to ``labels[i]``, even when a class is absent from both ``y_true`` and
        ``y_pred`` (without this the matrix shrinks and positional indexing
        either reports the wrong class or raises ``IndexError``).

    Returns
    -------
    None
        Results are printed. A value of ``-1`` marks an undefined metric
        (FNR when the class has no true samples; accuracy when there are no
        samples at all).
    """
    labels = list(labels)
    conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)

    total = conf_matrix.sum()
    true_per_class = conf_matrix.sum(axis=1)  # row sums: samples whose true label is the class
    pred_per_class = conf_matrix.sum(axis=0)  # column sums: samples predicted as the class

    for idx, label_class in enumerate(labels):
        TP = conf_matrix[idx, idx]
        FN = true_per_class[idx] - TP
        FP = pred_per_class[idx] - TP
        # Everything outside the class's row and column.
        TN = total - TP - FN - FP

        accuracy = (TP + TN) / total if total > 0 else -1
        print("Accuracy for class", label_class, ":", accuracy)

        FNR = FN / (FN + TP) if (FN + TP) > 0 else -1
        print("FNR for class", label_class, ":", FNR)

### Selected feature evaluation - Malicious URLs

In [9]:
# Load the complete feature matrix and flatten the label column to 1-D.
features_all = pd.read_csv(X_train_path)
labels_all = pd.read_csv(y_train_path).values.ravel()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features_all,
    labels_all,
    test_size=0.2,
    random_state=69,
)

# Fit the scaler on the training rows only, then apply the same scaling to the test rows.
scaler = MaxAbsScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestClassifier(
    n_estimators=50,
    max_depth=20,
    random_state=69,
)
model.fit(X_train, y_train)

# Score the held-out rows; F1 is averaged across classes weighted by support.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy} | F1 score: {f1} | {model.__class__.__name__}")

Accuracy: 0.9506752969540614 | F1 score: 0.9496420710643027 | RandomForestClassifier


In [10]:
# Support-weighted summary metrics for the held-out split (y_test / y_pred come
# from the training cell above).
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
val_accuracy = accuracy_score(y_test, y_pred)
print(f'OVERALL: Accuracy: {val_accuracy:.8f}, Precision: {precision:.8f}, Recall: {recall:.8f}, F1 Score: {f1:.8f}')

# Pin the class list explicitly: without `labels`, scikit-learn only reports the
# classes it finds in y_test/y_pred, so position i is not guaranteed to be class i
# and a missing class would make the fixed four-class loop fail.
class_labels = [0, 1, 2, 3]
class_test_precision, class_test_recall, class_test_f1, class_ = precision_recall_fscore_support(
    y_test, y_pred, labels=class_labels
)
for i, class_label in enumerate(class_labels):
    print(f'Class {class_label}:\tTest Precision: {class_test_precision[i]:.8f},\tTest Recall: {class_test_recall[i]:.8f},\tTest f1: {class_test_f1[i]:.8f}')

# Per-class one-vs-rest accuracy and false-negative rate.
calc_FNR_accuracy(y_test, y_pred)

OVERALL: Accuracy: 0.95067530, Precision: 0.95027579, Recall: 0.95067530, F1 Score: 0.94964207
Class 0:	Test Precision: 0.95217465,	Test Recall: 0.98791594,	Test f1: 0.96971607
Class 1:	Test Precision: 0.96066077,	Test Recall: 0.94127412,	Test f1: 0.95086864
Class 2:	Test Precision: 0.91526348,	Test Recall: 0.80725466,	Test f1: 0.85787278
Class 3:	Test Precision: 0.99464614,	Test Recall: 0.90157719,	Test f1: 0.94582770
Accuracy for class 0 : 0.9594207572232588
FNR for class 0 : 0.012084063047285465
Accuracy for class 1 : 0.9856033906894248
FNR for class 1 : 0.05872587673791243
Accuracy for class 2 : 0.96155529449704
FNR for class 2 : 0.19274533896041454
Accuracy for class 3 : 0.9947711514983991
FNR for class 3 : 0.09842280861389141


### Selected feature evaluation - External Datasets

In [11]:
def evaluate(X_test_path, label):
    """Train on the full training set and score an external single-class dataset.

    The feature CSV at ``X_test_path`` carries no labels; every row is assumed
    to belong to class ``label``, so the reported accuracy/recall is the share
    of rows the model assigns to that class.

    NOTE(review): the scaler and the random forest are re-fitted from
    ``X_train_path`` / ``y_train_path`` on every call, so evaluating several
    datasets repeats the same training work.

    Parameters
    ----------
    X_test_path : str
        Path to the CSV of features to evaluate.
    label : int
        True class assigned to every row of that CSV.

    Returns
    -------
    None
        All results are printed: overall accuracy/F1, the fraction of rows
        predicted as each of the classes 0-3, weighted precision/recall/F1,
        per-class precision/recall/F1, and per-class accuracy/FNR.
    """
    class_labels = [0, 1, 2, 3]

    X_train = pd.read_csv(X_train_path)
    y_train = pd.read_csv(y_train_path).values.ravel()
    X_test = pd.read_csv(X_test_path)

    # Scaling is fitted on the training features only and re-applied to the external set.
    scaler = MaxAbsScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=69)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_test = np.full((X_test.shape[0],), label)

    # y_test contains one class only, so recall is undefined for the other
    # classes. zero_division=0 reports those as 0 (the same value as the
    # default) without emitting UndefinedMetricWarning on every call.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"Accuracy: {accuracy} | F1 score: {f1} | {model.__class__.__name__}")
    # Share of rows predicted as each class.
    for class_label in class_labels:
        print(np.count_nonzero(y_pred == class_label)/len(y_pred))

    print(f'OVERALL: Accuracy: {accuracy:.8f}, Precision: {precision:.8f}, Recall: {recall:.8f}, F1 Score: {f1:.8f}')

    # Explicit labels keep the per-class arrays four entries long and aligned
    # with class ids even if the model never predicts one of the classes.
    class_test_precision, class_test_recall, class_test_f1, _support = precision_recall_fscore_support(
        y_test, y_pred, labels=class_labels, zero_division=0
    )
    for i, class_label in enumerate(class_labels):
        print(f'Class {class_label}:\tTest Precision: {class_test_precision[i]:.8f},\tTest Recall: {class_test_recall[i]:.8f},\tTest f1: {class_test_f1[i]:.8f}')
    calc_FNR_accuracy(y_test, y_pred)

# External datasets paired with the class that evaluate() assigns to every row.
for dataset_path, true_label in (
    (benign_dataset_path, 0),    # Benign
    (phishing_dataset_path, 2),  # Phishing
):
    evaluate(dataset_path, true_label)

Accuracy: 0.03275890992601334 | F1 score: 0.06343960746532834 | RandomForestClassifier
0.03275890992601334
0.45704550844859404
0.5079684616675054
0.0022271199578871863
OVERALL: Accuracy: 0.03275891, Precision: 1.00000000, Recall: 0.03275891, F1 Score: 0.06343961
Class 0:	Test Precision: 1.00000000,	Test Recall: 0.03275891,	Test f1: 0.06343961
Class 1:	Test Precision: 0.00000000,	Test Recall: 0.00000000,	Test f1: 0.00000000
Class 2:	Test Precision: 0.00000000,	Test Recall: 0.00000000,	Test f1: 0.00000000
Class 3:	Test Precision: 0.00000000,	Test Recall: 0.00000000,	Test f1: 0.00000000
Accuracy for class 0 : 0.03275890992601334
FNR for class 0 : 0.9672410900739866
Accuracy for class 1 : 0.542954491551406
FNR for class 1 : -1
Accuracy for class 2 : 0.4920315383324945
FNR for class 2 : -1
Accuracy for class 3 : 0.9977728800421128
FNR for class 3 : -1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.8591140649775648 | F1 score: 0.9242187783544442 | RandomForestClassifier
0.09869774973853783
0.03942174690462535
0.8591140649775648
0.0027664383792719545
OVERALL: Accuracy: 0.85911406, Precision: 1.00000000, Recall: 0.85911406, F1 Score: 0.92421878
Class 0:	Test Precision: 0.00000000,	Test Recall: 0.00000000,	Test f1: 0.00000000
Class 1:	Test Precision: 0.00000000,	Test Recall: 0.00000000,	Test f1: 0.00000000
Class 2:	Test Precision: 1.00000000,	Test Recall: 0.85911406,	Test f1: 0.92421878
Class 3:	Test Precision: 0.00000000,	Test Recall: 0.00000000,	Test f1: 0.00000000
Accuracy for class 0 : 0.9013022502614622
FNR for class 0 : -1
Accuracy for class 1 : 0.9605782530953747
FNR for class 1 : -1
Accuracy for class 2 : 0.8591140649775648
FNR for class 2 : 0.14088593502243513
Accuracy for class 3 : 0.9972335616207281
FNR for class 3 : -1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Feature selection

In [12]:
# column_names = ['www', 'url_length', 'digit_count', 'percentage_count', 'dot_count',
#                 'bs_count', 'dash_count', 'url_entropy', 'params_count', 'subdomain_count',
#                 'domain_extension', 'semicolon_count', 'underscores_count', 'questionmarks_count', 'equals_count', 'ampersands_count', 'digit_letter_ratio', 
                
#                 'pd_num_count', 'pd_non_alphanumeric_count', 'pd_at_count', 'pd_hyphen_count', 'pd_in_alex_top_1m',
                
#                 'path_double_slash_count', 'percent20_presence', 'uppercase_dirs', 'single_char_dirs', 'path_count_special_chars',
#                 'path_zeroes_count', 'path_uppercase_to_lowercase_ratio', 'params_length', 'queries_count']

# X_train = pd.read_csv(X_train_path)
# y_train = pd.read_csv(y_train_path)
# y_train = y_train.values.ravel()

# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=69)

# scaler = MaxAbsScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# for i in range(1, X_train.shape[1] + 1):
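#     # Intended: compare the current score with the best previous score and keep
#     # the feature subset with the highest weighted F1 across all values of k.
#     # NOTE(review): best_f1 / best_features are re-initialised on every iteration
#     # below, so the final "Best features" print only reflects the last k. Move
#     # the two assignments above the loop before re-enabling this cell.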
#     best_f1 = 0
#     best_features = []

#     model = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=69)
#     select_feature_model = SelectKBest(mutual_info_classif, k=i)
#     X_train_partition = select_feature_model.fit_transform(X_train, y_train)
#     model.fit(X_train_partition, y_train)
#     X_test_partition = select_feature_model.transform(X_test)
#     y_pred = model.predict(X_test_partition)

#     indexes = select_feature_model.get_support(indices=True)
#     features_used = np.array(column_names)[indexes]
    
#     # Verifying model fit
#     accuracy = accuracy_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred, average='weighted')
#     if f1 > best_f1:
#         best_f1 = f1
#         best_features = features_used

#     print(f"Accuracy: {accuracy} | F1 score: {f1} | {model.__class__.__name__} | Number of features: {i} | Features: {features_used}")
#     print(np.count_nonzero(y_pred == 0)/len(y_pred))
#     print(np.count_nonzero(y_pred == 1)/len(y_pred))
#     print(np.count_nonzero(y_pred == 2)/len(y_pred))
#     print(np.count_nonzero(y_pred == 3)/len(y_pred))

# print(f"Best features: {best_features} | F1: {best_f1}")