## Combined models
1) rf-benign-specialize
2) rf-general
3) rf-lexical

In [39]:
import os
import pickle

import numpy as np
import pandas as pd 
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split


In [40]:
def calc_FNR_accuracy(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true, y_pred)
    for label_class in range(4):
        FN = sum(conf_matrix[label_class][i] for i in range(len(conf_matrix)) if i != label_class)  
        
        TP = conf_matrix[label_class][label_class]  
        
        TN = np.sum(np.delete(np.delete(conf_matrix, label_class, axis=0), label_class, axis=1))
        
        accuracy = (TP + TN) / np.sum(conf_matrix)
        print("Accuracy for class", label_class, ":", accuracy)

        FNR = FN / (FN + TP) if (FN + TP) > 0 else -1
        print("FNR for class", label_class, ":", FNR)

In [41]:
# Load models
model_files = [file for file in os.listdir(os.getcwd()) if file.endswith(".pkl")]
models = []
for file in model_files:
    with open(file, "rb") as file:
        models.append((file.name, pickle.load(file)))

[('rf-general.pkl', RandomForestClassifier(max_depth=20, n_estimators=50, random_state=69)), ('rf-lexical.pkl', RandomForestClassifier(max_depth=20, n_estimators=50, random_state=69)), ('rf-benign-specialize.pkl', RandomForestClassifier(max_depth=20, n_estimators=50, random_state=69))]


In [42]:
# Data for training ensemble model
training_dataset = {'rf-benign-specialize.pkl': 'rf-benign-specialize-features.csv',
                    'rf-general.pkl': 'rf-general-features.csv',
                    'rf-lexical.pkl': 'rf-lexical-features.csv'}

# X_train = pd.read_csv("../datasets/malicious_phish.csv")
y_train = pd.read_csv("../datasets/feature_updated_dataset_y.csv")
y_train = y_train.values.ravel()

# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=69)

In [43]:
# Prepare predictions from models
for i in range(len(models)):
    model_name, model = models[i]
    training_data = pd.read_csv(training_dataset[model_name])
    print(model_name, training_data)
    y_pred = model.predict(training_data) # prediction for model


rf-general.pkl         www  url_length  digit_count  dot_count  bs_count  dash_count  \
0       0.0        30.0          8.0       30.0       3.0         0.0   
1       0.0        16.0          0.0       16.0       0.0         1.0   
2       0.0        35.0          1.0       35.0       2.0         0.0   
3       0.0        31.0          1.0       31.0       3.0         0.0   
4       1.0        88.0          7.0       88.0       3.0         1.0   
...     ...         ...          ...        ...       ...         ...   
651186  0.0        39.0         12.0       39.0       3.0         0.0   
651187  0.0        44.0          7.0       44.0       4.0         2.0   
651188  1.0        42.0          3.0       42.0       4.0         0.0   
651189  0.0        45.0          0.0       45.0       2.0         0.0   
651190  1.0        41.0          0.0       41.0       3.0         0.0   

        url_entropy  params_count  subdomain_count  domain_extension  ...  \
0         -4.215061           0



rf-lexical.pkl         url  url.1     url.2  url.3  url.4  url.5  url.6  url.7  url.8  url.9  \
0        30      0  0.444444      0     11      0      1      0      0      0   
1        16      0  0.000000      0     16      0      3      1      0      0   
2        35      1  0.034483      0     11      1      1      0      0      0   
3        31      0  0.040000      0     14      0      1      0      0      0   
4        88     10  0.111111      0     17      0      2      1      0      0   
...     ...    ...       ...    ...    ...    ...    ...    ...    ...    ...   
651186   39      0  0.571429      0      7      0      1      0      0      0   
651187   44      0  0.241379      0     12      0      1      0      0      0   
651188   42      0  0.090909      0     12      0      1      0      0      0   
651189   45      3  0.000000      0     13      0      1      0      0      0   
651190   41      0  0.000000      0     13      0      1      0      0      0   

        ... 

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- url.1
- url.10
- url.11
- url.12
- url.13
- ...


In [None]:
# Attached to endpoint prediction of above models
ensemble_model = MLPClassifier(hidden_layer_sizes=(3, 3))



### Test on 0.2 validation split initial dataset

### Test on phishing dataset

### Test on benign dataset