## Combined models
1) rf-benign-specialize
2) rf-general
3) rf-lexical

In [115]:
import os
import pickle

import numpy as np
import pandas as pd 
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split


In [116]:
def classification_type(type):
    '''
    Convert classification type into values:
        1) Benign = 0
        2) Defacement = 1
        3) Phishing = 2
        4) Malware = 3    
    '''
    if type == "benign":
        return 0
    elif type == "defacement":
        return 1
    elif type == "phishing":
        return 2
    elif type == "malware":
        return 3
    else:
        print(f"Unable to find proper type: {type}")


def calc_FNR_accuracy(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true, y_pred)
    for label_class in range(4):
        FN = sum(conf_matrix[label_class][i] for i in range(len(conf_matrix)) if i != label_class)  
        
        TP = conf_matrix[label_class][label_class]  
        
        TN = np.sum(np.delete(np.delete(conf_matrix, label_class, axis=0), label_class, axis=1))
        
        accuracy = (TP + TN) / np.sum(conf_matrix)
        print("Accuracy for class", label_class, ":", accuracy)

        FNR = FN / (FN + TP) if (FN + TP) > 0 else -1
        print("FNR for class", label_class, ":", FNR)

In [117]:
# Load models
model_files = [file for file in os.listdir(os.getcwd()) if file.endswith(".pkl")]
models = []
for file in model_files:
    with open(file, "rb") as file:
        models.append((file.name, pickle.load(file)))

In [118]:
# Data for training ensemble model
training_dataset = {'rf-benign-specialize.pkl': 'rf-benign-specialize-features.csv',
                    'rf-general.pkl': 'rf-general-features.csv',
                    'rf-lexical.pkl': 'rf-lexical-features.csv'}

df = pd.read_csv('../datasets/malicious_phish.csv')
y = df.iloc[:, 1]
y = y.apply(classification_type)
print(y.shape)

(651191,)


In [119]:
# Prepare predictions from models
for i in range(len(models)):
    model_name, model = models[i]
    print(model_name)
    training_data = pd.read_csv(training_dataset[model_name])
    print(training_data.shape, y.shape)
    X_train, _, y_train, _ = train_test_split(training_data, y, test_size=0.2, random_state=69)

    y_pred = model.predict(X_train) # prediction for model
    print(y_pred.shape)


rf-general.pkl
(651191, 22) (651191,)




(520952,)
rf-lexical.pkl
(651191, 21) (651191,)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- url.1
- url.10
- url.11
- url.12
- url.13
- ...


In [None]:
# Attached to endpoint prediction of above models
ensemble_model = MLPClassifier(hidden_layer_sizes=(3, 3))



### Test on 0.2 validation split initial dataset

### Test on phishing dataset

### Test on benign dataset